use crate::infer::Infer;
use crate::server::{generate_internal, ComputeType};
use crate::{ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest};
use axum::extract::Extension;
use axum::http::{HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::Json;
use serde::{Deserialize, Serialize};
use tracing::instrument;
use utoipa::ToSchema;

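/// A single raw-prompt instance in a Vertex AI prediction request.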
#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct GenerateVertexInstance {
    #[schema(example = "What is Deep Learning?")]
    pub inputs: String,
    #[schema(nullable = true, default = "null", example = "null")]
    pub parameters: Option<GenerateParameters>,
}

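/// Either a raw-prompt or a chat instance. `#[serde(untagged)]` tries the
/// variants in declaration order, so payloads with an `inputs` field
/// deserialize as `Generate` and chat-style payloads fall through to `Chat`.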
#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
#[serde(untagged)]
pub(crate) enum VertexInstance {
    Generate(GenerateVertexInstance),
    Chat(ChatRequest),
}

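/// The request body accepted on `/vertex`: a batch of instances to generate for.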
#[derive(Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexRequest {
    #[serde(rename = "instances")]
    pub instances: Vec<VertexInstance>,
}

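/// One generated text per instance, in request order.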
#[derive(Clone, Deserialize, ToSchema, Serialize)]
pub(crate) struct VertexResponse {
    pub predictions: Vec<String>,
}

/// Generate tokens from Vertex request
#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/vertex",
    request_body = VertexRequest,
    responses(
        (status = 200, description = "Generated Text", body = VertexResponse),
        (status = 424, description = "Generation Error", body = ErrorResponse,
            example = json!({"error": "Request failed during generation"})),
        (status = 429, description = "Model is overloaded", body = ErrorResponse,
            example = json!({"error": "Model is overloaded"})),
        (status = 422, description = "Input validation error", body = ErrorResponse,
            example = json!({"error": "Input validation error"})),
        (status = 500, description = "Incomplete generation", body = ErrorResponse,
            example = json!({"error": "Incomplete generation"})),
    )
)]
#[instrument(
    skip_all,
    fields(
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
pub(crate) async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::counter!("tgi_request_count").increment(1);

    // Check that there's at least one instance
    if req.instances.is_empty() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Input validation error".to_string(),
                error_type: "Input validation error".to_string(),
            }),
        ));
    }

    // Prepare futures for all instances
    let mut futures = Vec::with_capacity(req.instances.len());

    for instance in req.instances.into_iter() {
        let generate_request = match instance {
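            // Raw-prompt instances map directly onto an internal
            // GenerateRequest; only `max_new_tokens` and `seed` are
            // forwarded from the instance parameters, while sampling and
            // generation details are always enabled.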
            VertexInstance::Generate(instance) => GenerateRequest {
                inputs: instance.inputs.clone(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    do_sample: true,
                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
                    details: true,
                    decoder_input_details: true,
                    ..Default::default()
                },
            },
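            // Chat instances go through the shared chat-to-generate
            // conversion; the tool-usage flag it also returns is not
            // needed here.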
            VertexInstance::Chat(instance) => {
                let (generate_request, _using_tools): (GenerateRequest, bool) =
                    instance.try_into_generate(&infer)?;
                generate_request
            }
        };

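        // Clone the shared handles so the future below can own them.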
        let infer_clone = infer.clone();
        let compute_type_clone = compute_type.clone();
        let span_clone = span.clone();

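        // Defer the call itself so all instances can run concurrently.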
        futures.push(async move {
            generate_internal(
                Extension(infer_clone),
                compute_type_clone,
                Json(generate_request),
                span_clone,
            )
            .await
            .map(|(_, _, Json(generation))| generation.generated_text)
            .map_err(|_| {
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Incomplete generation".into(),
                        error_type: "Incomplete generation".into(),
                    }),
                )
            })
        });
    }

    // Run all futures in parallel and collect the results, failing the whole
    // request if any single generation failed
    let results = futures::future::join_all(futures).await;
    let predictions: Result<Vec<_>, _> = results.into_iter().collect();
    let predictions = predictions?;

    let response = VertexResponse { predictions };
    Ok((HeaderMap::new(), Json(response)).into_response())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Message, MessageBody, MessageContent};

    #[test]
    fn vertex_deserialization() {
        let string = serde_json::json!({
            "instances": [
                {
                    "messages": [{"role": "user", "content": "What's Deep Learning?"}],
                    "max_tokens": 128,
                    "top_p": 0.95,
                    "temperature": 0.7
                }
            ]
        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        assert_eq!(
            request,
            VertexRequest {
                instances: vec![VertexInstance::Chat(ChatRequest {
                    messages: vec![Message {
                        name: None,
                        role: "user".to_string(),
                        body: MessageBody::Content {
                            content: MessageContent::SingleText(
                                "What's Deep Learning?".to_string()
                            )
                        },
                    }],
                    max_tokens: Some(128),
                    top_p: Some(0.95),
                    temperature: Some(0.7),
                    ..Default::default()
                })]
            }
        );
    }
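
    // A companion check for the raw-prompt path (an addition, not part of the
    // original test suite): an instance without `messages` should fall
    // through the untagged enum to the `Generate` variant.
    #[test]
    fn vertex_generate_instance_deserialization() {
        let string = serde_json::json!({
            "instances": [
                {
                    "inputs": "What is Deep Learning?"
                }
            ]
        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        assert_eq!(
            request,
            VertexRequest {
                instances: vec![VertexInstance::Generate(GenerateVertexInstance {
                    inputs: "What is Deep Learning?".to_string(),
                    parameters: None,
                })]
            }
        );
    }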
}