Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)

feat: support vertex api

commit f4fd89b224 (parent 4139054b82)
@@ -20,6 +20,24 @@ pub(crate) type GenerateStreamResponse = (
     UnboundedReceiverStream<Result<InferStreamResponse, InferError>>,
 );
 
+#[derive(Clone, Deserialize, ToSchema)]
+pub(crate) struct Instance {
+    pub inputs: String,
+    pub parameters: Option<GenerateParameters>,
+}
+
+#[derive(Deserialize, ToSchema)]
+pub(crate) struct VertexRequest {
+    pub instances: Vec<Instance>,
+    #[allow(dead_code)]
+    pub parameters: Option<GenerateParameters>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize)]
+pub(crate) struct VertexResponse {
+    pub predictions: Vec<String>,
+}
+
 /// Hub type
 #[derive(Clone, Debug, Deserialize)]
 pub struct HubModelInfo {
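For orientation, the sketch below (not part of the commit) shows a JSON body matching the shape `VertexRequest` and `Instance` deserialize. The only per-instance parameters the new handler actually reads are `max_new_tokens` and `seed`, so those are the ones worth sending; the values here are made up.

// Illustrative only: a request body compatible with the `VertexRequest`
// struct above (field names from the commit; values invented).
// Requires the `serde_json` crate.
use serde_json::json;

fn main() {
    let body = json!({
        "instances": [
            {
                "inputs": "What is Deep Learning?",
                "parameters": { "max_new_tokens": 32, "seed": 42 }
            }
        ]
    });
    println!("{body}");
}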
@@ -153,7 +171,7 @@ pub struct Info {
     pub docker_label: Option<&'static str>,
 }
 
-#[derive(Clone, Debug, Deserialize, ToSchema)]
+#[derive(Clone, Debug, Deserialize, ToSchema, Default)]
 pub(crate) struct GenerateParameters {
     #[serde(default)]
     #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 1)]
@@ -21,6 +21,36 @@ use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{EnvFilter, Layer};
 
+#[allow(dead_code)] // many of the fields are not used
+#[derive(Debug)]
+struct VertexAIConfig {
+    aip_http_port: u16,
+    aip_predict_route: String,
+    aip_health_route: String,
+}
+
+impl VertexAIConfig {
+    fn new(aip_http_port: u16, aip_predict_route: String, aip_health_route: String) -> Self {
+        Self {
+            aip_http_port,
+            aip_predict_route,
+            aip_health_route,
+        }
+    }
+    fn to_env(&self) {
+        // NOTE: this will only set the values for this process
+        // NOTE: child processes cannot set env vars for their parents
+        // TODO: find a way to set the values for the whole system
+        // - maybe write to a file
+        // - maybe use a shell script to set the values
+        // - maybe these values are set upstream (before this process is started)
+        // - if set upstream maybe we read them in, if we need them?
+        std::env::set_var("AIP_HTTP_PORT", self.aip_http_port.to_string());
+        std::env::set_var("AIP_PREDICT_ROUTE", self.aip_predict_route.clone());
+        std::env::set_var("AIP_HEALTH_ROUTE", self.aip_health_route.clone());
+    }
+}
+
 /// App Configuration
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
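The `AIP_HTTP_PORT`, `AIP_PREDICT_ROUTE`, and `AIP_HEALTH_ROUTE` names follow the environment-variable contract Vertex AI uses for custom serving containers. As the NOTE comments point out, `std::env::set_var` only affects the current process; a minimal sketch (assumed, not part of the commit) of reading the values back in-process:

// Sketch only: mirrors what `VertexAIConfig::to_env` does and shows that
// the variables are visible within the same process afterwards.
fn main() {
    std::env::set_var("AIP_HTTP_PORT", "3000");
    std::env::set_var("AIP_PREDICT_ROUTE", "/vertex");
    std::env::set_var("AIP_HEALTH_ROUTE", "/health");

    // `set_var` is process-local: this lookup works here, but a parent
    // shell that launched the router would not see these values.
    let port: u16 = std::env::var("AIP_HTTP_PORT")
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(8080); // hypothetical fallback, not from the commit
    println!("AIP_HTTP_PORT = {port}");
}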
@@ -113,6 +143,11 @@ async fn main() -> Result<(), RouterError> {
         disable_grammar_support,
     } = args;
 
+    // Set Vertex AI config and update the env
+    let vertex_ai_config =
+        VertexAIConfig::new(args.port, "/vertex".to_string(), "/health".to_string());
+    vertex_ai_config.to_env();
+
     // Launch Tokio runtime
     init_logging(otlp_endpoint, json_output);
 
@@ -7,7 +7,7 @@ use crate::{
     ChatCompletionLogprobs, ChatRequest, CompatGenerateRequest, Details, ErrorResponse,
     FinishReason, GenerateParameters, GenerateRequest, GenerateResponse, HubModelInfo,
     HubTokenizerConfig, Infer, Info, Message, PrefillToken, SimpleToken, StreamDetails,
-    StreamResponse, Token, TokenizeResponse, Validation,
+    StreamResponse, Token, TokenizeResponse, Validation, VertexRequest, VertexResponse,
 };
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
@@ -16,8 +16,10 @@ use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use axum::{http, Json, Router};
 use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
+use futures::stream::FuturesUnordered;
 use futures::stream::StreamExt;
 use futures::Stream;
+use futures::TryStreamExt;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use std::convert::Infallible;
 use std::net::SocketAddr;
@@ -693,6 +695,92 @@ async fn chat_completions(
     }
 }
 
+/// Generate tokens from Vertex request
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/v1/endpoints",
+    request_body = VertexRequest,
+    responses(
+        (status = 200, description = "Generated Text", body = VertexResponse),
+        (status = 424, description = "Generation Error", body = ErrorResponse,
+            example = json ! ({"error": "Request failed during generation"})),
+        (status = 429, description = "Model is overloaded", body = ErrorResponse,
+            example = json ! ({"error": "Model is overloaded"})),
+        (status = 422, description = "Input validation error", body = ErrorResponse,
+            example = json ! ({"error": "Input validation error"})),
+        (status = 500, description = "Incomplete generation", body = ErrorResponse,
+            example = json ! ({"error": "Incomplete generation"})),
+    )
+)]
+#[instrument(
+    skip_all,
+    fields(
+        total_time,
+        validation_time,
+        queue_time,
+        inference_time,
+        time_per_token,
+        seed,
+    )
+)]
+async fn vertex_compatibility(
+    Extension(infer): Extension<Infer>,
+    Json(req): Json<VertexRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    metrics::increment_counter!("tgi_request_count");
+
+    // check that theres at least one instance
+    if req.instances.is_empty() {
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Input validation error".to_string(),
+                error_type: "Input validation error".to_string(),
+            }),
+        ));
+    }
+
+    // Process all instances
+    let predictions = req
+        .instances
+        .iter()
+        .map(|instance| {
+            let generate_request = GenerateRequest {
+                inputs: instance.inputs.clone(),
+                parameters: GenerateParameters {
+                    do_sample: true,
+                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
+                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
+                    details: true,
+                    decoder_input_details: true,
+                    ..Default::default()
+                },
+            };
+
+            async {
+                generate(Extension(infer.clone()), Json(generate_request))
+                    .await
+                    .map(|(_, Json(generation))| generation.generated_text)
+                    .map_err(|_| {
+                        (
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            Json(ErrorResponse {
+                                error: "Incomplete generation".into(),
+                                error_type: "Incomplete generation".into(),
+                            }),
+                        )
+                    })
+            }
+        })
+        .collect::<FuturesUnordered<_>>()
+        .try_collect::<Vec<_>>()
+        .await?;
+
+    let response = VertexResponse { predictions };
+    Ok((HeaderMap::new(), Json(response)).into_response())
+}
+
 /// Tokenize inputs
 #[utoipa::path(
     post,
@@ -953,6 +1041,7 @@ pub async fn run(
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
         .route("/v1/chat/completions", post(chat_completions))
+        .route("/vertex", post(vertex_compatibility))
        .route("/tokenize", post(tokenize))
         .route("/health", get(health))
         .route("/ping", get(health))
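With `/vertex` wired to `vertex_compatibility`, a client call could look like the hedged sketch below (assumes a router listening on localhost:3000 and the `reqwest`, `tokio`, and `serde_json` crates; none of this is part of the commit). Per `VertexResponse`, the reply is a JSON object with a `predictions` array of strings.

// Hypothetical client for the new /vertex route; endpoint path and field
// names come from the diff above, everything else is illustrative.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let resp: serde_json::Value = reqwest::Client::new()
        .post("http://localhost:3000/vertex")
        .json(&json!({
            "instances": [{ "inputs": "What is Deep Learning?" }]
        }))
        .send()
        .await?
        .json()
        .await?;
    // Expected shape: {"predictions": ["..."]}
    println!("{resp}");
    Ok(())
}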