// text-generation-inference/router/src/vertex.rs

use crate::infer::Infer;
use crate::server::{generate_internal, ComputeType};
use crate::{
ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest, GrammarType, Message,
StreamOptions, Tool, ToolChoice,
};
use axum::extract::Extension;
use axum::http::{HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::Json;
use serde::{Deserialize, Serialize};
use tracing::instrument;
use utoipa::ToSchema;
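
/// A single raw text-generation instance inside a Vertex AI prediction request:
/// an `inputs` string plus optional `GenerateParameters`.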
#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct GenerateVertexInstance {
    #[schema(example = "What is Deep Learning?")]
    pub inputs: String,
    #[schema(nullable = true, default = "null", example = "null")]
    pub parameters: Option<GenerateParameters>,
}
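
/// A chat-style instance inside a Vertex AI prediction request: top-level `messages`
/// plus OpenAI-compatible sampling options under `parameters`.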
#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexChat {
    messages: Vec<Message>,
    // Any `messages` nested inside `parameters` is ignored; only the field above is used.
    #[serde(default)]
    parameters: VertexParameters,
}
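
/// OpenAI-style chat options accepted under `parameters` of a Vertex chat instance.
/// Mirrors the fields of `ChatRequest`, minus `messages`.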
#[derive(Clone, Deserialize, ToSchema, Serialize, Default)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexParameters {
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
    pub model: Option<String>,
    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
    /// decreasing the model's likelihood to repeat the same line verbatim.
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,
    /// UNUSED
    /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens
    /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,
    /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
    /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should
    /// result in a ban or exclusive selection of the relevant token.
    #[serde(default)]
    pub logit_bias: Option<Vec<f32>>,
    /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each
    /// output token returned in the content of message.
    #[serde(default)]
    #[schema(example = "false")]
    pub logprobs: Option<bool>,
    /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with
    /// an associated log probability. logprobs must be set to true if this parameter is used.
    #[serde(default)]
    #[schema(example = "5")]
    pub top_logprobs: Option<u32>,
    /// The maximum number of tokens that can be generated in the chat completion.
    #[serde(default)]
    #[schema(example = "32")]
    pub max_tokens: Option<u32>,
    /// UNUSED
    /// How many chat completion choices to generate for each input message. Note that you will be charged based on the
    /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
    #[serde(default)]
    #[schema(nullable = true, example = "2")]
    pub n: Option<u32>,
    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,
    /// increasing the model's likelihood to talk about new topics.
    #[serde(default)]
    #[schema(nullable = true, example = 0.1)]
    pub presence_penalty: Option<f32>,
    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,
    #[serde(default = "bool::default")]
    pub stream: bool,
    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,
    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
    /// lower values like 0.2 will make it more focused and deterministic.
    ///
    /// We generally recommend altering this or `top_p` but not both.
    #[serde(default)]
    #[schema(nullable = true, example = 1.0)]
    pub temperature: Option<f32>,
    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
    #[serde(default)]
    #[schema(nullable = true, example = 0.95)]
    pub top_p: Option<f32>,
    /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of
    /// functions the model may generate JSON inputs for.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub tools: Option<Vec<Tool>>,
    /// A prompt to be appended before the tools
    #[serde(default)]
    #[schema(
        nullable = true,
        example = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables."
    )]
    pub tool_prompt: Option<String>,
    /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub tool_choice: ToolChoice,
    /// Response format constraints for the generation.
    ///
    /// NOTE: A request can use `response_format` OR `tools` but not both.
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub response_format: Option<GrammarType>,
    /// A guideline to be used in the chat_template
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub guideline: Option<String>,
    /// Options for streaming response. Only set this when you set stream: true.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stream_options: Option<StreamOptions>,
}

impl From<VertexChat> for ChatRequest {
    fn from(val: VertexChat) -> Self {
        Self {
            messages: val.messages,
            frequency_penalty: val.parameters.frequency_penalty,
            guideline: val.parameters.guideline,
            logit_bias: val.parameters.logit_bias,
            logprobs: val.parameters.logprobs,
            max_tokens: val.parameters.max_tokens,
            model: val.parameters.model,
            n: val.parameters.n,
            presence_penalty: val.parameters.presence_penalty,
            response_format: val.parameters.response_format,
            seed: val.parameters.seed,
            stop: val.parameters.stop,
            stream_options: val.parameters.stream_options,
            stream: val.parameters.stream,
            temperature: val.parameters.temperature,
            tool_choice: val.parameters.tool_choice,
            tool_prompt: val.parameters.tool_prompt,
            tools: val.parameters.tools,
            top_logprobs: val.parameters.top_logprobs,
            top_p: val.parameters.top_p,
        }
    }
}
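
/// One entry of the Vertex `instances` array. Deserialization is untagged: a payload
/// carrying `inputs` becomes `Generate`, a payload carrying `messages` becomes `Chat`.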
#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
#[serde(untagged)]
pub(crate) enum VertexInstance {
    Generate(GenerateVertexInstance),
    Chat(VertexChat),
}
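
/// Body of a Vertex AI prediction request: a batch of independent instances.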
#[derive(Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexRequest {
    #[serde(rename = "instances")]
    pub instances: Vec<VertexInstance>,
}
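
/// Body of a Vertex AI prediction response: one generated text per input instance.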
#[derive(Clone, Deserialize, ToSchema, Serialize)]
pub(crate) struct VertexResponse {
    pub predictions: Vec<String>,
}

/// Generate tokens from Vertex request
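///
/// Each instance is handled independently: a raw `inputs` payload takes the text-generation
/// path, a `messages` payload takes the chat-template path. An illustrative request body,
/// assembled from the schema examples in this module (not a verbatim Vertex sample):
///
/// ```json
/// {
///   "instances": [
///     { "inputs": "What is Deep Learning?" },
///     {
///       "messages": [{ "role": "user", "content": "What's Deep Learning?" }],
///       "parameters": { "max_tokens": 128, "top_p": 0.95, "temperature": 0.7 }
///     }
///   ]
/// }
/// ```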
#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/vertex",
    request_body = VertexRequest,
    responses(
        (status = 200, description = "Generated Text", body = VertexResponse),
        (status = 424, description = "Generation Error", body = ErrorResponse,
            example = json!({"error": "Request failed during generation"})),
        (status = 429, description = "Model is overloaded", body = ErrorResponse,
            example = json!({"error": "Model is overloaded"})),
        (status = 422, description = "Input validation error", body = ErrorResponse,
            example = json!({"error": "Input validation error"})),
        (status = 500, description = "Incomplete generation", body = ErrorResponse,
            example = json!({"error": "Incomplete generation"})),
    )
)]
#[instrument(
    skip_all,
    fields(
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
pub(crate) async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::counter!("tgi_request_count").increment(1);

    // Check that there is at least one instance
    if req.instances.is_empty() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Input validation error".to_string(),
                error_type: "Input validation error".to_string(),
            }),
        ));
    }

    // Prepare futures for all instances
    let mut futures = Vec::with_capacity(req.instances.len());
    for instance in req.instances.into_iter() {
        let generate_request = match instance {
            VertexInstance::Generate(instance) => GenerateRequest {
                inputs: instance.inputs.clone(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    do_sample: true,
                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
                    details: true,
                    decoder_input_details: true,
                    ..Default::default()
                },
            },
            VertexInstance::Chat(instance) => {
                let chat_request: ChatRequest = instance.into();
                let (generate_request, _using_tools): (GenerateRequest, bool) =
                    chat_request.try_into_generate(&infer)?;
                generate_request
            }
        };
        let infer_clone = infer.clone();
        let compute_type_clone = compute_type.clone();
        let span_clone = span.clone();
        futures.push(async move {
            generate_internal(
                Extension(infer_clone),
                compute_type_clone,
                Json(generate_request),
                span_clone,
            )
            .await
            .map(|(_, Json(generation))| generation.generated_text)
            .map_err(|_| {
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Incomplete generation".into(),
                        error_type: "Incomplete generation".into(),
                    }),
                )
            })
        });
    }

    // Execute all futures in parallel and collect the results; any single failure fails the whole request
    let results = futures::future::join_all(futures).await;
    let predictions: Result<Vec<_>, _> = results.into_iter().collect();
    let predictions = predictions?;

    let response = VertexResponse { predictions };
    Ok((HeaderMap::new(), Json(response)).into_response())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Message, MessageContent};

    #[test]
    fn vertex_deserialization() {
        let string = serde_json::json!({
            "messages": [{"role": "user", "content": "What's Deep Learning?"}],
            "parameters": {
                "max_tokens": 128,
                "top_p": 0.95,
                "temperature": 0.7
            }
        });
        let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize");

        let string = serde_json::json!({
            "messages": [{"role": "user", "content": "What's Deep Learning?"}],
        });
        let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize");

        let string = serde_json::json!({
            "instances": [
                {
                    "messages": [{"role": "user", "content": "What's Deep Learning?"}],
                    "parameters": {
                        "max_tokens": 128,
                        "top_p": 0.95,
                        "temperature": 0.7
                    }
                }
            ]
        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        assert_eq!(
            request,
            VertexRequest {
                instances: vec![VertexInstance::Chat(VertexChat {
                    messages: vec![Message {
                        role: "user".to_string(),
                        content: MessageContent::SingleText("What's Deep Learning?".to_string()),
                        name: None,
                    }],
                    parameters: VertexParameters {
                        max_tokens: Some(128),
                        top_p: Some(0.95),
                        temperature: Some(0.7),
                        ..Default::default()
                    }
                })]
            }
        );
    }
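
    // Illustrative companion check (a sketch, not part of the original test suite): a payload
    // carrying `inputs` should take the `Generate` arm of the untagged `VertexInstance` enum.
    #[test]
    fn vertex_generate_instance_deserialization() {
        let string = serde_json::json!({
            "instances": [
                {"inputs": "What is Deep Learning?"}
            ]
        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        match request.instances.as_slice() {
            [VertexInstance::Generate(instance)] => {
                assert_eq!(instance.inputs, "What is Deep Learning?");
                assert!(instance.parameters.is_none());
            }
            other => panic!("expected a single Generate instance, got {other:?}"),
        }
    }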
}