Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-12 04:44:52 +00:00
feat: add guideline to chat request and template
This commit is contained in:
parent 6e127dcc96
commit 3b25cd3213
@@ -2080,4 +2080,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
-}
+}
@@ -9,7 +9,7 @@ We recommend using the official quantization scripts for creating your quants:
 2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
 3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

-For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.
+For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.

 ## Quantization with bitsandbytes, EETQ & fp8
@@ -69,4 +69,4 @@ text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-
 You can learn more about the quantization options by running `text-generation-server quantize --help`.

 If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
-You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
+You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
@@ -48,6 +48,7 @@ impl ChatTemplate {

     pub(crate) fn apply(
         &self,
+        guideline: Option<&str>,
         mut messages: Vec<Message>,
         grammar_with_prompt: Option<(GrammarType, String)>,
     ) -> Result<String, InferError> {
@@ -65,6 +66,7 @@ impl ChatTemplate {

         self.template
             .render(ChatTemplateInputs {
+                guideline,
                 messages,
                 bos_token: self.bos_token.as_deref(),
                 eos_token: self.eos_token.as_deref(),
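TGI's router renders chat templates with minijinja, and the new `guideline` value is simply one more variable exposed to the template. A minimal standalone sketch of that idea (the template string and message shape below are illustrative assumptions, not TGI's real model-supplied template):

```rust
use minijinja::{context, Environment};

fn main() {
    let mut env = Environment::new();
    // Hypothetical template: real chat templates ship with the model.
    env.add_template(
        "chat",
        "{% if guideline %}Guideline: {{ guideline }}\n{% endif %}\
         {% for m in messages %}{{ m.role }}: {{ m.content }}\n{% endfor %}",
    )
    .unwrap();

    let rendered = env
        .get_template("chat")
        .unwrap()
        .render(context! {
            guideline => "Answer politely.", // upstream this is an Option<&str>
            messages => vec![context! { role => "user", content => "Hi!" }],
        })
        .unwrap();
    assert!(rendered.starts_with("Guideline: Answer politely."));
    println!("{rendered}");
}
```

Because the `{% if guideline %}` guard only fires when the variable is set, templates that ignore the field keep rendering exactly as before.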
@@ -138,13 +138,14 @@ impl Infer {
     #[instrument(skip_all)]
     pub(crate) fn apply_chat_template(
         &self,
+        guideline: Option<String>,
         messages: Vec<Message>,
         grammar_with_prompt: Option<(GrammarType, String)>,
     ) -> Result<String, InferError> {
         self.chat_template
             .as_ref()
             .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
-            .apply(messages, grammar_with_prompt)
+            .apply(guideline.as_deref(), messages, grammar_with_prompt)
             .map_err(|e| {
                 metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
                 tracing::error!("{e}");
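The only wrinkle at this call site is the ownership handoff: `Infer` owns an `Option<String>` while `ChatTemplate::apply` takes `Option<&str>`, and `Option::as_deref` bridges the two without cloning. A tiny self-contained illustration:

```rust
fn main() {
    let guideline: Option<String> = Some("Stay in character.".to_string());

    // as_deref borrows the inner String as &str, leaving `guideline` usable.
    let borrowed: Option<&str> = guideline.as_deref();
    assert_eq!(borrowed, Some("Stay in character."));

    // None passes through unchanged.
    let empty: Option<String> = None;
    assert_eq!(empty.as_deref(), None::<&str>);
}
```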
@@ -829,6 +829,11 @@ pub(crate) struct ChatRequest {
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = "null")]
     pub response_format: Option<GrammarType>,
+
+    /// A guideline to be used in the chat_template
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub guideline: Option<String>,
 }

 fn default_tool_prompt() -> Option<String> {
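Since the field is both `Option` and `#[serde(default)]`, existing clients that never send `guideline` keep working. A runnable sketch against a stripped-down stand-in for `ChatRequest` (the struct below is hypothetical, reduced to the one new field):

```rust
use serde::Deserialize;

// Hypothetical stand-in for ChatRequest with only the new field.
#[derive(Deserialize, Debug)]
struct ChatRequestLite {
    #[serde(default)]
    guideline: Option<String>,
}

fn main() {
    // Omitted field -> None, so old request bodies still deserialize.
    let old: ChatRequestLite = serde_json::from_str("{}").unwrap();
    assert!(old.guideline.is_none());

    let new: ChatRequestLite =
        serde_json::from_str(r#"{"guideline": "Cite your sources."}"#).unwrap();
    assert_eq!(new.guideline.as_deref(), Some("Cite your sources."));
}
```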
@@ -936,6 +941,7 @@ pub(crate) struct ChatTemplateInputs<'a> {
     add_generation_prompt: bool,
     tools: Option<&'a str>,
     tools_prompt: Option<&'a str>,
+    guideline: Option<&'a str>,
 }

 #[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
@@ -141,6 +141,7 @@ async fn get_chat_tokenize(
         tool_prompt,
         temperature,
         response_format,
+        guideline,
         ..
     } = req;

@@ -151,6 +152,7 @@ async fn get_chat_tokenize(
         tools,
         tool_choice,
         &tool_prompt,
+        guideline,
         messages,
     )?;
@@ -1123,6 +1125,7 @@ async fn chat_completions(
         tool_prompt,
         temperature,
         response_format,
+        guideline,
         ..
     } = req;

@@ -1142,6 +1145,7 @@ async fn chat_completions(
         tools,
         tool_choice,
         &tool_prompt,
+        guideline,
         messages,
     )?;
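Both handlers extract the new field the same way: destructure the request, take the fields they need, and discard the rest with `..`. A compact standalone version (the `Req` struct is a made-up stand-in):

```rust
// Made-up stand-in for the request struct.
#[allow(dead_code)]
struct Req {
    temperature: Option<f32>,
    guideline: Option<String>,
    seed: Option<u64>,
}

fn main() {
    let req = Req {
        temperature: Some(0.7),
        guideline: Some("No spoilers.".into()),
        seed: None,
    };

    // `..` ignores every field we don't name, so adding more fields
    // to the request later does not break this pattern.
    let Req { guideline, .. } = req;
    assert_eq!(guideline.as_deref(), Some("No spoilers."));
}
```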
@@ -2402,6 +2406,7 @@ fn prepare_chat_input(
     tools: Option<Vec<Tool>>,
     tool_choice: ToolChoice,
     tool_prompt: &str,
+    guideline: Option<String>,
     messages: Vec<Message>,
 ) -> Result<PreparedInput, InferError> {
     if response_format.is_some() && tools.is_some() {

@@ -2411,7 +2416,7 @@ fn prepare_chat_input(
     }

     if let Some(format) = response_format {
-        let inputs = infer.apply_chat_template(messages, None)?;
+        let inputs = infer.apply_chat_template(guideline, messages, None)?;
         return Ok((inputs, Some(format), None));
     }

@@ -2423,6 +2428,6 @@ fn prepare_chat_input(
     let tools_grammar_prompt = tool_grammar
         .as_ref()
         .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt.into()));
-    let inputs = infer.apply_chat_template(messages, tools_grammar_prompt)?;
+    let inputs = infer.apply_chat_template(guideline, messages, tools_grammar_prompt)?;
     Ok((inputs, grammar, tool_grammar))
 }
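Taken together, `prepare_chat_input` now threads the guideline into the template on both branches: the `response_format` fast path and the tool-grammar path. A condensed sketch of that control flow with simplified stand-in types (everything below is illustrative, not the real router types):

```rust
// Simplified stand-ins for the real router types.
type Grammar = String;

fn apply_chat_template(
    guideline: Option<&str>,
    messages: &[&str],
    grammar_with_prompt: Option<(&Grammar, &str)>,
) -> String {
    // Stub: record what reached the template layer.
    format!(
        "guideline={guideline:?} messages={messages:?} has_tool_prompt={}",
        grammar_with_prompt.is_some()
    )
}

fn prepare_chat_input(
    response_format: Option<Grammar>,
    tool_grammar: Option<Grammar>,
    tool_prompt: &str,
    guideline: Option<&str>,
    messages: &[&str],
) -> (String, Option<Grammar>) {
    // Branch 1: an explicit response_format wins; no tool prompt is used.
    if let Some(format) = response_format {
        return (apply_chat_template(guideline, messages, None), Some(format));
    }
    // Branch 2: a tool grammar (if any) rides along with its prompt.
    let inputs = apply_chat_template(
        guideline,
        messages,
        tool_grammar.as_ref().map(|t| (t, tool_prompt)),
    );
    (inputs, tool_grammar)
}

fn main() {
    let out = prepare_chat_input(None, None, "use tools", Some("Be brief."), &["hi"]);
    println!("{}", out.0);
}
```

Either way the guideline reaches `apply_chat_template`, which is why a single optional request field is enough to cover every chat entry point.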