Modify the default for max_new_tokens. (#1097)

# What does this PR do? Clients that do not specify `max_new_tokens` will now implicitly get `max_new_tokens = max_total_tokens - input_length`. This is a significant behavior change, but one that seems more in line with what users expect from a standing server.   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: OlivierDehaene <olivier@huggingface.co>
2025-09-16 06:44:52 +00:00 · 2023-10-04 17:38:42 +02:00 · 2023-10-04 17:38:42 +02:00 · 6df43da0a4
commit 6df43da0a4
parent 66ce2fa7c1
3 changed files with 88 additions and 49 deletions
--- a/docs/openapi.json
+++ b/docs/openapi.json
@ -367,7 +367,7 @@
            "type": "integer",
            "format": "int32",
            "example": 1,
-            "minimum": 0.0
+            "minimum": 0
          },
          "prefill": {
            "type": "array",
@ -380,13 +380,22 @@
            "format": "int64",
            "example": 42,
            "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
@ -432,7 +441,7 @@
            "type": "integer",
            "format": "int32",
            "example": 1,
-            "minimum": 0.0
+            "minimum": 0
          },
          "prefill": {
            "type": "array",
@ -445,13 +454,22 @@
            "format": "int64",
            "example": 42,
            "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
@ -486,8 +504,8 @@
            "default": "null",
            "example": 1,
            "nullable": true,
-            "minimum": 0.0,
+            "minimum": 0,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "decoder_input_details": {
            "type": "boolean",
@ -505,10 +523,10 @@
          "max_new_tokens": {
            "type": "integer",
            "format": "int32",
-            "default": "20",
+            "default": "null",
-            "minimum": 0.0,
+            "example": "20",
-            "exclusiveMaximum": 512.0,
+            "nullable": true,
-            "exclusiveMinimum": 0.0
+            "minimum": 0
          },
          "repetition_penalty": {
            "type": "number",
@ -516,7 +534,7 @@
            "default": "null",
            "example": 1.03,
            "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "return_full_text": {
            "type": "boolean",
@ -530,8 +548,8 @@
            "default": "null",
            "example": "null",
            "nullable": true,
-            "minimum": 0.0,
+            "minimum": 0,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "stop": {
            "type": "array",
@ -549,7 +567,7 @@
            "default": "null",
            "example": 0.5,
            "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "top_k": {
            "type": "integer",
@ -557,7 +575,16 @@
            "default": "null",
            "example": 10,
            "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "top_n_tokens": {
            "type": "integer",
            "format": "int32",
            "default": "null",
            "example": 5,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "top_p": {
            "type": "number",
@ -565,15 +592,15 @@
            "default": "null",
            "example": 0.95,
            "nullable": true,
-            "maximum": 1.0,
+            "maximum": 1,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "truncate": {
            "type": "integer",
            "default": "null",
            "example": "null",
            "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
          },
          "typical_p": {
            "type": "number",
@ -581,8 +608,8 @@
            "default": "null",
            "example": 0.95,
            "nullable": true,
-            "maximum": 1.0,
+            "maximum": 1,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
          },
          "watermark": {
            "type": "boolean",
@ -653,38 +680,38 @@
            "type": "integer",
            "format": "int32",
            "example": "32000",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_best_of": {
            "type": "integer",
            "example": "2",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_concurrent_requests": {
            "type": "integer",
            "description": "Router Parameters",
            "example": "128",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_input_length": {
            "type": "integer",
            "example": "1024",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_stop_sequences": {
            "type": "integer",
            "example": "4",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_total_tokens": {
            "type": "integer",
            "example": "2048",
-            "minimum": 0.0
+            "minimum": 0
          },
          "max_waiting_tokens": {
            "type": "integer",
            "example": "20",
-            "minimum": 0.0
+            "minimum": 0
          },
          "model_device_type": {
            "type": "string",
@ -717,7 +744,7 @@
          "validation_workers": {
            "type": "integer",
            "example": "2",
-            "minimum": 0.0
+            "minimum": 0
          },
          "version": {
            "type": "string",
@ -743,7 +770,7 @@
            "type": "integer",
            "format": "int32",
            "example": 0,
-            "minimum": 0.0
+            "minimum": 0
          },
          "logprob": {
            "type": "number",
@ -771,14 +798,14 @@
            "type": "integer",
            "format": "int32",
            "example": 1,
-            "minimum": 0.0
+            "minimum": 0
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
          }
        }
      },
@ -794,6 +821,7 @@
                "$ref": "#/components/schemas/StreamDetails"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "generated_text": {
@ -804,6 +832,12 @@
          },
          "token": {
            "$ref": "#/components/schemas/Token"
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          }
        }
      },
@ -820,7 +854,7 @@
            "type": "integer",
            "format": "int32",
            "example": 0,
-            "minimum": 0.0
+            "minimum": 0
          },
          "logprob": {
            "type": "number",
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -107,8 +107,8 @@ pub(crate) struct GenerateParameters {
    #[schema(default = "false", example = true)]
    pub do_sample: bool,
    #[serde(default = "default_max_new_tokens")]
-    #[schema(exclusive_minimum = 0, exclusive_maximum = 512, default = "20")]
+    #[schema(nullable = true, default = "null", example = "20")]
-    pub max_new_tokens: u32,
+    pub max_new_tokens: Option<u32>,
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = false)]
    pub return_full_text: Option<bool>,
@ -140,8 +140,8 @@ pub(crate) struct GenerateParameters {
    pub top_n_tokens: Option<u32>,
 }
-fn default_max_new_tokens() -> u32 {
+fn default_max_new_tokens() -> Option<u32> {
-    20
+    None
 }
 fn default_parameters() -> GenerateParameters {
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -67,8 +67,8 @@ impl Validation {
        &self,
        inputs: String,
        truncate: Option<usize>,
-        max_new_tokens: u32,
+        max_new_tokens: Option<u32>,
-    ) -> Result<(String, usize), ValidationError> {
+    ) -> Result<(String, usize, u32), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
@ -84,6 +84,11 @@ impl Validation {
            let (inputs, input_length) = response_receiver.await.unwrap()?;
            // Get total tokens
            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
                max_new_tokens
            } else {
                self.max_total_tokens.saturating_sub(input_length) as u32
            };
            let total_tokens = input_length + max_new_tokens as usize;
            // Validate MaxTotalTokens
@ -104,7 +109,7 @@ impl Validation {
            }
            metrics::histogram!("tgi_request_input_length", input_length as f64);
-            Ok((inputs, input_length))
+            Ok((inputs, input_length, max_new_tokens))
        }
        // Return inputs without validation
        else {
@ -112,6 +117,11 @@ impl Validation {
            // However, the inputs will be truncated by the python servers
            // We make sure that truncate + max_new_tokens <= self.max_total_tokens
            let input_length = truncate.unwrap_or(self.max_input_length);
            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
                max_new_tokens
            } else {
                self.max_total_tokens.saturating_sub(input_length) as u32
            };
            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
@ -121,7 +131,7 @@ impl Validation {
                ));
            }
-            Ok((inputs, input_length))
+            Ok((inputs, input_length, max_new_tokens))
        }
    }
@ -200,7 +210,7 @@ impl Validation {
            })
            .unwrap_or(Ok(0))?;
-        if max_new_tokens == 0 {
+        if max_new_tokens == Some(0) {
            return Err(ValidationError::NegativeMaxNewTokens);
        }
@ -247,7 +257,7 @@ impl Validation {
            .unwrap_or(Ok(None))?;
        // Validate inputs
-        let (inputs, input_length) = self
+        let (inputs, input_length, max_new_tokens) = self
            .validate_input(request.inputs, truncate, max_new_tokens)
            .await?;
@ -426,7 +436,7 @@ mod tests {
        let max_new_tokens = 10;
        match validation
-            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .validate_input("Hello".to_string(), None, Some(max_new_tokens))
            .await
        {
            Err(ValidationError::MaxNewTokens(1, 10)) => (),
@ -455,7 +465,7 @@ mod tests {
        let max_new_tokens = 10;
        match validation
-            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .validate_input("Hello".to_string(), None, Some(max_new_tokens))
            .await
        {
            Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
@ -534,7 +544,6 @@ mod tests {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: Some(0.99),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
@ -549,7 +558,6 @@ mod tests {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_p: None,
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
@ -596,7 +604,6 @@ mod tests {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_n_tokens: Some(4),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
@ -608,7 +615,6 @@ mod tests {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_n_tokens: Some(0),
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })
@ -620,7 +626,6 @@ mod tests {
                inputs: "Hello".to_string(),
                parameters: GenerateParameters {
                    top_n_tokens: None,
                    max_new_tokens: 1,
                    ..default_parameters()
                },
            })