From 61309b283265438927413e24e436e10026b9aa41 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Mon, 16 Dec 2024 00:32:57 -0800 Subject: [PATCH] Remove the default max_tokens for /v1/chat/completions (#251) --- Cargo.toml | 2 +- benchmark/src/main.rs | 2 +- router/src/server.rs | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index aafc8435..83972519 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] -tokenizers = { version = "0.19.1", features = ["http"] } +tokenizers = { version = "0.20.0", features = ["http"] } hf-hub = { version = "0.3.1", features = ["tokio"] } [profile.release] diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 86c2db70..935808b6 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -155,7 +155,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { // We need to download it outside of the Tokio runtime let params = FromPretrainedParameters { revision, - auth_token, + token: auth_token, ..Default::default() }; Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap() diff --git a/router/src/server.rs b/router/src/server.rs index 1edcc472..b9287080 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -608,7 +608,6 @@ async fn completions( .. 
} = req; - let max_new_tokens = max_tokens.or(Some(100)); let stop = stop.unwrap_or_default(); // enable greedy only when temperature is 0 let (do_sample, temperature) = match temperature { @@ -657,7 +656,7 @@ async fn completions( top_p: req.top_p, typical_p: None, do_sample, - max_new_tokens, + max_new_tokens: max_tokens, return_full_text: None, stop: stop.clone(), truncate: None, @@ -1019,7 +1018,6 @@ async fn chat_completions( } = req; let repetition_penalty = presence_penalty.map(|x| x + 2.0); - let max_new_tokens = max_tokens.or(Some(100)); let logprobs = logprobs.unwrap_or(false); let tool_prompt = tool_prompt.unwrap_or_default(); let stop = stop.unwrap_or_default(); @@ -1081,7 +1079,7 @@ async fn chat_completions( top_p: req.top_p, typical_p: None, do_sample, - max_new_tokens, + max_new_tokens: max_tokens, return_full_text: None, stop, truncate: None,