From 61309b283265438927413e24e436e10026b9aa41 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Mon, 16 Dec 2024 00:32:57 -0800 Subject: [PATCH] Remove the default max_tokens for /v1/chat/completions (#251) --- Cargo.toml | 2 +- benchmark/src/main.rs | 2 +- router/src/server.rs | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index aafc8435..83972519 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] -tokenizers = { version = "0.19.1", features = ["http"] } +tokenizers = { version = "0.20.0", features = ["http"] } hf-hub = { version = "0.3.1", features = ["tokio"] } [profile.release] diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 86c2db70..935808b6 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -155,7 +155,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { // We need to download it outside of the Tokio runtime let params = FromPretrainedParameters { revision, - auth_token, + token: auth_token, ..Default::default() }; Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap() diff --git a/router/src/server.rs b/router/src/server.rs index 1edcc472..b9287080 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -608,7 +608,6 @@ async fn completions( .. 
} = req; - let max_new_tokens = max_tokens.or(Some(100)); let stop = stop.unwrap_or_default(); // enable greedy only when temperature is 0 let (do_sample, temperature) = match temperature { @@ -657,7 +656,7 @@ async fn completions( top_p: req.top_p, typical_p: None, do_sample, - max_new_tokens, + max_new_tokens: max_tokens, return_full_text: None, stop: stop.clone(), truncate: None, @@ -1019,7 +1018,6 @@ async fn chat_completions( } = req; let repetition_penalty = presence_penalty.map(|x| x + 2.0); - let max_new_tokens = max_tokens.or(Some(100)); let logprobs = logprobs.unwrap_or(false); let tool_prompt = tool_prompt.unwrap_or_default(); let stop = stop.unwrap_or_default(); @@ -1081,7 +1079,7 @@ async fn chat_completions( top_p: req.top_p, typical_p: None, do_sample, - max_new_tokens, + max_new_tokens: max_tokens, return_full_text: None, stop, truncate: None,