From cc2ca4ac2224c272de317b5d5a4c9c022173e077 Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Sun, 15 Dec 2024 00:59:58 -0800
Subject: [PATCH 1/2] HF_TOKEN replaces HUGGING_FACE_HUB_TOKEN as it is
 deprecated (#253)

---
 benchmark/src/main.rs                             | 4 +++-
 docs/source/basic_tutorials/gated_model_access.md | 8 ++++----
 examples/README.md                                | 2 +-
 integration-tests/conftest.py                     | 4 ++--
 launcher/src/main.rs                              | 6 +++---
 router/src/main.rs                                | 4 +++-
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs
index 34b91a92..86c2db70 100644
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@@ -147,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     tracing::info!("Downloading tokenizer");
 
     // Parse Huggingface hub token
-    let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let auth_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
 
     // Download and instantiate tokenizer
     // We need to download it outside of the Tokio runtime
diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md
index 970afa0e..2999e2e0 100644
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@@ -2,13 +2,13 @@
 
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
 
-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
 
 ```
-export HUGGING_FACE_HUB_TOKEN=
+export HF_TOKEN=
 ```
 
-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
 
 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,7 +17,7 @@ token=
 
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
     -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \
     --model-id $model
diff --git a/examples/README.md b/examples/README.md
index e605364e..226595c6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -22,7 +22,7 @@ To run benchmark use below command:
 python run_generation --model_id MODEL_ID
 ```
 where `MODEL_ID` should be set to the same value as in the TGI server instance.
-> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HUGGING_FACE_HUB_TOKEN=` with a valid Hugging Face Hub read token.
+> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=` with a valid Hugging Face Hub read token.
 
 All possible parameters are described in the below table:
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index ae3f977b..7665f589 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -31,7 +31,7 @@ from text_generation.types import (
 )
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HUGGING_FACE_HUB_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 
@@ -427,7 +427,7 @@ def launcher(event_loop):
         env["USE_FLASH_ATTENTION"] = "false"
 
         if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+            env["HF_TOKEN"] = HUGGING_FACE_HUB_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 88b0db57..effd0b46 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -578,7 +578,7 @@ fn shard_manager(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Detect rope scaling
@@ -912,7 +912,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // If args.weights_cache_override is some, pass it to the download process
@@ -1212,7 +1212,7 @@ fn spawn_webserver(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Parse Compute type
diff --git a/router/src/main.rs b/router/src/main.rs
index 4f9f0f73..9f78a5b2 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -179,7 +179,9 @@ async fn main() -> Result<(), RouterError> {
     });
 
     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
 
     // Tokenizer instance
     // This will only be used to validate payloads

From 61309b283265438927413e24e436e10026b9aa41 Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Mon, 16 Dec 2024 00:32:57 -0800
Subject: [PATCH 2/2] Remove the default max_tokens for /v1/chat/completions
 (#251)

---
 Cargo.toml            | 2 +-
 benchmark/src/main.rs | 2 +-
 router/src/server.rs  | 6 ++----
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index aafc8435..83972519 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,7 @@ authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
 
 [workspace.dependencies]
-tokenizers = { version = "0.19.1", features = ["http"] }
+tokenizers = { version = "0.20.0", features = ["http"] }
 hf-hub = { version = "0.3.1", features = ["tokio"] }
 
 [profile.release]
diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs
index 86c2db70..935808b6 100644
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@@ -155,7 +155,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // We need to download it outside of the Tokio runtime
     let params = FromPretrainedParameters {
         revision,
-        auth_token,
+        token: auth_token,
         ..Default::default()
     };
     Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap()
diff --git a/router/src/server.rs b/router/src/server.rs
index 1edcc472..b9287080 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -608,7 +608,6 @@ async fn completions(
         ..
     } = req;
 
-    let max_new_tokens = max_tokens.or(Some(100));
     let stop = stop.unwrap_or_default();
     // enable greedy only when temperature is 0
     let (do_sample, temperature) = match temperature {
@@ -657,7 +656,7 @@ async fn completions(
         top_p: req.top_p,
         typical_p: None,
         do_sample,
-        max_new_tokens,
+        max_new_tokens: max_tokens,
         return_full_text: None,
         stop: stop.clone(),
         truncate: None,
@@ -1019,7 +1018,6 @@ async fn chat_completions(
     } = req;
 
     let repetition_penalty = presence_penalty.map(|x| x + 2.0);
-    let max_new_tokens = max_tokens.or(Some(100));
     let logprobs = logprobs.unwrap_or(false);
     let tool_prompt = tool_prompt.unwrap_or_default();
     let stop = stop.unwrap_or_default();
@@ -1081,7 +1079,7 @@ async fn chat_completions(
         top_p: req.top_p,
         typical_p: None,
         do_sample,
-        max_new_tokens,
+        max_new_tokens: max_tokens,
         return_full_text: None,
         stop,
         truncate: None,
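
A minimal standalone sketch of the token-resolution fallback that PATCH 1/2 applies in `benchmark/src/main.rs` and `router/src/main.rs` (the `hub_token` helper and the `main` demo below are illustrative only, not part of either patch): prefer `HF_TOKEN`, and fall back to the deprecated `HUGGING_FACE_HUB_TOKEN` only when the new variable is unset.

```rust
use std::env;

/// Resolve the Hugging Face Hub token: read HF_TOKEN first and only fall
/// back to the deprecated HUGGING_FACE_HUB_TOKEN when HF_TOKEN is not set.
/// (Illustrative helper; the patched binaries inline this same expression.)
fn hub_token() -> Option<String> {
    env::var("HF_TOKEN")
        .or_else(|_| env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok()
}

fn main() {
    match hub_token() {
        Some(_) => println!("Hub token found in the environment"),
        None => println!("No Hub token set; gated or private models will not be accessible"),
    }
}
```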