diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 34b91a92..86c2db70 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -147,7 +147,9 @@ fn main() -> Result<(), Box> { tracing::info!("Downloading tokenizer"); // Parse Huggingface hub token - let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); + let auth_token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); // Download and instantiate tokenizer // We need to download it outside of the Tokio runtime diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 970afa0e..2999e2e0 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -2,13 +2,13 @@ If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens) -If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example: +If you're using the CLI, set the `HF_TOKEN` environment variable. For example: ``` -export HUGGING_FACE_HUB_TOKEN= +export HF_TOKEN= ``` -If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below. +If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below. ```bash model=meta-llama/Llama-2-7b-chat-hf @@ -17,7 +17,7 @@ token= docker run --gpus all \ --shm-size 1g \ - -e HUGGING_FACE_HUB_TOKEN=$token \ + -e HF_TOKEN=$token \ -p 8080:80 \ -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \ --model-id $model diff --git a/examples/README.md b/examples/README.md index e605364e..226595c6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,7 +22,7 @@ To run benchmark use below command: python run_generation --model_id MODEL_ID ``` where `MODEL_ID` should be set to the same value as in the TGI server instance. -> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HUGGING_FACE_HUB_TOKEN=` with a valid Hugging Face Hub read token. +> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=` with a valid Hugging Face Hub read token. All possible parameters are described in the below table:
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index ae3f977b..7665f589 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -31,7 +31,7 @@ from text_generation.types import ( ) DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None) -HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +HUGGING_FACE_HUB_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN") DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") @@ -427,7 +427,7 @@ def launcher(event_loop): env["USE_FLASH_ATTENTION"] = "false" if HUGGING_FACE_HUB_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + env["HF_TOKEN"] = HUGGING_FACE_HUB_TOKEN volumes = [] if DOCKER_VOLUME: diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 88b0db57..effd0b46 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -578,7 +578,7 @@ fn shard_manager( // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // Detect rope scaling @@ -912,7 +912,7 @@ fn download_convert_model(args: &Args, running: Arc) -> Result<(), L // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // If args.weights_cache_override is some, pass it to the download process @@ -1212,7 +1212,7 @@ fn spawn_webserver( // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // Parse Compute type diff --git a/router/src/main.rs b/router/src/main.rs index 4f9f0f73..9f78a5b2 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -179,7 +179,9 @@ async fn main() -> Result<(), RouterError> { }); // Parse Huggingface hub token - let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); + let authorization_token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); // Tokenizer instance // This will only be used to validate payloads