HF_TOKEN replaces HUGGING_FACE_HUB_TOKEN as it is deprecated (#253)

Sun Choi authored 2024-12-15 00:59:58 -08:00 · committed by GitHub
parent d49ce00f40
commit cc2ca4ac22
6 changed files with 16 additions and 12 deletions

View File

@@ -147,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     tracing::info!("Downloading tokenizer");
 
     // Parse Huggingface hub token
-    let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let auth_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
 
     // Download and instantiate tokenizer
     // We need to download it outside of the Tokio runtime
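The hunk above now prefers `HF_TOKEN` and only falls back to the deprecated `HUGGING_FACE_HUB_TOKEN`. A minimal, standalone sketch of that lookup order (the `hub_token` helper is hypothetical, for illustration only):

```rust
use std::env;

/// Hypothetical helper mirroring the lookup order introduced in this commit:
/// prefer HF_TOKEN, fall back to the deprecated HUGGING_FACE_HUB_TOKEN.
fn hub_token() -> Option<String> {
    env::var("HF_TOKEN")
        .or_else(|_| env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok()
}

fn main() {
    match hub_token() {
        Some(_) => println!("Hugging Face Hub token found"),
        None => println!("no token set; gated or private repositories will be inaccessible"),
    }
}
```

Because `or_else` only runs when the first lookup fails, setups that still export only the old variable keep working during the transition.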

View File

@@ -2,13 +2,13 @@
 
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
 
-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
 
 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```
 
-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
 
 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,7 +17,7 @@ token=<your READ token>
 
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
     -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \
     --model-id $model

View File

@@ -22,7 +22,7 @@ To run benchmark use below command:
 python run_generation --model_id MODEL_ID
 ```
 where `MODEL_ID` should be set to the same value as in the TGI server instance.
-> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HUGGING_FACE_HUB_TOKEN=<token>` with a valid Hugging Face Hub read token.
+> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=<token>` with a valid Hugging Face Hub read token.
 
 All possible parameters are described in the below table:
 <div align="left">

View File

@@ -31,7 +31,7 @@ from text_generation.types import (
 )
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HUGGING_FACE_HUB_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
@@ -427,7 +427,7 @@ def launcher(event_loop):
             env["USE_FLASH_ATTENTION"] = "false"
 
         if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+            env["HF_TOKEN"] = HUGGING_FACE_HUB_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:

View File

@@ -578,7 +578,7 @@ fn shard_manager(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Detect rope scaling
@@ -912,7 +912,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // If args.weights_cache_override is some, pass it to the download process
@@ -1212,7 +1212,7 @@ fn spawn_webserver(
 
     // Parse Inference API token
    if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
    };
 
     // Parse Compute type
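In all three launcher hunks the value still comes from `HF_API_TOKEN`; only the name under which it is handed to the child process changes. A rough sketch of that forwarding, assuming a plain vector of environment pairs as in the launcher (the `forward_hf_token` helper is hypothetical):

```rust
use std::env;
use std::ffi::OsString;

/// Hypothetical helper: if HF_API_TOKEN is set, forward it to a child process
/// under the new HF_TOKEN name, as the launcher hunks above do.
fn forward_hf_token(envs: &mut Vec<(OsString, OsString)>) {
    if let Ok(api_token) = env::var("HF_API_TOKEN") {
        envs.push(("HF_TOKEN".into(), api_token.into()));
    }
}

fn main() {
    let mut envs: Vec<(OsString, OsString)> = Vec::new();
    forward_hf_token(&mut envs);
    // In the launcher, pairs like these end up in the environment of the
    // spawned shard, download, and webserver processes.
    println!("forwarding {} token variable(s)", envs.len());
}
```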

View File

@@ -179,7 +179,9 @@ async fn main() -> Result<(), RouterError> {
     });
 
     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
 
     // Tokenizer instance
     // This will only be used to validate payloads