diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 2ee3d7c5..2ec9c882 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -155,7 +155,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { // We need to download it outside of the Tokio runtime let params = FromPretrainedParameters { revision, - auth_token, + token: auth_token, ..Default::default() }; Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap() diff --git a/examples/README.md b/examples/README.md index e605364e..226595c6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,7 +22,7 @@ To run benchmark use below command: python run_generation --model_id MODEL_ID ``` where `MODEL_ID` should be set to the same value as in the TGI server instance. -> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HUGGING_FACE_HUB_TOKEN=` with a valid Hugging Face Hub read token. +> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=` with a valid Hugging Face Hub read token. All possible parameters are described in the below table:
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index eb55ebb9..14523cd4 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -32,7 +32,7 @@ from text_generation.types import ( ) DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None) -HF_TOKEN = os.getenv("HF_TOKEN", None) +HUGGING_FACE_HUB_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN") DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") DOCKER_DEVICES = os.getenv("DOCKER_DEVICES") @@ -498,8 +498,8 @@ def launcher(event_loop): if attention is not None: env["ATTENTION"] = attention - if HF_TOKEN is not None: - env["HF_TOKEN"] = HF_TOKEN + if HUGGING_FACE_HUB_TOKEN is not None: + env["HF_TOKEN"] = HUGGING_FACE_HUB_TOKEN volumes = [] if DOCKER_VOLUME: diff --git a/router/src/server.rs b/router/src/server.rs index 15eb1db1..1a909f0f 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -689,7 +689,6 @@ async fn completions( .. } = req; - let max_new_tokens = max_tokens.or(Some(100)); let stop = stop.unwrap_or_default(); // enable greedy only when temperature is 0 let (do_sample, temperature) = match temperature { @@ -740,7 +739,7 @@ async fn completions( top_p: req.top_p, typical_p: None, do_sample, - max_new_tokens, + max_new_tokens: max_tokens, return_full_text: None, stop: stop.clone(), truncate: None,