Merge branch 'habana-main' into 2.3.0
commit 15de6c9195
@@ -155,7 +155,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             // We need to download it outside of the Tokio runtime
             let params = FromPretrainedParameters {
                 revision,
-                auth_token,
+                token: auth_token,
                 ..Default::default()
             };
             Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap()
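The hunk above tracks a field rename in the `tokenizers` crate: `FromPretrainedParameters` now takes the Hub credential as `token` rather than `auth_token`. A minimal sketch of the adjusted call, assuming a crate version with the renamed field and the `http` feature enabled (the wrapper function is hypothetical):

```rust
use tokenizers::{FromPretrainedParameters, Tokenizer};

// Hypothetical wrapper around the call shown in the hunk above.
fn load_tokenizer(name: &str, revision: String, auth_token: Option<String>) -> Tokenizer {
    let params = FromPretrainedParameters {
        revision,
        token: auth_token, // this field was previously named `auth_token`
        ..Default::default()
    };
    // Blocking download of the tokenizer files from the Hugging Face Hub.
    Tokenizer::from_pretrained(name, Some(params)).unwrap()
}
```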
@@ -22,7 +22,7 @@ To run benchmark use below command:
 python run_generation --model_id MODEL_ID
 ```
 where `MODEL_ID` should be set to the same value as in the TGI server instance.
-> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HUGGING_FACE_HUB_TOKEN=<token>` with a valid Hugging Face Hub read token.
+> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=<token>` with a valid Hugging Face Hub read token.
 
 All possible parameters are described in the below table:
 <div align="left">
@@ -32,7 +32,7 @@ from text_generation.types import (
 )
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HUGGING_FACE_HUB_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
 
@@ -498,8 +498,8 @@ def launcher(event_loop):
         if attention is not None:
             env["ATTENTION"] = attention
 
-        if HF_TOKEN is not None:
-            env["HF_TOKEN"] = HF_TOKEN
+        if HUGGING_FACE_HUB_TOKEN is not None:
+            env["HF_TOKEN"] = HUGGING_FACE_HUB_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:
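The two conftest hunks keep backward compatibility on the test side: the token is looked up under the new `HF_TOKEN` name first, falling back to the legacy `HUGGING_FACE_HUB_TOKEN`, and is always exported to the launched container as `HF_TOKEN`. The same fallback, sketched in Rust for illustration (the helper name is hypothetical):

```rust
use std::env;

// Hypothetical helper mirroring the conftest lookup: prefer the new
// HF_TOKEN variable and fall back to the legacy HUGGING_FACE_HUB_TOKEN.
// Note: unlike Python's `or`, this treats an empty string as set.
fn hub_token() -> Option<String> {
    env::var("HF_TOKEN")
        .or_else(|_| env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok()
}

fn main() {
    match hub_token() {
        Some(_) => println!("read token available; gated models can be pulled"),
        None => println!("no token set; only public models are reachable"),
    }
}
```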
@@ -689,7 +689,6 @@ async fn completions(
         ..
     } = req;
 
-    let max_new_tokens = max_tokens.or(Some(100));
     let stop = stop.unwrap_or_default();
     // enable greedy only when temperature is 0
     let (do_sample, temperature) = match temperature {
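The deleted line had silently defaulted `max_new_tokens` to 100 whenever a completions request omitted `max_tokens`; after this merge the `Option` is forwarded as-is, presumably so a single default can be applied further downstream. A small self-contained illustration of what `Option::or` was doing:

```rust
fn main() {
    let omitted: Option<u32> = None;
    let explicit: Option<u32> = Some(512);

    // Old behavior: an absent max_tokens was replaced with Some(100)...
    assert_eq!(omitted.or(Some(100)), Some(100));
    // ...while an explicit value passed through untouched.
    assert_eq!(explicit.or(Some(100)), Some(512));

    println!("Option::or only substitutes the default when the value is None");
}
```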
@@ -740,7 +739,7 @@ async fn completions(
         top_p: req.top_p,
         typical_p: None,
         do_sample,
-        max_new_tokens,
+        max_new_tokens: max_tokens,
         return_full_text: None,
         stop: stop.clone(),
         truncate: None,
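Combined with the previous hunk, the completions route now forwards the raw `max_tokens` option into the generation parameters instead of a locally resolved value. A schematic reduction of the struct literal visible in the hunk (the struct definition and field types are assumptions; the real router struct has many more fields):

```rust
// Schematic stand-in for the parameter struct in the hunk above;
// only the fields visible in the diff are kept.
#[derive(Debug)]
struct GenerateParameters {
    top_p: Option<f32>,
    typical_p: Option<f32>,
    do_sample: bool,
    max_new_tokens: Option<u32>, // now carries the unresolved request value
    stop: Vec<String>,
}

fn main() {
    let max_tokens: Option<u32> = None; // request omitted max_tokens
    let params = GenerateParameters {
        top_p: None,
        typical_p: None,
        do_sample: false,
        max_new_tokens: max_tokens, // passed through without a local default
        stop: Vec::new(),
    };
    println!("{params:?}");
}
```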