Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 16:32:12 +00:00)
Fix PR.

parent 29a8d5a3a1
commit 40f693b6b9
@@ -354,6 +354,14 @@ Options:
           [env: NGROK_EDGE=]
 
 ```
+## BATCH_DIMENSION
+```shell
+      --batch-dimension
+          Specific flag for hardware targets that do not support unpadded inference. For those, we do not send the tokenizer to the router, so that all the scheduling assumes those pad tokens exist (and potentially even more)
+
+          [env: BATCH_DIMENSION=]
+
+```
 ## TOKENIZER_CONFIG_PATH
 ```shell
 
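For context on the docs hunk above: the new flag can be passed either on the command line or through its environment variable. A minimal usage sketch, assuming the launcher binary's usual name, clap's usual boolean env handling, and an illustrative model id:

```shell
# Flag form: enable padded-inference scheduling at launch
text-generation-launcher --model-id bigscience/bloom-560m --batch-dimension

# Environment form, matching the `[env: BATCH_DIMENSION=]` line in the docs
BATCH_DIMENSION=true text-generation-launcher --model-id bigscience/bloom-560m
```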
@@ -1040,7 +1040,7 @@ fn spawn_webserver(
         args.model_id,
     ];
 
-    if args.batch_dimension{
+    if args.batch_dimension {
         router_args.push("--batch-dimension".to_string());
     }
 
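The `spawn_webserver` hunk is purely a brace-spacing fix, but it shows the launcher's pattern of forwarding a boolean flag to the router process as a bare CLI argument. A minimal self-contained sketch of that pattern, with a hypothetical trimmed-down `Args` struct (the real launcher defines many more fields):

```rust
use clap::Parser;

/// Hypothetical stand-in for the launcher's argument struct.
#[derive(Parser, Debug)]
struct Args {
    /// Model to load; forwarded to the router verbatim.
    #[clap(long, env)]
    model_id: String,
    /// Padded-inference flag; `[env: BATCH_DIMENSION=]` in the docs hunk above.
    #[clap(long, env)]
    batch_dimension: bool,
}

fn main() {
    let args = Args::parse();

    // Build the router's argv, mirroring the diff: a boolean launcher
    // flag becomes a bare `--flag` argument only when it is set.
    let mut router_args = vec!["--model-id".to_string(), args.model_id];
    if args.batch_dimension {
        router_args.push("--batch-dimension".to_string());
    }

    println!("{router_args:?}");
}
```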
@@ -834,7 +834,7 @@ pub async fn run(
         max_top_n_tokens,
         max_input_length,
         max_total_tokens,
-        batch_dimension
+        batch_dimension,
     );
     let generation_health = Arc::new(AtomicBool::new(false));
     let health_ext = Health::new(client.clone(), generation_health.clone());
@@ -69,7 +69,7 @@ impl Validation {
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
-            batch_dimension
+            batch_dimension,
         }
     }
 
@@ -107,7 +107,7 @@ impl Validation {
     ) -> Result<(String, usize, u32), ValidationError> {
         // If we have a fast tokenizer
         if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
-            if self.batch_dimension{
+            if self.batch_dimension {
                 let input_length = encoding.len();
 
                 // Get total tokens
@@ -135,7 +135,6 @@ impl Validation {
                 ));
             }
 
-            //
             metrics::histogram!("tgi_request_input_length", input_length as f64);
             return Ok((inputs, input_length, max_new_tokens));
         }
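The validation hunks take the full encoding length (`encoding.len()`) as the input length when `batch_dimension` is set, so the token budget is charged as if every pad token were real. A minimal sketch of one plausible reading of that check; all names here (`check_budget`, `ValidationError::MaxTotalTokens`, the length parameters) are hypothetical stand-ins, not the repository's actual API:

```rust
/// Hypothetical stand-in error type for the sketch.
#[derive(Debug)]
enum ValidationError {
    MaxTotalTokens { max: usize, got: usize },
}

/// Budget check as suggested by the diff: when `batch_dimension` is set,
/// the scheduler must assume every pad token exists, so the padded
/// encoding length (not a trimmed length) counts against the budget.
fn check_budget(
    batch_dimension: bool,
    padded_len: usize,
    trimmed_len: usize,
    max_new_tokens: usize,
    max_total_tokens: usize,
) -> Result<usize, ValidationError> {
    let input_length = if batch_dimension { padded_len } else { trimmed_len };
    let total = input_length + max_new_tokens;
    if total > max_total_tokens {
        return Err(ValidationError::MaxTotalTokens { max: max_total_tokens, got: total });
    }
    Ok(input_length)
}

fn main() {
    // Once padding is counted, the same request can blow the budget.
    assert!(check_budget(false, 512, 200, 300, 512).is_ok());
    assert!(check_budget(true, 512, 200, 300, 512).is_err());
}
```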