Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-28 13:32:10 +00:00)
Commit da0f874d49: Prefer prefill instead of decode when max_waiting_tokens==0 (#18)
Parent: 60f63262db
@@ -310,8 +310,9 @@ async fn batching_task(
                 Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
             };

-            let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
+            let mut token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);

+            loop {
             // Try to get a new batch
             if let Some((mut new_entries, new_batch, span)) = queue
                 .next_batch(min_size, max_batch_prefill_tokens, token_budget)
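The first hunk makes token_budget mutable so that later code can shrink it as new prefill batches are admitted, and it opens a loop around the batch-building step (closed in the next hunk). The budget itself is computed with Rust's saturating subtraction, which clamps at zero instead of underflowing when the running batch already uses more tokens than the configured maximum. A minimal standalone sketch of that accounting, with made-up numbers rather than the router's real configuration:

fn remaining_token_budget(max_batch_total_tokens: u32, batch_max_tokens: u32) -> u32 {
    // saturating_sub clamps at zero instead of wrapping on underflow, so a fully
    // loaded batch simply leaves no budget for additional prefill batches.
    max_batch_total_tokens.saturating_sub(batch_max_tokens)
}

fn main() {
    // Hypothetical values: 16k total token budget, current batch holds 10k tokens.
    assert_eq!(remaining_token_budget(16_384, 10_240), 6_144);
    // If the current batch already exceeds the limit, the budget clamps to 0.
    assert_eq!(remaining_token_budget(16_384, 20_000), 0);
}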
@@ -344,9 +345,20 @@ async fn batching_task(
                     waiting_tokens = 1;
                     // Extend current batch with the new batch
                     if let Some(new_cached_batch) = new_cached_batch {
+                        token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
                         entries.extend(new_entries);
                         batches.push(new_cached_batch);
                     }
+                } else {
+                    // Break as there is no batch
+                    break;
+                }
+
+                // Loop again in case of max_waiting_tokens == 0
+                // to prefer doing next prefill. Break otherwise
+                if max_waiting_tokens != 0 {
+                    break;
+                }
             }

             // Create span for this batch to add context to inference calls
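Taken together, the change turns the single "try to get a new batch" step into a loop: as long as max_waiting_tokens == 0 and the token budget allows, the router keeps pulling new prefill batches before returning to decode, while any other max_waiting_tokens value still admits at most one new batch, as before. A simplified, self-contained sketch of that control flow (the CachedBatch struct and next_batch closure below are stand-ins, not the router's real types):

struct CachedBatch {
    max_tokens: u32,
}

// Keep admitting new prefill batches while budget remains when max_waiting_tokens == 0;
// otherwise accept at most one new batch, matching the previous behaviour.
fn extend_with_new_batches(
    mut token_budget: u32,
    max_waiting_tokens: usize,
    mut next_batch: impl FnMut(u32) -> Option<CachedBatch>,
    batches: &mut Vec<CachedBatch>,
) {
    loop {
        // Try to get a new batch that fits into the remaining token budget
        if let Some(new_cached_batch) = next_batch(token_budget) {
            token_budget = token_budget.saturating_sub(new_cached_batch.max_tokens);
            batches.push(new_cached_batch);
        } else {
            // Break as there is no batch
            break;
        }

        // Loop again in case of max_waiting_tokens == 0 to prefer doing the next
        // prefill. Break otherwise
        if max_waiting_tokens != 0 {
            break;
        }
    }
}

fn main() {
    // Hypothetical queue: three waiting batches of 4k tokens each, 10k total budget.
    let mut pending = vec![4_096u32, 4_096, 4_096];
    let mut batches = Vec::new();
    let next = |budget: u32| {
        if pending.first().map_or(false, |t| *t <= budget) {
            Some(CachedBatch { max_tokens: pending.remove(0) })
        } else {
            None
        }
    };
    // With max_waiting_tokens == 0, two batches fit (4k + 4k <= 10k); the third would
    // exceed the remaining budget, so the loop stops.
    extend_with_new_batches(10_240, 0, next, &mut batches);
    assert_eq!(batches.len(), 2);
}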