Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 23:12:07 +00:00)
hotfix: avoid non-prefilled block use when using prefix caching (#2489)
The minimum batch size logic could cause prefix blocks to be deallocated without prefill. The next allocation of the same prefix would then use garbage blocks.
parent 34a6399a50
commit c7b495f97d
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                 None
             } else {
                 // Minimum batch size
-                Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                // TODO: temporarily disable to avoid incorrect deallocation +
+                //       reallocation when using prefix caching.
+                // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                None
             };
 
             let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
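To make the failure mode described in the commit message concrete, here is a minimal, self-contained Rust sketch. The PrefixCache type, its fields, and its methods are hypothetical simplifications invented for illustration, not TGI's actual allocator; the point is only the ordering bug: free() records blocks under a prefix regardless of whether prefill ever populated them, so a later allocate() of the same prefix reports a cache hit on garbage blocks.

use std::collections::HashMap;

/// Hypothetical, heavily simplified stand-in for a prefix-caching allocator.
struct PrefixCache {
    /// Prefix text -> block ids believed to hold that prefix's KV entries.
    cached: HashMap<String, Vec<u32>>,
    next_block: u32,
}

impl PrefixCache {
    /// Returns (blocks, cache_hit). On a hit, prefill is skipped for the prefix.
    fn allocate(&mut self, prefix: &str) -> (Vec<u32>, bool) {
        if let Some(blocks) = self.cached.get(prefix) {
            return (blocks.clone(), true);
        }
        let blocks = vec![self.next_block];
        self.next_block += 1;
        (blocks, false)
    }

    /// Bug surface: freeing registers the blocks under the prefix whether or
    /// not prefill ever wrote their KV entries.
    fn free(&mut self, prefix: &str, blocks: Vec<u32>) {
        self.cached.insert(prefix.to_string(), blocks);
    }
}

fn main() {
    let mut cache = PrefixCache { cached: HashMap::new(), next_block: 0 };

    // Blocks are allocated for a request's prefix...
    let (blocks, _) = cache.allocate("You are a helpful assistant.");

    // ...but the batch is rejected by the minimum-batch-size check, so the
    // blocks are freed before prefill ever ran.
    cache.free("You are a helpful assistant.", blocks);

    // The next request with the same prefix now hits the cache and reuses
    // blocks whose KV entries were never written: garbage at decode time.
    let (_, hit) = cache.allocate("You are a helpful assistant.");
    assert!(hit);
}

The change is deliberately a hotfix: rather than teaching the allocator to distinguish prefilled from non-prefilled blocks, it forces min_size to None, disabling the minimum-batch-size heuristic entirely (hence the now-unused _waiting_served_ratio parameter), with the TODO marking it for a proper fix later.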