From deec30f89307c7e51ccc609fb2d0ce1e920505b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Thu, 5 Sep 2024 15:09:29 +0200
Subject: [PATCH] hotfix: avoid non-prefilled block use when using prefix caching (#2489)

The minimum batch size logic could cause prefix blocks to be
deallocated without prefill. The next allocation of the same
prefix would then use garbage blocks.
---
 backends/v3/src/backend.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index 05a263705..a47e62dcb 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                     None
                 } else {
                     // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    // TODO: temporarily disable to avoid incorrect deallocation +
+                    // reallocation when using prefix caching.
+                    // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    None
                 };
 
                 let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);