Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 23:12:07 +00:00)
hotfix: avoid non-prefilled block use when using prefix caching (#2489)
The minimum batch size logic could cause prefix blocks to be deallocated without prefill. The next allocation of the same prefix would then use garbage blocks.
parent 34a6399a50
commit c7b495f97d
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                 None
             } else {
                 // Minimum batch size
-                Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                // TODO: temporarily disable to avoid incorrect deallocation +
+                //       reallocation when using prefix caching.
+                // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                None
             };
 
             let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
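To make the failure mode described in the commit message concrete, here is a minimal, self-contained Rust sketch. The PrefixCache type, its fields, and its methods are hypothetical simplifications invented for illustration, not TGI's actual allocator; the point is only the ordering bug: free() records blocks under a prefix regardless of whether prefill ever populated them, so a later allocate() of the same prefix reports a cache hit on garbage blocks.

use std::collections::HashMap;

/// Hypothetical, heavily simplified stand-in for a prefix-caching allocator.
struct PrefixCache {
    /// Prefix text -> block ids believed to hold that prefix's KV entries.
    cached: HashMap<String, Vec<u32>>,
    next_block: u32,
}

impl PrefixCache {
    /// Returns (blocks, cache_hit). On a hit, prefill is skipped for the prefix.
    fn allocate(&mut self, prefix: &str) -> (Vec<u32>, bool) {
        if let Some(blocks) = self.cached.get(prefix) {
            return (blocks.clone(), true);
        }
        let blocks = vec![self.next_block];
        self.next_block += 1;
        (blocks, false)
    }

    /// Bug surface: freeing registers the blocks under the prefix whether or
    /// not prefill ever wrote their KV entries.
    fn free(&mut self, prefix: &str, blocks: Vec<u32>) {
        self.cached.insert(prefix.to_string(), blocks);
    }
}

fn main() {
    let mut cache = PrefixCache { cached: HashMap::new(), next_block: 0 };

    // Blocks are allocated for a request's prefix...
    let (blocks, _) = cache.allocate("You are a helpful assistant.");

    // ...but the batch is rejected by the minimum-batch-size check, so the
    // blocks are freed before prefill ever ran.
    cache.free("You are a helpful assistant.", blocks);

    // The next request with the same prefix now hits the cache and reuses
    // blocks whose KV entries were never written: garbage at decode time.
    let (_, hit) = cache.allocate("You are a helpful assistant.");
    assert!(hit);
}

The change is deliberately a hotfix: rather than teaching the allocator to distinguish prefilled from non-prefilled blocks, it forces min_size to None, disabling the minimum-batch-size heuristic entirely (hence the now-unused _waiting_served_ratio parameter), with the TODO marking it for a proper fix later.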