From deec30f89307c7e51ccc609fb2d0ce1e920505b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 5 Sep 2024 15:09:29 +0200
Subject: [PATCH] hotfix: avoid non-prefilled block use when using prefix
 caching (#2489)

The minimum batch size logic could cause prefix blocks to be
deallocated before they had been prefilled. The next allocation of
the same prefix would then reuse those blocks, which contain garbage.
---
 backends/v3/src/backend.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index 05a26370..a47e62dc 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                     None
                 } else {
                     // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    // TODO: temporarily disabled to avoid incorrect deallocation and
+                    //       reallocation when using prefix caching.
+                    // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    None
                 };
 
                 let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);