From 02f0083c7ac8239420c633f8e4d5431323e43884 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 5 Sep 2024 12:04:29 +0000
Subject: [PATCH] hotfix: avoid non-prefilled block use when using prefix
 caching

The minimum batch size logic could cause prefix blocks to be
deallocated without ever being prefilled. The next allocation of the
same prefix would then reuse those blocks, which contain garbage.
---
 backends/v3/src/backend.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index 05a26370..a47e62dc 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    waiting_served_ratio: f32,
+    _waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
                     None
                 } else {
                     // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    // TODO: temporarily disabled to avoid incorrect deallocation and
+                    //       reallocation when using prefix caching.
+                    // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                    None
                 };
 
                 let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);