OlivierDehaene 2023-07-19 01:26:42 +02:00
parent 7f399cd848
commit 0a02801822
3 changed files with 2 additions and 5 deletions


@@ -349,7 +349,6 @@ async fn batching_task(
             }
             metrics::gauge!("tgi_batch_current_size", 0.0);
             metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-            let _ = client.clear_cache(None).await;
         }
     }
 }
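The deleted line stopped the router from issuing a clear-cache RPC to the Python shard every time the batching loop drained. For orientation, here is a sketch of the shard-side handler such a call would reach; the shape follows TGI's gRPC service, but treat the exact names (ClearCache, generate_pb2.ClearCacheResponse) as assumptions rather than a verbatim quote:

    # Sketch (assumed names) of the handler behind client.clear_cache(None).
    # Passing None for the id clears every cached batch.
    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.cache.delete(request.id)  # drop a single cached batch
        else:
            self.cache.clear()             # drop all cached batches
        return generate_pb2.ClearCacheResponse()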


@@ -29,8 +29,6 @@ class Cache:
         keys = list(self.cache.keys())
         for k in keys:
             self.delete(k)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
     def __len__(self):
         return len(self.cache.keys())
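After this hunk, Cache.clear() only drops the Python references to cached batches instead of also flushing the CUDA caching allocator. A minimal sketch of the resulting class, with assumed field shapes and the likely rationale as a comment (the rationale is inferred from the paged-KV-cache design, not stated in the commit):

    from typing import Dict
    
    class Cache:
        """Sketch of the batch cache after this change (fields assumed from context)."""
    
        def __init__(self):
            self.cache: Dict[int, object] = {}  # batch_id -> cached Batch
    
        def delete(self, batch_id: int):
            # Dropping the reference lets Python/CUDA reclaim the batch tensors.
            self.cache.pop(batch_id, None)
    
        def clear(self):
            # Releasing references is enough; calling torch.cuda.empty_cache()
            # here would also tear down allocator pools that the pre-allocated
            # KV-cache blocks rely on (inferred rationale).
            for k in list(self.cache.keys()):
                self.delete(k)
    
        def __len__(self):
            return len(self.cache.keys())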


@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
-        # 0.98 to add some wiggle room
+        # 0.99 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )
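The 0.98 → 0.99 bump halves the VRAM headroom kept free of the KV cache, from 2% to 1% of total GPU memory, so slightly more cache blocks fit. A worked example of the computation above, with made-up numbers (the sizes below are illustrative, not measured):

    # Illustrative numbers only; total_cache_size is the per-block KV-cache
    # footprint in bytes and peak_memory is what warmup measured.
    total_gpu_memory = 80 * 1024**3   # 80 GiB card
    peak_memory = 30 * 1024**3        # peak allocation observed during warmup
    total_cache_size = 2 * 1024**2    # 2 MiB per KV-cache block
    batch_blocks = 16                 # blocks already allocated for the warmup batch
    
    num_blocks = (
        int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
        + batch_blocks
    )
    print(num_blocks)  # 25206 with 0.99; 24796 with the old 0.98 factor

With these assumed numbers, the extra 1% of an 80 GiB card buys roughly 410 more KV-cache blocks.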