0.98

2025-09-10 20:04:52 +00:00 · 2023-07-19 02:06:16 +02:00 · 2023-07-19 02:06:16 +02:00 · 2934543a59
commit 2934543a59
parent 406b094002
1 changed file with 2 additions and 2 deletions
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -743,9 +743,9 @@ class FlashCausalLM(Model):

        total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory

-        # 0.985 to add some wiggle room
+        # 0.98 to add some wiggle room
        num_blocks = (
-            int((total_gpu_memory * 0.985 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
            # Add batch.blocks as we allocated it above, so it is included in the peak memory.
            + batch.blocks
        )