From 406b094002832cb483bfa5e6620c8fdaa4463b3b Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Wed, 19 Jul 2023 01:50:19 +0200
Subject: [PATCH] 0.985

---
 server/text_generation_server/models/flash_causal_lm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index ef7a1532..72c2403c 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
 
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
 
-        # 0.99 to add some wiggle room
+        # 0.985 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.985 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )
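
Note: the patch only lowers the GPU-memory headroom factor used to size the KV cache from 0.99 to 0.985. Below is a minimal standalone sketch of that calculation, assuming made-up values for peak_memory, total_cache_size, and batch.blocks (the variable names mirror the diff; the numbers are purely illustrative, not taken from the patch):

    # Sketch of the KV-cache block budget the patch adjusts.
    # Only the 0.985 headroom factor comes from the patch; all values below
    # are stand-ins chosen for illustration.
    TOTAL_GPU_MEMORY = 80 * 1024**3   # e.g. an 80 GB card
    peak_memory = 30 * 1024**3        # memory reserved after the warmup forward pass
    total_cache_size = 2 * 1024**2    # bytes needed per KV-cache block
    batch_blocks = 512                # blocks already allocated for the warmup batch

    # 0.985 leaves ~1.5% of the card unused as wiggle room against allocator
    # overhead and fragmentation; the previous factor of 0.99 left only ~1%.
    num_blocks = (
        int((TOTAL_GPU_MEMORY * 0.985 - peak_memory) // total_cache_size)
        # batch_blocks were allocated before measuring peak_memory, so add them back.
        + batch_blocks
    )
    print(f"KV-cache blocks that can be allocated: {num_blocks}")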