From 2934543a5958b62a59c395a813b501c322323922 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Wed, 19 Jul 2023 02:06:16 +0200
Subject: [PATCH] 0.98

---
 server/text_generation_server/models/flash_causal_lm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 72c2403c..517fba68 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
 
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
 
-        # 0.985 to add some wiggle room
+        # 0.98 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.985 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )