From 0a028018224b5e8a2e3621314504802f1a2083fb Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Wed, 19 Jul 2023 01:26:42 +0200
Subject: [PATCH] try 0.99

---
 router/src/infer.rs                                     | 1 -
 server/text_generation_server/cache.py                  | 2 --
 server/text_generation_server/models/flash_causal_lm.py | 4 ++--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/router/src/infer.rs b/router/src/infer.rs
index 395c048a..188ddc64 100644
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
@@ -349,7 +349,6 @@ async fn batching_task(
             }
             metrics::gauge!("tgi_batch_current_size", 0.0);
             metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-            let _ = client.clear_cache(None).await;
         }
     }
 }
diff --git a/server/text_generation_server/cache.py b/server/text_generation_server/cache.py
index bfe042bf..4504733e 100644
--- a/server/text_generation_server/cache.py
+++ b/server/text_generation_server/cache.py
@@ -29,8 +29,6 @@ class Cache:
         keys = list(self.cache.keys())
         for k in keys:
             self.delete(k)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
     def __len__(self):
         return len(self.cache.keys())
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 517fba68..ef7a1532 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
 
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
 
-        # 0.98 to add some wiggle room
+        # 0.99 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )
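
For context on the last hunk: the warmup step now keeps 1% of the card as headroom (0.99 of total GPU memory, up from 0.98), subtracts the peak memory observed during warmup, and divides by the per-block KV-cache size to get the number of cache blocks. A minimal sketch of that arithmetic, using made-up figures for a hypothetical 24 GB card (none of these numbers come from the patch):

    # Illustrative only: placeholder values, not measurements from TGI.
    total_gpu_memory = 24 * 1024**3   # as reported by torch.cuda.get_device_properties(device).total_memory
    peak_memory = 18 * 1024**3        # peak allocation observed during the warmup forward pass
    total_cache_size = 2 * 1024**2    # bytes per KV-cache block (layers * block size * 2 * dtype size)
    batch_blocks = 64                 # blocks already allocated for the warmup batch itself

    # Same shape as the patched expression: reserve 1% wiggle room, then count
    # how many whole blocks fit in what is left, re-adding the warmup blocks.
    num_blocks = int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size) + batch_blocks
    print(num_blocks)  # 3013 with the placeholder values above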