OlivierDehaene 2023-07-19 01:26:42 +02:00
parent 7f399cd848
commit 0a02801822
3 changed files with 2 additions and 5 deletions


@@ -349,7 +349,6 @@ async fn batching_task(
             }
             metrics::gauge!("tgi_batch_current_size", 0.0);
             metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-            let _ = client.clear_cache(None).await;
         }
     }
 }
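The deleted line stopped the router from issuing a clear-cache RPC to the Python shard every time the batching loop drained. For orientation, here is a sketch of the shard-side handler such a call would reach; the shape follows TGI's gRPC service, but treat the exact names (ClearCache, generate_pb2.ClearCacheResponse) as assumptions rather than a verbatim quote:

    # Sketch (assumed names) of the handler behind client.clear_cache(None).
    # Passing None for the id clears every cached batch.
    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.cache.delete(request.id)  # drop a single cached batch
        else:
            self.cache.clear()             # drop all cached batches
        return generate_pb2.ClearCacheResponse()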


@@ -29,8 +29,6 @@ class Cache:
         keys = list(self.cache.keys())
         for k in keys:
             self.delete(k)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
     def __len__(self):
         return len(self.cache.keys())
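After this hunk, Cache.clear() only drops the Python references to cached batches instead of also flushing the CUDA caching allocator. A minimal sketch of the resulting class, with assumed field shapes and the likely rationale as a comment (the rationale is inferred from the paged-KV-cache design, not stated in the commit):

    from typing import Dict
    
    class Cache:
        """Sketch of the batch cache after this change (fields assumed from context)."""
    
        def __init__(self):
            self.cache: Dict[int, object] = {}  # batch_id -> cached Batch
    
        def delete(self, batch_id: int):
            # Dropping the reference lets Python/CUDA reclaim the batch tensors.
            self.cache.pop(batch_id, None)
    
        def clear(self):
            # Releasing references is enough; calling torch.cuda.empty_cache()
            # here would also tear down allocator pools that the pre-allocated
            # KV-cache blocks rely on (inferred rationale).
            for k in list(self.cache.keys()):
                self.delete(k)
    
        def __len__(self):
            return len(self.cache.keys())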


@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
-        # 0.98 to add some wiggle room
+        # 0.99 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )
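The 0.98 → 0.99 bump halves the VRAM headroom kept free of the KV cache, from 2% to 1% of total GPU memory, so slightly more cache blocks fit. A worked example of the computation above, with made-up numbers (the sizes below are illustrative, not measured):

    # Illustrative numbers only; total_cache_size is the per-block KV-cache
    # footprint in bytes and peak_memory is what warmup measured.
    total_gpu_memory = 80 * 1024**3   # 80 GiB card
    peak_memory = 30 * 1024**3        # peak allocation observed during warmup
    total_cache_size = 2 * 1024**2    # 2 MiB per KV-cache block
    batch_blocks = 16                 # blocks already allocated for the warmup batch
    
    num_blocks = (
        int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
        + batch_blocks
    )
    print(num_blocks)  # 25206 with 0.99; 24796 with the old 0.98 factor

With these assumed numbers, the extra 1% of an 80 GiB card buys roughly 410 more KV-cache blocks.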