Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-10 20:04:52 +00:00
try 0.99
This commit is contained in:
parent 7f399cd848
commit 0a02801822
@@ -349,7 +349,6 @@ async fn batching_task(
             }
             metrics::gauge!("tgi_batch_current_size", 0.0);
             metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-            let _ = client.clear_cache(None).await;
         }
     }
 }

@@ -29,8 +29,6 @@ class Cache:
         keys = list(self.cache.keys())
         for k in keys:
             self.delete(k)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
     def __len__(self):
         return len(self.cache.keys())
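
For context, a minimal self-contained sketch of how the Cache class plausibly reads after this hunk; the __init__/pop/delete helpers are assumptions inferred from the surrounding lines, not copied from the upstream file. The point is that clear() now only drops cached batches and no longer calls torch.cuda.empty_cache():

# Sketch only: helper methods are assumed; clear()'s body matches the diff above.
from typing import Any, Dict, Optional


class Cache:
    def __init__(self):
        # batch_id -> cached batch object (a Batch in the real server)
        self.cache: Dict[int, Any] = {}

    def pop(self, batch_id: int) -> Optional[Any]:
        return self.cache.pop(batch_id, None)

    def delete(self, batch_id: int):
        batch = self.pop(batch_id)
        if batch is not None:
            del batch

    def clear(self):
        # Drop every cached batch; the explicit torch.cuda.empty_cache() call is gone.
        keys = list(self.cache.keys())
        for k in keys:
            self.delete(k)

    def __len__(self):
        return len(self.cache.keys())
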
@@ -743,9 +743,9 @@ class FlashCausalLM(Model):
 
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
 
-        # 0.98 to add some wiggle room
+        # 0.99 to add some wiggle room
         num_blocks = (
-            int((total_gpu_memory * 0.98 - peak_memory) // total_cache_size)
+            int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
             # Add batch.blocks as we allocated it above, so it is included in the peak memory.
             + batch.blocks
         )
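
The arithmetic in that hunk is the whole change: the number of KV-cache blocks is derived from total GPU memory scaled by the wiggle-room factor (now 0.99 instead of 0.98), minus the peak memory observed while running the warmup batch, divided by the size of one cache block, plus the blocks already allocated for that warmup batch. A standalone sketch of the formula follows, with made-up inputs for illustration; in the server the real values come from torch.cuda device properties, the measured memory peak, and the model's cache-block size:

# Sketch of the block-count formula from the hunk above; inputs are illustrative only.
def compute_num_blocks(
    total_gpu_memory: int,     # bytes available on the device
    peak_memory: int,          # peak bytes observed during the warmup forward pass
    total_cache_size: int,     # bytes required for one KV-cache block
    warmup_batch_blocks: int,  # blocks already allocated for the warmup batch
) -> int:
    # 0.99 to add some wiggle room
    return (
        int((total_gpu_memory * 0.99 - peak_memory) // total_cache_size)
        # Add the warmup batch's blocks as they were allocated above,
        # so they are already counted in the peak memory.
        + warmup_batch_blocks
    )


if __name__ == "__main__":
    # e.g. an 80 GB card, 30 GB peak during warmup, 2 MiB per block, 16 warmup blocks
    print(compute_num_blocks(80 * 1024**3, 30 * 1024**3, 2 * 1024**2, 16))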