Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-10 20:04:52 +00:00
add syncs

parent 160a50af77
commit 1686a7c0dc
@@ -733,7 +733,7 @@ class FlashCausalLM(Model):
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
-        torch.cuda.synchronize()
+        torch.cuda.synchronize(self.device)
         peak_memory = torch.cuda.max_memory_allocated(self.device)
 
         dtype_size = torch.tensor([], dtype=self.dtype).element_size()
@@ -755,6 +755,7 @@ class FlashCausalLM(Model):
         del CACHE_MANAGER
         del batch
         torch.cuda.empty_cache()
+        torch.cuda.synchronize(self.device)
 
         CACHE_MANAGER = CacheManager(
             num_blocks,
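Both hunks apply the same principle: torch.cuda.synchronize(self.device) forces queued CUDA work on the model's device to finish, so the allocator statistics read immediately afterwards reflect the completed warmup pass rather than kernels still in flight. Below is a minimal sketch of that measure-then-reallocate pattern; the memory_fraction parameter and the block arithmetic are illustrative assumptions, not values taken from this commit.

import torch

def estimate_free_blocks(device: torch.device, block_bytes: int,
                         memory_fraction: float = 0.9) -> int:
    # Wait for all queued kernels on this device so the peak-memory
    # counter below reflects the finished warmup pass (the fix in
    # the first hunk: a device-scoped synchronize before measuring).
    torch.cuda.synchronize(device)
    peak_memory = torch.cuda.max_memory_allocated(device)

    # memory_fraction is an assumed budget, not the repository's value.
    total_memory = torch.cuda.get_device_properties(device).total_memory
    free_memory = int(total_memory * memory_fraction) - peak_memory

    # Release cached allocations before the KV cache claims the space,
    # and sync again so reallocation starts from a settled state (the
    # extra synchronize added by the second hunk).
    torch.cuda.empty_cache()
    torch.cuda.synchronize(device)

    return max(free_memory // block_bytes, 0)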