diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index cbdf4808..2e7c41d9 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -659,6 +659,7 @@ class CausalLM(Model): # We finished all generations in the batch; there is no next batch if stopped: + torch.cuda.empty_cache() return generations, None # Slice unused values from prefill