Mirror of https://github.com/huggingface/text-generation-inference.git
Fixing bug in mllama.
parent 9b50bada65
commit d239884b8e
@@ -256,12 +256,6 @@ class MllamaCausalLM(VlmCausalLM):
         max_s = batch.max_current_length
         lm_head_indices = batch.prefill_head_indices
 
-        if cu_seqlen_prefill is None and self.max_past() is not None:
-            # In decode, not prefill, we're actually overwriting the KV-cache
-            # in a circular buffer mode.
-            # This makes sure the max_s for the decode pass is correct.
-            max_s = min(self.max_past(), max_s)
-
         # Try to find an associated cuda graph
         bs = input_ids.shape[0]
         sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
@@ -356,9 +350,9 @@ class MllamaCausalLM(VlmCausalLM):
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )
 
         with self._forward_context(
             block_tables=cuda_graph["block_tables"],
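
For context on the block removed in the first hunk: with a sliding-window model, the decode pass overwrites the KV cache like a circular buffer, so only the last max_past() positions are still resident and the max_s handed to attention has to be clamped to that window. A minimal standalone sketch of that idea follows; it is not TGI code, and window_size, write_kv, and attention_span are hypothetical stand-ins for self.max_past() and the cache/attention plumbing.

# Sketch only: why max_s is clamped to the window when the KV cache
# is written as a circular buffer during decode. Names are hypothetical.
import torch

window_size = 4  # plays the role of self.max_past()

# One head, head_dim=2, circular KV cache with window_size slots.
k_cache = torch.zeros(window_size, 2)


def write_kv(position: int, k: torch.Tensor) -> None:
    # Store the key for `position` in a circular buffer: entries older
    # than the window are overwritten instead of growing the cache.
    k_cache[position % window_size] = k


def attention_span(current_length: int) -> int:
    # During decode only the last window_size positions survive in the
    # cache, so the effective max sequence length is clamped.
    return min(window_size, current_length)


if __name__ == "__main__":
    for pos in range(7):  # generate 7 tokens, overwriting old slots
        write_kv(pos, torch.full((2,), float(pos)))

    # The max_s fed to the attention kernel must not exceed what the
    # cache actually holds.
    print(attention_span(current_length=7))  # -> 4, not 7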
|