Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-09 03:14:53 +00:00
qwen2 sliding window fix, mllama does not contain sliding window
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent 800281113f
commit 99323542f0
@@ -62,7 +62,9 @@ class Qwen2Attention(torch.nn.Module):
     ):
         super().__init__()
         self.max_past = (
-            config.sliding_window if config.sliding_window is not None else -1
+            config.sliding_window
+            if config.use_sliding_window and config.sliding_window is not None
+            else -1
         )
         self.num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
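Note: Qwen2-family configs can set `sliding_window` while disabling it via `use_sliding_window`. A minimal sketch of the behavior change, using a hypothetical `SimpleNamespace` stand-in for the Hugging Face config rather than the real class:

from types import SimpleNamespace

# Hypothetical config: a window length is present, but the flag disables it.
config = SimpleNamespace(sliding_window=32768, use_sliding_window=False)

# Before this change: the flag was ignored and the window was always applied.
old_max_past = config.sliding_window if config.sliding_window is not None else -1

# After this change: the window is only applied when the config opts in.
new_max_past = (
    config.sliding_window
    if config.use_sliding_window and config.sliding_window is not None
    else -1
)

print(old_max_past, new_max_past)  # 32768 -1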
@@ -1510,6 +1510,8 @@ class FlashCausalLM(Model):
 
         if getattr(config, "sliding_window", None) is None:
             config.sliding_window = None
+        if getattr(config, "use_sliding_window", True) is False:
+            config.sliding_window = None
 
         self.num_layers = config.num_hidden_layers
         self.num_heads = config.num_attention_heads // self.process_group.size()
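For illustration, the two guards above pulled into a standalone helper (a sketch following the same logic, not code from this repo); it normalizes `config.sliding_window` to None whenever the window is missing or disabled, so later code only has to test for None:

from types import SimpleNamespace

def normalize_sliding_window(config):
    # Missing window -> None; window present but disabled via the flag -> None.
    if getattr(config, "sliding_window", None) is None:
        config.sliding_window = None
    if getattr(config, "use_sliding_window", True) is False:
        config.sliding_window = None
    return config

# Hypothetical configs for illustration.
no_window = normalize_sliding_window(SimpleNamespace())
enabled = normalize_sliding_window(SimpleNamespace(sliding_window=4096))
disabled = normalize_sliding_window(
    SimpleNamespace(sliding_window=4096, use_sliding_window=False)
)
print(no_window.sliding_window, enabled.sliding_window, disabled.sliding_window)
# None 4096 None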
@@ -1059,17 +1059,6 @@ class FlashVlmCausalLM(FlashCausalLM):
             # This makes sure the max_s for the decode pass is correct.
             max_s = min(self.max_past(), max_s)
 
-        kwargs = {}
-        if htorch.utils.internal.is_lazy():
-            batch_size = input_lengths.shape[0]
-            seqlen = (
-                input_ids.shape[0] // batch_size
-                if batch.prefilling
-                else batch.hpu_attn_meta.block_list.shape[0]
-            )
-            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
-                batch.prefilling, seqlen, batch_size
-            )
         if batch.prefill_cache_indices is not None:
             slots_pad = torch.zeros_like(input_ids, device=slots.device)
             slots_pad[batch.prefill_cache_indices] = slots
@@ -1082,6 +1071,26 @@ class FlashVlmCausalLM(FlashCausalLM):
         seqlen = Seqlen(
             input_lengths=_async_h2d_tensor_copy(input_lengths),
         )
+        kwargs = {}
+        batch_size = input_lengths.shape[0]
+        prompt_len = (
+            input_ids.shape[0] // batch_size
+            if batch.prefilling
+            else batch.hpu_attn_meta.block_list.shape[0]
+        )
+        if htorch.utils.internal.is_lazy():
+            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
+                batch.prefilling, prompt_len, batch_size
+            )
+        if self.sliding_window is not None:
+            attn_mask = seqlen.make_sliding_window_bias(
+                input_lengths.tolist(),
+                self.sliding_window,
+                self.dtype,
+                prompt_len,
+                batch_size,
+            )
+            seqlen.attn_mask = _async_h2d_tensor_copy(attn_mask)
         logits, speculative_logits = self.model.forward(
             inputs_embeds=inputs_embeds,
             position_ids=_async_h2d_tensor_copy(position_ids),
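The real mask comes from `Seqlen.make_sliding_window_bias` in this repo; as a rough, self-contained sketch of the idea only (shapes, batching, and dtype handling simplified), a causal sliding-window mask can be expressed as an additive bias where query position i may attend key positions j with i - window < j <= i:

import torch

def sliding_window_bias(seq_len: int, window: int, dtype=torch.float32) -> torch.Tensor:
    # Additive attention bias: 0.0 where attention is allowed, -inf where masked.
    # Causal + sliding window: query i attends keys j with i - window < j <= i.
    i = torch.arange(seq_len).unsqueeze(-1)  # query positions as a column
    j = torch.arange(seq_len).unsqueeze(0)   # key positions as a row
    allowed = (j <= i) & (j > i - window)
    bias = torch.zeros(seq_len, seq_len, dtype=dtype)
    return bias.masked_fill(~allowed, float("-inf"))

print(sliding_window_bias(5, window=2))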
@@ -282,43 +282,6 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             block_mapping=None,
             attn_bias=None,
         )
-        if self.sliding_window is not None:
-            block_tables_in_window = []
-            for i, bt in enumerate(block_tables):
-                block_num_in_window = (
-                    self.sliding_window + BLOCK_SIZE - 1
-                ) // BLOCK_SIZE
-                block_tables_in_window.append(
-                    bt[max(0, blocks[i] - block_num_in_window) : blocks[i]]
-                )
-            slots_in_window = []
-            start_idx = 0
-            for i, indice in enumerate(slot_indices):
-                mask = (
-                    indice - torch.arange(start_idx, indice + 1)
-                ) < self.sliding_window
-                slots_in_window.append(torch.arange(start_idx, indice + 1)[mask])
-                start_idx += blocks[i] * BLOCK_SIZE
-            slots_in_window = torch.cat(slots_in_window, dim=0)
-            (
-                block_list_in_window,
-                block_groups_in_window,
-                block_usage_in_window,
-                slots_in_window_mask,
-                _,
-            ) = generate_block_metadata(
-                self.dtype,
-                self.use_contiguous_pa,
-                slots,
-                block_tables_in_window,
-                self.bucketing_ctx,
-                slots_in_window,
-                block_bucket_size,
-            )
-            meta.block_list_in_window = _async_h2d_tensor_copy(block_list_in_window)
-            meta.block_groups_in_window = _async_h2d_tensor_copy(block_groups_in_window)
-            meta.block_usage_in_window = _async_h2d_tensor_copy(block_usage_in_window)
-            meta.slots_in_window_mask = _async_h2d_tensor_copy(slots_in_window_mask)
 
         hpu_attention_meta = trim_attn_metadata(meta)
         # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
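Per the commit title, Mllama has no sliding window, so the branch removed above never applied to it. For reference, that code sized the in-window block table by ceiling division over the paged-KV block size; a quick worked example with hypothetical numbers (neither value is taken from this repo):

BLOCK_SIZE = 128        # hypothetical paged-attention block size
sliding_window = 1000   # hypothetical window length in tokens

# Ceiling division: smallest number of blocks that covers the window.
block_num_in_window = (sliding_window + BLOCK_SIZE - 1) // BLOCK_SIZE
print(block_num_in_window)  # 8, since 7 * 128 = 896 < 1000 <= 8 * 128 = 1024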