Fix normal attention

2025-09-10 03:44:54 +00:00 · 2023-05-29 12:10:17 +02:00 · 2023-05-29 12:10:17 +02:00 · 12ab24ae64
commit 12ab24ae64
parent 63a18c1414
1 changed files with 4 additions and 4 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@@ -125,12 +125,12 @@ class FlashRWAttention(torch.nn.Module):

        # Split query from key_value
        query, kv = qkv.split(
-            [self.head_size * self.num_heads, 2 * self.head_size], dim=1
+            [self.head_size * self.num_heads, 2 * self.head_size * self.num_heads_kv], dim=1
        )

        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
-        kv = kv.view(-1, 2, 1, self.head_size)
+        kv = kv.view(-1, 2, self.num_heads_kv, self.head_size)

        # Inplace rotary
        self.rotary_emb(query, cos, sin)
@@ -141,7 +141,7 @@ class FlashRWAttention(torch.nn.Module):
            # Copy to layer past
            layer_past[...] = kv
            # Expand to query shape
-            kv = kv.expand(-1, 2, query.shape[1], self.head_size)
+            kv = kv.expand(-1, 2, self.num_heads, self.head_size)

            # output
            attn_output = torch.empty_like(query)
@@ -168,7 +168,7 @@ class FlashRWAttention(torch.nn.Module):
            # Add present to the layer_past tensor at the correct indices
            layer_past[layer_past_present_indices] = kv
            # Expand to query shape
-            kv = layer_past.expand(-1, 2, query.shape[1], self.head_size)
+            kv = layer_past.expand(-1, 2, self.num_heads, self.head_size)

            # output
            attn_output = torch.empty_like(query)