Fixing falcon.

2025-09-11 20:34:54 +00:00 · 2024-05-29 18:34:34 +00:00 · 2024-05-29 18:34:34 +00:00 · cf59593454
commit cf59593454
parent a76e650283
1 changed file with 3 additions and 5 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@@ -198,9 +198,7 @@ class FlashRWAttention(torch.nn.Module):
        # Inplace rotary
        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
-        paged_attention.reshape_and_cache(
-            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
-        )
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
        # output
        attn_output = torch.empty_like(query)
@@ -208,7 +206,7 @@ class FlashRWAttention(torch.nn.Module):
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
-            flash_attn.attention(
+            attention(
                query,
                torch.select(kv, dim=1, index=0),
                torch.select(kv, dim=1, index=1),
@@ -219,7 +217,7 @@ class FlashRWAttention(torch.nn.Module):
            )
        # Decode
        else:
-            paged_attention.attention(
+            paged_attention(
                attn_output,
                query,
                kv_cache[0],