Mirror of https://github.com/huggingface/text-generation-inference.git
Address comments + fix 2nd path in falcon.
parent c67539fbcc
commit d44688b6ac
@@ -9,7 +9,6 @@ _PARTITION_SIZE = 512
 
 use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"}
 ENGINE = "triton" if use_triton else "ck"
-from .flash_attn_triton import triton_attention
 
 try:
     from vllm._C import cache_ops
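
For orientation, here is a minimal standalone sketch of the engine selection this file performs at import time; the `select_engine` helper is hypothetical, but the env-var parsing mirrors the `use_triton` / `ENGINE` lines in the hunk above.

```python
import os


def select_engine() -> str:
    # Mirror of the module-level selection above: the Triton flash-attention
    # path is opted into via ROCM_USE_FLASH_ATTN_V2_TRITON, otherwise the
    # Composable Kernel ("ck") path is used.
    use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"}
    return "triton" if use_triton else "ck"


if __name__ == "__main__":
    print(f"flash-attention engine: {select_engine()}")
```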
@@ -122,14 +121,12 @@ def paged_attention(
         )
 
 
-try:
-    import flash_attn_2_cuda
-
-    if ENGINE == "triton":
-        logger.info("ROCm: using Flash Attention 2 Triton implementation.")
-    elif ENGINE == "ck":
-        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
-except ImportError:
-    try:
-        import flash_attn_cuda
+if ENGINE != "triton":
+    try:
+        import flash_attn_2_cuda
+
+        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
+    except ImportError:
+        try:
+            import flash_attn_cuda
 
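
A hedged sketch of the guard this hunk introduces: the Composable Kernel (CK) extensions are only probed when the Triton engine was not requested, with a fallback from `flash_attn_2_cuda` to `flash_attn_cuda`. The module names come from the diff; the `HAS_FLASH_ATTN_V2` flag, the logging setup, and the fallback messages are illustrative assumptions.

```python
import logging

logger = logging.getLogger(__name__)

ENGINE = "ck"  # stand-in for the env-var based selection shown earlier

# Only probe the Composable Kernel extensions when Triton was not requested;
# otherwise these modules may not even be installed.
HAS_FLASH_ATTN_V2 = False  # illustrative flag, not necessarily the repo's name
if ENGINE != "triton":
    try:
        import flash_attn_2_cuda  # noqa: F401

        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
        HAS_FLASH_ATTN_V2 = True
    except ImportError:
        try:
            import flash_attn_cuda  # noqa: F401

            logger.info("ROCm: falling back to Flash Attention v1.")
        except ImportError as e:
            logger.warning("No flash-attention extension available: %s", e)
```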
@@ -158,7 +155,7 @@ except ImportError:
                     f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
                 )
             raise ImportError(
-                f"AMD GPU with Rocm capability {major} {minor} is not supported"
+                f"AMD GPU with ROCm capability {major} {minor} is not supported"
             ) from e
 
 
@@ -180,7 +177,7 @@ if ENGINE == "ck":
             raise ValueError("`window_size_left` must be > 0 or -1")
         if window_size_left != -1:
             raise ValueError(
-                f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
+                f"ROCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
             )
         return flash_attn_2_cuda.varlen_fwd(
             q,
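
The two `raise ValueError` branches above amount to a small precondition check on `window_size_left`; a self-contained sketch of that validation follows (written as a free function here, with the first guard's condition inferred from its error message, so treat it as an approximation rather than the repo's exact code).

```python
def check_window_size_left(window_size_left: int) -> None:
    # Same precondition as the CK attention wrapper above: the only accepted
    # value on ROCm is the sentinel -1, i.e. sliding-window attention disabled.
    if window_size_left <= 0 and window_size_left != -1:
        raise ValueError("`window_size_left` must be > 0 or -1")
    if window_size_left != -1:
        raise ValueError(
            f"ROCm version of Flash Attention v2 does not support window attention "
            f"(window_size_left != -1, got window_size_left={window_size_left})."
        )


check_window_size_left(-1)  # passes
# check_window_size_left(4096) would raise: window attention is unsupported on ROCm
```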
@@ -205,6 +202,7 @@ if ENGINE == "ck":
         )
 
 elif ENGINE == "triton":
+    from .flash_attn_triton import triton_attention
 
     def attention(
         q,
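
These hunks sit in a module-level `if ENGINE == "ck": ... elif ENGINE == "triton": ...` block that defines one `attention` entry point per backend at import time, so callers never branch on the engine themselves. A minimal sketch of that pattern, with placeholder bodies standing in for the real `flash_attn_2_cuda.varlen_fwd` and `triton_attention` calls:

```python
ENGINE = "triton"  # stand-in for the env-var based selection at the top of the file

if ENGINE == "ck":

    def attention(q, k, v, *args, **kwargs):
        # Placeholder: the real CK path calls flash_attn_2_cuda.varlen_fwd(...).
        raise NotImplementedError("CK flash-attention stub")

elif ENGINE == "triton":

    def attention(q, k, v, *args, **kwargs):
        # Placeholder: the real Triton path calls triton_attention(...) from
        # .flash_attn_triton (the import added in the hunk above).
        raise NotImplementedError("Triton flash-attention stub")

else:
    raise RuntimeError(f"Unknown flash-attention engine: {ENGINE}")
```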
@@ -198,9 +198,7 @@ class FlashRWAttention(torch.nn.Module):
         # Inplace rotary
         self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
 
-        paged_attention.reshape_and_cache(
-            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
-        )
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
 
         # output
         attn_output = torch.empty_like(query)
@@ -208,7 +206,7 @@ class FlashRWAttention(torch.nn.Module):
         # Prefill
         if cu_seqlen_prefill is not None:
             # flash attention
-            flash_attn.attention(
+            attention(
                 query,
                 torch.select(kv, dim=1, index=0),
                 torch.select(kv, dim=1, index=1),
@@ -219,7 +217,7 @@ class FlashRWAttention(torch.nn.Module):
             )
         # Decode
         else:
-            paged_attention.attention(
+            paged_attention(
                 attn_output,
                 query,
                 kv_cache[0],
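
The three falcon hunks above switch `FlashRWAttention` from namespaced calls (`paged_attention.reshape_and_cache`, `flash_attn.attention`, `paged_attention.attention`) to the flat helpers `reshape_and_cache`, `attention`, and `paged_attention`. Below is a hedged sketch of the resulting prefill/decode dispatch; the helpers are stubs so the example is self-contained, and the argument lists are abbreviated rather than the real signatures.

```python
import torch


# Stubs standing in for the flat helpers the falcon module now imports directly.
def reshape_and_cache(key, value, key_cache, value_cache, slots):
    """Write the new key/value pages into the paged KV cache (stub)."""


def attention(query, key, value, attn_output, *extra):
    """Dense flash attention over the prefill tokens (stub)."""


def paged_attention(attn_output, query, key_cache, value_cache, *extra):
    """Paged attention against the cached keys/values during decode (stub)."""


def rw_attention_dispatch(query, kv, kv_cache, slots, cu_seqlen_prefill, *extra):
    # Always cache the freshly computed K/V so later decode steps can read them.
    reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)

    attn_output = torch.empty_like(query)
    if cu_seqlen_prefill is not None:
        # Prefill: dense flash attention over the whole prompt.
        attention(query, kv[:, 0], kv[:, 1], attn_output, cu_seqlen_prefill, *extra)
    else:
        # Decode: paged attention over the KV cache, one new token per sequence.
        paged_attention(attn_output, query, kv_cache[0], kv_cache[1], *extra)
    return attn_output
```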