mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 12:54:52 +00:00)
fix: attempt forward on flash attn2 to check hardware support
parent 53aec27328
commit 4b1005c7e1
@@ -173,6 +173,41 @@ def paged_attention(
 try:
     import flash_attn_2_cuda
+
+    # try forwarding to see if it works with all dummy inputs
+    batch_size = 1
+    num_heads = 1
+    head_dim = 1
+    seqlen = 1
+
+    try:
+        flash_attn_2_cuda.varlen_fwd(
+            torch.zeros(batch_size, num_heads, seqlen, head_dim),  # q
+            torch.zeros(batch_size, num_heads, seqlen, head_dim),  # k
+            torch.zeros(batch_size, num_heads, seqlen, head_dim),  # v
+            None,  # out (optional)
+            torch.zeros(batch_size + 1, dtype=torch.int32),  # cu_seqlens_q
+            torch.zeros(batch_size + 1, dtype=torch.int32),  # cu_seqlens_k
+            None,  # alibi_slopes (optional)
+            None,  # q_padded (optional)
+            None,  # k_padded (optional)
+            None,  # v_padded (optional)
+            seqlen,  # max_seqlen_q
+            seqlen,  # max_seqlen_k
+            1.0,  # softmax_scale
+            0.0,  # softmax_lse (default value)
+            False,  # is_causal
+            True,  # return_softmax
+            -1,  # window_size_left
+            -1,  # window_size_right
+            0.0,  # softmax_softcap
+            False,  # deterministic
+            None,  # rng_state (optional)
+        )
+    except RuntimeError as e:
+        raise ImportError(
+            "Flash Attention V2 is not supported on this machine. " f"Error: {e}"
+        ) from e
+
     V2 = True
 except ImportError:
     try:
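The idea behind the change is small enough to sketch in isolation: importing a compiled extension can succeed on a machine whose GPU the kernels were not built for, so the only reliable check is to run one dummy forward and convert a runtime failure into an ImportError, which the surrounding `except ImportError:` fallback already handles. The sketch below is illustrative only; the helper name `ensure_kernel_runs` and the toy `fake_varlen_fwd` are assumptions for demonstration, not TGI code, whereas the actual change inlines the probe with a direct call to `flash_attn_2_cuda.varlen_fwd` as shown in the diff above.

    # Illustrative sketch of the probe-by-forward pattern (not TGI's actual code).
    def ensure_kernel_runs(forward, *dummy_args):
        """Call `forward` once with dummy inputs; surface runtime failures as
        ImportError so an existing `except ImportError:` fallback still catches them."""
        try:
            forward(*dummy_args)
        except RuntimeError as e:
            raise ImportError(
                f"Kernel imported but is not usable on this machine. Error: {e}"
            ) from e


    if __name__ == "__main__":
        # Toy stand-in for a compiled attention kernel built for a newer GPU.
        def fake_varlen_fwd(*args):
            raise RuntimeError("FlashAttention only supports Ampere GPUs or newer.")

        try:
            ensure_kernel_runs(fake_varlen_fwd, None)
        except ImportError as e:
            print(f"Falling back to Flash Attention V1: {e}")

Re-raising as ImportError (rather than letting the RuntimeError escape) is what lets the existing `except ImportError:` branch in the diff fall through to the Flash Attention V1 path without any changes to the fallback logic.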