fix a regression in ipex flashattention

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Wang, Yi A 2024-08-06 22:00:36 -07:00
parent 30e70f2ceb
commit 6abcab843d


@@ -21,7 +21,7 @@ def attention(
     out = torch.empty_like(q)
     # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
-    return ipex.llm.functional.varlen_attention(
+    ipex.llm.functional.varlen_attention(
         q,
         k,
         v,
@@ -38,6 +38,8 @@ def attention(
         None,
     )
+    return out
 def reshape_and_cache(
     key: torch.Tensor,
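
Note on the pattern (not part of the commit): the fix assumes that ipex.llm.functional.varlen_attention writes the attention result into the preallocated out buffer and does not return that tensor, so returning the call's result directly was the regression; the caller has to return out itself. Below is a minimal, self-contained sketch of the same pattern. fake_varlen_attention and the tensor shapes are hypothetical stand-ins for illustration only, not the IPEX API.

import torch


def fake_varlen_attention(q, k, v, out):
    # Hypothetical stand-in for an in-place kernel such as
    # ipex.llm.functional.varlen_attention: it fills `out` and
    # does not return the result tensor.
    out.copy_(torch.softmax(q @ k.transpose(-1, -2), dim=-1) @ v)


def attention(q, k, v):
    out = torch.empty_like(q)
    fake_varlen_attention(q, k, v, out)
    # Return the buffer we allocated, not the kernel's return value
    # (which may be None), mirroring the fix in this commit.
    return out


if __name__ == "__main__":
    q = torch.randn(2, 4, 8)
    k = torch.randn(2, 4, 8)
    v = torch.randn(2, 4, 8)
    print(attention(q, k, v).shape)  # torch.Size([2, 4, 8])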