softcap default -1.0

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-09-09 19:34:53 +00:00 · 2025-04-13 20:02:05 -07:00 · 2025-04-13 20:02:05 -07:00 · ce8548f5c4
commit ce8548f5c4
parent f8c8c3d397
1 changed file with 2 additions and 0 deletions
--- a/server/text_generation_server/layers/attention/ipex.py
+++ b/server/text_generation_server/layers/attention/ipex.py
@@ -39,6 +39,8 @@ def attention(
    # We do not need to check window_size_left (not supported) here, since it is already checked ahead of time at model load.
    if ATTENTION == "flashdecoding-ipex":
        window_size_right = -1 if window_size_left == -1 else 0
        if softcap is None:
            softcap = -1.0
        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
            out,
            query.contiguous() if query.device.type == "xpu" else query,