From ce8548f5c4d49f020e102c9a7c2200244ce29a13 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Sun, 13 Apr 2025 20:02:05 -0700
Subject: [PATCH] softcap default -1.0

Signed-off-by: Wang, Yi A
---
 server/text_generation_server/layers/attention/ipex.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py
index 31b745f0..36ef2efc 100644
--- a/server/text_generation_server/layers/attention/ipex.py
+++ b/server/text_generation_server/layers/attention/ipex.py
@@ -39,6 +39,8 @@ def attention(
     # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
     if ATTENTION == "flashdecoding-ipex":
         window_size_right = -1 if window_size_left == -1 else 0
+    if softcap is None:
+        softcap = -1.0
     ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
         out,
         query.contiguous() if query.device.type == "xpu" else query,
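
Rationale: ipex.llm.modules.PagedAttention.flash_attn_varlen_func takes softcap as a
plain float, while upstream attention callers pass softcap=None when a model does not
use attention logit soft-capping; normalizing None to -1.0 before the kernel call keeps
the argument well-typed and, per this patch, -1.0 acts as the "soft-capping disabled"
sentinel. A minimal sketch of the convention the patch relies on (the apply_softcap
helper and its "non-positive means disabled" check are illustrative, not IPEX API):

    import torch

    # Hypothetical helper: a positive softcap squashes attention logits into
    # (-softcap, softcap) via softcap * tanh(logits / softcap); a non-positive
    # sentinel (here -1.0, matching the patch) leaves logits untouched.
    def apply_softcap(scores: torch.Tensor, softcap: float = -1.0) -> torch.Tensor:
        if softcap <= 0.0:  # sentinel: soft-capping disabled
            return scores
        return softcap * torch.tanh(scores / softcap)

    # Usage sketch: normalize None the same way the patch does before the
    # kernel call, so the float-typed argument is always well-defined.
    softcap = None
    if softcap is None:
        softcap = -1.0
    scores = torch.randn(4, 4)
    capped = apply_softcap(scores, softcap)

Models that enable soft-capping (e.g. Gemma 2 via attn_logit_softcapping) supply a
positive value and take the tanh path; all other models flow through the disabled path
unchanged.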