clamp: return self directly on ipex, as is already done for rocm

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-09-11 20:34:54 +00:00 · 2024-10-10 23:02:56 -07:00 · 2024-10-10 23:02:56 -07:00 · b392362e9e
commit b392362e9e
parent f213012b08
2 changed files with 7 additions and 17 deletions
--- a/server/text_generation_server/layers/attention/common.py
+++ b/server/text_generation_server/layers/attention/common.py
@@ -66,7 +66,7 @@ else:
        max_k: int

        def clamp(self, max):
-            if SYSTEM == "rocm":
+            if SYSTEM == "rocm" or SYSTEM == "ipex":
                return self
            raise NotImplementedError("Not implemented seqlen for paged")
            return Seqlen(torch.clamp(self.input_lengths, max=max))
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -46,7 +46,7 @@ from text_generation_server.models.globals import (
    TGI_WIGGLE_ROOM,
    get_adapter_to_index,
 )
-from text_generation_server.layers.attention import KVCache, Seqlen, SUPPORTS_WINDOWING
+from text_generation_server.layers.attention import KVCache, Seqlen
 from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
 from text_generation_server.utils.dist import MEMORY_FRACTION
 from text_generation_server.utils.quantization import get_loader
@@ -993,21 +993,6 @@ class FlashCausalLM(Model):
        )

        prefix = ""
-        if getattr(config, "sliding_window", None) is not None and SUPPORTS_WINDOWING:
-            set_sliding_window(config.sliding_window)
-        else:
-            config.sliding_window = None
-
-        text_config = getattr(config, "text_config", None)
-        if text_config:
-            if (
-                getattr(text_config, "sliding_window", None) is not None
-                and SUPPORTS_WINDOWING
-            ):
-                set_sliding_window(text_config.sliding_window)
-            else:
-                text_config.sliding_window = None
-
        model = model_class(prefix, config, weights)
        torch.distributed.barrier(group=self.process_group)

@@ -1016,6 +1001,11 @@ class FlashCausalLM(Model):
        if text_config is not None:
            config = text_config

+        if getattr(config, "sliding_window", None) is not None:
+            set_sliding_window(config.sliding_window)
+        else:
+            config.sliding_window = None
+
        self.num_layers = config.num_hidden_layers
        self.num_heads = config.num_attention_heads // self.process_group.size()
        # Validation is done in the model itself