Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 13:52:07 +00:00)
ipex on CPU could also be supported in these functions
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
commit 74ad8ed300
parent 50282e3cc1
@@ -201,9 +201,8 @@ except ImportError as e:
 if MAMBA_AVAILABLE:
     __all__.append(Mamba)
 
-FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or (
-    hasattr(torch, "xpu") and torch.xpu.is_available()
-)
+FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or SYSTEM == "ipex"
 
 try:
     from text_generation_server.models.transformers_flash_causal_lm import (
         TransformersFlashCausalLM,
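For context on the new condition: FLASH_TRANSFORMERS_BACKEND is now enabled whenever CUDA is available or SYSTEM resolves to "ipex", which covers IPEX on both XPU and CPU. The sketch below only illustrates that check; the detect_system helper is hypothetical, and the real SYSTEM value comes from text_generation_server.utils.import_utils.

import torch

# Hypothetical helper, only to illustrate the values SYSTEM can take.
# The real detection lives in text_generation_server.utils.import_utils.
def detect_system() -> str:
    if torch.cuda.is_available():
        return "cuda"
    try:
        import intel_extension_for_pytorch  # noqa: F401  # IPEX installed (XPU or CPU build)
        return "ipex"
    except ImportError:
        return "cpu"

SYSTEM = detect_system()

# Condition from the hunk above: the flash transformers backend is enabled
# on CUDA, or whenever IPEX is installed, even without an XPU device.
FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or SYSTEM == "ipex"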
@@ -12,7 +12,7 @@ from text_generation_server.utils import initialize_torch_distributed
 from text_generation_server.layers.attention import paged_attention, attention, Seqlen
 from text_generation_server.layers.attention.kv_cache import KVScales, KVCache
 from text_generation_server.models.globals import ATTENTION
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -115,8 +115,11 @@ class TransformersFlashCausalLM(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
-        elif hasattr(torch, "xpu") and torch.xpu.is_available():
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = default_dtype if dtype is None else dtype
         else:
             raise ValueError(
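Pulled out of the class for readability, the sketch below restates the device and dtype selection introduced in this hunk. The names rank, dtype and default_dtype follow the diff; the standalone function itself is illustrative, not part of the repository.

import torch

def select_device_and_dtype(rank, system, dtype=None, default_dtype=torch.float16):
    # Sketch of the selection logic above: CUDA first, then IPEX with an
    # XPU device if one exists, otherwise IPEX running on the CPU.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank}")
        dtype = default_dtype if dtype is None else dtype
    elif system == "ipex":
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device(f"xpu:{rank}")
        else:
            device = torch.device("cpu")
        dtype = default_dtype if dtype is None else dtype
    else:
        raise ValueError("Neither CUDA nor IPEX is available")
    return device, dtype

The net effect is that an IPEX install without an XPU no longer hits the error branch; it simply runs on the CPU device.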
@@ -14,6 +14,7 @@ from text_generation_server.layers.attention import paged_attention, attention,
 from text_generation_server.layers.attention.kv_cache import KVScales, KVCache
 from text_generation_server.models.globals import ATTENTION
 import torch.nn.functional as F
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -174,8 +175,11 @@ class TransformersFlashVlmCausalLM(VlmCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
-        elif hasattr(torch, "xpu") and torch.xpu.is_available():
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = default_dtype if dtype is None else dtype
         else:
             raise ValueError(
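The VLM hunk applies the same fallback as the causal-LM one. A quick, illustrative environment check (not part of the diff) shows which branch the new logic would take on a given machine:

import torch

# Mirrors the runtime checks used in both hunks above.
has_cuda = torch.cuda.is_available()
has_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()

print("CUDA available:", has_cuda)
print("XPU available:", has_xpu)
print("Would fall back to CPU under SYSTEM == 'ipex':", not has_cuda and not has_xpu)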