Enable Transformers flash LLM/VLM on XPU
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Parent: 24bec29ffc
Commit: 50282e3cc1
@@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/
 RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc libnl-genl-3-200
 
 # Text Generation Inference base env
 ENV HF_HOME=/data \
@@ -98,9 +98,7 @@ ENV HF_HOME=/data \
 
 WORKDIR /usr/src
 
-RUN pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/test/xpu
-
-RUN pip install triton-xpu==3.2.0b1 --no-cache-dir
+RUN pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/xpu
 
 # Install server
 COPY proto proto
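The PyTorch install now comes from the stable XPU wheel index and pulls in a matching torchvision, while the separate triton-xpu pin is dropped. As a quick sanity check of such a build (a sketch, not part of the commit), something like the following can confirm the XPU backend is actually usable inside the image:

# Hedged sanity check: verifies the XPU-enabled torch build installed above.
import torch

if hasattr(torch, "xpu") and torch.xpu.is_available():
    # Allocate a small tensor on the first XPU to exercise the runtime.
    x = torch.randn(2, 2, device="xpu")
    print(torch.__version__, torch.xpu.device_count(), x.device)
else:
    print("No XPU device detected")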
@@ -201,7 +201,9 @@ except ImportError as e:
 if MAMBA_AVAILABLE:
     __all__.append(Mamba)
 
-FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available()
+FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or (
+    hasattr(torch, "xpu") and torch.xpu.is_available()
+)
 try:
     from text_generation_server.models.transformers_flash_causal_lm import (
         TransformersFlashCausalLM,
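With this change, FLASH_TRANSFORMERS_BACKEND is enabled when either a CUDA or an XPU device is present; the hasattr guard keeps the check safe on torch builds that ship without the xpu module. A minimal sketch of the same pattern in isolation (the function name is illustrative, not from the repository):

import torch

def flash_backend_available() -> bool:
    # CUDA keeps enabling the flash backend as before; a detected XPU now
    # counts as well. hasattr() avoids AttributeError on non-XPU builds.
    return torch.cuda.is_available() or (
        hasattr(torch, "xpu") and torch.xpu.is_available()
    )

print(flash_backend_available())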
@@ -116,7 +116,7 @@ class TransformersFlashCausalLM(FlashCausalLM):
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
         elif hasattr(torch, "xpu") and torch.xpu.is_available():
-            device = torch.device("xpu")
+            device = torch.device(f"xpu:{rank}")
             dtype = default_dtype if dtype is None else dtype
         else:
             raise ValueError(
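Binding each rank to xpu:{rank} mirrors the existing CUDA path; previously every rank resolved to the default XPU device, which defeats multi-card sharding. A standalone sketch of the device-selection logic, assuming the rank comes from the launcher environment:

import os
import torch

rank = int(os.getenv("RANK", "0"))  # assumed to be set by the launcher

if torch.cuda.is_available():
    device = torch.device(f"cuda:{rank}")
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    # Each rank now addresses its own card instead of the default "xpu" device.
    device = torch.device(f"xpu:{rank}")
else:
    raise ValueError("Neither CUDA nor XPU devices are available")

print(device)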
@@ -175,7 +175,7 @@ class TransformersFlashVlmCausalLM(VlmCausalLM):
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
         elif hasattr(torch, "xpu") and torch.xpu.is_available():
-            device = torch.device("xpu")
+            device = torch.device(f"xpu:{rank}")
             dtype = default_dtype if dtype is None else dtype
         else:
             raise ValueError(
@@ -73,6 +73,13 @@ def initialize_torch_distributed():
         if SYSTEM == "ipex":
             import intel_extension_for_pytorch as ipex
 
+            if torch.xpu.is_available():
+                assert (
+                    WORLD_SIZE <= torch.xpu.device_count()
+                ), "Each process is one xpu"
+                device = RANK % torch.xpu.device_count()
+                torch.xpu.set_device(device)
+
             ipex.distributed.init_process_group(
                 backend="ccl",
                 world_size=WORLD_SIZE,
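The added block pins each process to a single XPU (rank modulo device count) before the oneCCL process group is created, and asserts that no more ranks were launched than there are cards. A self-contained sketch of just that mapping, assuming the usual RANK/WORLD_SIZE environment variables from torchrun or mpirun:

import os
import torch

RANK = int(os.getenv("RANK", "0"))
WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))

if hasattr(torch, "xpu") and torch.xpu.is_available():
    # One process per card: refuse to oversubscribe the available XPUs.
    assert WORLD_SIZE <= torch.xpu.device_count(), "Each process is one xpu"
    device = RANK % torch.xpu.device_count()
    torch.xpu.set_device(device)
    print(f"rank {RANK} -> xpu:{device}")

After this, ipex.distributed.init_process_group(backend="ccl", ...) proceeds as before, but with every rank already bound to its own device.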