From 5d97e0c4a3688ef462472167242c48570b8125c5 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 2 Jul 2024 17:56:07 +0800 Subject: [PATCH] fix FlashDecoding change's regression in intel platform (#2161) install triton because GPTQParams needs it. Signed-off-by: Wang, Yi A --- Dockerfile_intel | 2 ++ server/text_generation_server/layers/attention/ipex.py | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index a41fbc1e3..3c060f19d 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -62,6 +62,7 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ WORKDIR /usr/src RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl && pip install torch-2.1.0.post1+cxx11.abi-cp310-cp310-linux_x86_64.whl +RUN pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b distributed origin/dev/distributed # Install server @@ -132,6 +133,7 @@ RUN conda install -c conda-forge gperftools mkl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl +RUN pip install triton WORKDIR /usr/src diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py index db79c589e..45a0a03ec 100644 --- a/server/text_generation_server/layers/attention/ipex.py +++ b/server/text_generation_server/layers/attention/ipex.py @@ -1,6 +1,7 @@ import intel_extension_for_pytorch as ipex import torch from text_generation_server.models.flash_causal_lm import BLOCK_SIZE +from text_generation_server.layers.attention import Seqlen SUPPORTS_WINDOWING = False @@ -55,11 +56,10 @@ def paged_attention( kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, - cu_seqlen_q: torch.Tensor, - cu_seqlen_k: torch.Tensor, + seqlen: Seqlen, max_s: int, ): - return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( + ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, query, key_cache, @@ -67,8 +67,9 @@ def paged_attention( kv_head_mapping, softmax_scale, block_tables, - cu_seqlen_q, + seqlen.input_lengths, BLOCK_SIZE, max_s, None, ) + return out