Update Dockerfile

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-09-11 12:24:53 +00:00 · 2024-03-06 04:46:56 -08:00 · 2024-03-06 04:46:56 -08:00 · 02537ec663
commit 02537ec663
parent 23a1cb0511
4 changed files with 34 additions and 8 deletions
--- a/35
+++ b/35
@ -1,5 +1,4 @@
-# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
 FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@ -35,6 +34,7 @@ COPY router router
 COPY launcher launcher
 RUN cargo build --release
 # Text Generation Inference base image for Intel
 FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base
@ -47,7 +47,7 @@ RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
-RUN apt-get update && apt install -y intel-basekit xpu-smi
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@ -55,6 +55,11 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
    PORT=80
 WORKDIR /usr/src
 # Build pytorch and ipex
 RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
 RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
 # Install server
 COPY proto proto
 COPY server server
@ -62,7 +67,29 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
    pip install -r requirements_common.txt && \
-    pip install ".[accelerate, peft]" --no-cache-dir
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/2021.11
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.11
 ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
 ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/2024.0/etc/compiler/sys_check/sys_check.sh
 ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
 ENV MANPATH=/opt/intel/oneapi/mpi/2021.11/share/man:/opt/intel/oneapi/mpi/2021.11/share/man:/opt/intel/oneapi/compiler/2024.0/documentation/en/man/common:
 ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/2024.0/lib/cmake:/opt/intel/oneapi/compiler/2024.0
 ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/2024.0
 ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mkl/2024.0/lib/:/opt/intel/oneapi/compiler/2024.0/lib
 ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/2024.0/lib/libintelocl.so
 ENV CLASSPATH=/opt/intel/oneapi/mpi/2021.11/share/java/mpi.jar:/opt/intel/oneapi/mpi/2021.11/share/java/mpi.jar
 ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/2021.11/lib/:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.11/lib:/opt/intel/oneapi/mkl/2024.0/lib:/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.0/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
 ENV MKLROOT=/opt/intel/oneapi/mkl/2024.0
 ENV NLSPATH=/opt/intel/oneapi/mkl/2024.0/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/2024.0/lib/locale/%l_%t/%N
 ENV PATH=/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/2021.11/bin:/opt/intel/oneapi/mpi/2021.11/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/2024.0/bin/:/opt/intel/oneapi/compiler/2024.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV CPATH=/opt/intel/oneapi/mpi/2021.11/include:/opt/intel/oneapi/ccl/2021.11/include:/opt/intel/oneapi/mkl/2024.0/include
 ENV CCL_ZE_IPC_EXCHANGE=sockets
 RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
 RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@ -35,7 +35,7 @@ SLIDING_WINDOW: Optional[int] = None
 SLIDING_WINDOW_BLOCKS: Optional[int] = None
 from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
-MEM_POOL = torch.cuda.graph_pool_handle()
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 def set_sliding_window(sliding_window: int, sliding_window_blocks: int):
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@ -1,7 +1,7 @@
 import torch
 import os
-MEM_POOL = torch.cuda.graph_pool_handle()
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 # This is overridden by the cli
 cuda_graphs = os.getenv("CUDA_GRAPHS")
 if cuda_graphs is not None and cuda_graphs != "0":
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@ -883,11 +883,10 @@ try:
                residual = hidden_states
                out = ipex.llm.modules.RMSNorm.apply(
                    hidden_states,
                    [hidden_states.size(-1)],
                    self.weight,
                    self.variance_epsilon,
                )
-                return out[0], residual
+                return out, residual
            elif hidden_states.shape[-1] > 8192:
                if residual is not None:
                    hidden_states += residual