# Those arguments are required to build the image
ARG HABANA_VERSION=1.20.0
ARG PYTORCH_VERSION=2.6.0
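
# Example build invocation (illustrative: the Dockerfile path, image tag and build context are assumptions):
#   docker build -f Dockerfile \
#       --build-arg HABANA_VERSION=1.20.0 \
#       --build-arg PYTORCH_VERSION=2.6.0 \
#       -t tgi-gaudi .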

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
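
# The planner stage below only computes a cargo-chef recipe (recipe.json) describing the
# workspace dependencies; the builder stage then cooks that recipe so dependency compilation
# is cached independently of application source changes.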

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ENV PYO3_PYTHON="/root/.local/bin/python" \
    PYTHON_SYS_EXECUTABLE="/root/.local/bin/python" \
    PYO3_PYTHON_VERSION="3.10"
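
# These variables point PyO3 at the uv-managed Python 3.10 interpreter installed in the next
# step, so the launcher (which embeds Python via PyO3) is built against a known interpreter.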

RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
    && . $HOME/.local/bin/env \
    && uv python install 3.10 --default --preview \
    && test -f /root/.local/bin/python || (echo "Python 3.10 not found at /root/.local/bin/python" && exit 1)

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP
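
# protoc (installed above) is required during the cargo build to generate the gRPC stubs from
# the definitions in proto/.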

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
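
# `cargo chef cook` builds only the dependencies described in recipe.json, so this layer stays
# cached as long as the dependency set does not change.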

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt
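
# This build produces the text-generation-launcher, text-generation-router and
# text-generation-benchmark binaries that are copied into the runtime image below.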

# Text Generation Inference base image
ARG HABANA_VERSION
ARG PYTORCH_VERSION

FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base

ENV ATTENTION=default
ENV PREFIX_CACHING=0
ENV PREFILL_CHUNKING=0

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80
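
# Model downloads are cached under HF_HOME (/data); mounting a host volume at /data persists
# them across runs, e.g. `docker run -v $PWD/data:/data ...` (illustrative).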

# Assert that Python 3.10 is installed, since the launcher is built against Python 3.10
RUN python3.10 --version || (echo "Python 3.10 is not installed" && exit 1)

# libssl.so.1.1 is not shipped on Ubuntu 22.04 by default, so install it manually
RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    libssl-dev \
    ca-certificates \
    make \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install server
COPY proto proto
COPY backends/gaudi/server server
COPY backends/gaudi/server/Makefile server/Makefile
ARG HABANA_VERSION
RUN cd server && \
    make gen-server && \
    pip install --no-deps -r requirements.txt && \
    bash ./dill-0.3.8-patch.sh && \
    pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \
    BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \
    pip install . --no-cache-dir
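
# In the step above, `make gen-server` generates the Python gRPC stubs from the definitions in
# proto/ before the server package itself is installed.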
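
# Install the vLLM HPU extension ops for Gaudi (used by the paged-attention path)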
RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router

# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV HABANA_VISIBLE_DEVICES=all
ENV OMPI_MCA_btl_vader_single_copy_mechanism=NONE

COPY backends/gaudi/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]
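
# Example run invocation (illustrative: image tag, ports, volume and model id are assumptions):
#   docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
#       -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
#       --cap-add=sys_nice --ipc=host \
#       -p 8080:80 -v $PWD/data:/data \
#       tgi-gaudi --model-id <model-id>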