2023-11-27 13:08:12 +00:00
|
|
|
# Rust builder
|
2025-03-24 10:55:49 +00:00
|
|
|
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
|
2023-11-27 13:08:12 +00:00
|
|
|
WORKDIR /usr/src
|
|
|
|
|
|
|
|
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
|
|
|
|
|
2024-07-03 10:48:45 +00:00
|
|
|
FROM chef AS planner
|
2024-06-24 16:16:36 +00:00
|
|
|
COPY Cargo.lock Cargo.lock
|
2023-11-27 13:08:12 +00:00
|
|
|
COPY Cargo.toml Cargo.toml
|
|
|
|
COPY rust-toolchain.toml rust-toolchain.toml
|
|
|
|
COPY proto proto
|
|
|
|
COPY benchmark benchmark
|
|
|
|
COPY router router
|
2024-07-31 08:33:10 +00:00
|
|
|
COPY backends backends
|
2023-11-27 13:08:12 +00:00
|
|
|
COPY launcher launcher
|
|
|
|
RUN cargo chef prepare --recipe-path recipe.json
|
|
|
|
|
|
|
|
FROM chef AS builder
|
|
|
|
|
2024-09-11 20:41:56 +00:00
|
|
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
|
|
|
python3.11-dev
|
2023-11-27 13:08:12 +00:00
|
|
|
# Install the protobuf compiler (protoc 21.12) and its bundled include files under /usr/local.
# `curl -f` turns an HTTP error (e.g. a 404) into a failure at the download step instead of
# saving the error page to disk and only failing later inside `unzip`.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP
|
|
|
|
|
|
|
|
COPY --from=planner /usr/src/recipe.json recipe.json
|
2024-06-05 10:18:38 +00:00
|
|
|
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
|
2023-11-27 13:08:12 +00:00
|
|
|
|
2024-06-06 16:51:42 +00:00
|
|
|
ARG GIT_SHA
|
|
|
|
ARG DOCKER_LABEL
|
|
|
|
|
2024-10-08 07:42:50 +00:00
|
|
|
COPY Cargo.lock Cargo.lock
|
2023-11-27 13:08:12 +00:00
|
|
|
COPY Cargo.toml Cargo.toml
|
|
|
|
COPY rust-toolchain.toml rust-toolchain.toml
|
|
|
|
COPY proto proto
|
|
|
|
COPY benchmark benchmark
|
|
|
|
COPY router router
|
2024-07-31 08:33:10 +00:00
|
|
|
COPY backends backends
|
2023-11-27 13:08:12 +00:00
|
|
|
COPY launcher launcher
|
2024-10-08 07:42:50 +00:00
|
|
|
RUN cargo build --profile release-opt --frozen
|
2023-11-27 13:08:12 +00:00
|
|
|
|
2025-03-26 16:13:55 +00:00
|
|
|
FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base
|
|
|
|
|
|
|
|
ARG HIPBLASLT_BRANCH="4d40e36"
|
|
|
|
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
|
|
|
|
ARG LEGACY_HIPBLASLT_OPTION=
|
|
|
|
ARG RCCL_BRANCH="648a58d"
|
|
|
|
ARG RCCL_REPO="https://github.com/ROCm/rccl"
|
|
|
|
ARG TRITON_BRANCH="e5be006"
|
|
|
|
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
|
|
|
ARG PYTORCH_BRANCH="3a585126"
|
|
|
|
ARG PYTORCH_VISION_BRANCH="v0.19.1"
|
|
|
|
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
|
|
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
|
|
|
ARG FA_BRANCH="b7d29fb"
|
|
|
|
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
|
|
|
ARG AITER_BRANCH="21d47a9"
|
|
|
|
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
|
|
|
|
|
|
|
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
|
|
|
ENV ROCM_PATH=/opt/rocm
|
|
|
|
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
|
|
|
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
|
|
|
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
|
|
|
|
|
|
|
ARG PYTHON_VERSION=3.11
|
|
|
|
|
|
|
|
# WORKDIR creates /app when it does not exist, so no separate `RUN mkdir -p` layer is needed.
WORKDIR /app
|
|
|
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
|
|
|
|
|
|
# Install Python and other dependencies
|
2023-11-27 13:08:12 +00:00
|
|
|
# Packages are listed alphabetically; the apt lists are removed in the same layer
# so they are not stored in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        ninja-build \
        python3.11-dev \
        python3.11-venv \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
|
|
|
|
ENV PATH="$PATH:/root/.local/bin"
|
|
|
|
RUN uv python install ${PYTHON_VERSION}
|
|
|
|
RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
|
|
|
|
# The virtual environment is created by `uv venv` in the current WORKDIR (/app),
# i.e. at /app/.venv, so VIRTUAL_ENV and PATH must point there rather than /usr/src.
ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"
|
|
|
|
|
|
|
|
RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
|
|
|
|
|
2024-09-30 08:54:32 +00:00
|
|
|
FROM base AS build_hipblaslt
|
2025-03-26 16:13:55 +00:00
|
|
|
ARG HIPBLASLT_BRANCH
|
|
|
|
ARG HIPBLAS_COMMON_BRANCH
|
|
|
|
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
|
|
|
ARG LEGACY_HIPBLASLT_OPTION
|
|
|
|
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
|
|
|
RUN . .venv/bin/activate && cd hipBLAS-common \
|
|
|
|
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
|
|
|
&& mkdir build \
|
|
|
|
&& cd build \
|
|
|
|
&& cmake .. \
|
|
|
|
&& make package \
|
|
|
|
&& dpkg -i ./*.deb
|
|
|
|
RUN git clone https://github.com/ROCm/hipBLASLt
|
|
|
|
RUN . .venv/bin/activate && cd hipBLASLt \
|
2024-09-30 08:54:32 +00:00
|
|
|
&& git checkout ${HIPBLASLT_BRANCH} \
|
2025-03-26 16:13:55 +00:00
|
|
|
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
2024-09-30 08:54:32 +00:00
|
|
|
&& cd build/release \
|
|
|
|
&& make package
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
2024-09-30 08:54:32 +00:00
|
|
|
|
|
|
|
FROM base AS build_rccl
|
2025-03-26 16:13:55 +00:00
|
|
|
ARG RCCL_BRANCH
|
|
|
|
ARG RCCL_REPO
|
|
|
|
RUN git clone ${RCCL_REPO}
|
|
|
|
RUN . .venv/bin/activate && cd rccl \
|
2024-09-30 08:54:32 +00:00
|
|
|
&& git checkout ${RCCL_BRANCH} \
|
|
|
|
&& ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
|
2024-09-30 08:54:32 +00:00
|
|
|
|
|
|
|
FROM base AS build_triton
|
2025-03-26 16:13:55 +00:00
|
|
|
ARG TRITON_BRANCH
|
|
|
|
ARG TRITON_REPO
|
|
|
|
RUN git clone ${TRITON_REPO}
|
|
|
|
RUN . .venv/bin/activate && cd triton \
|
2024-09-30 08:54:32 +00:00
|
|
|
&& git checkout ${TRITON_BRANCH} \
|
|
|
|
&& cd python \
|
|
|
|
&& python3 setup.py bdist_wheel --dist-dir=dist
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
|
2024-09-30 08:54:32 +00:00
|
|
|
|
|
|
|
FROM base AS build_amdsmi
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \
|
2024-09-30 08:54:32 +00:00
|
|
|
&& pip wheel . --wheel-dir=dist
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
|
|
|
|
|
|
|
|
FROM base AS build_pytorch
|
|
|
|
ARG PYTORCH_BRANCH
|
|
|
|
ARG PYTORCH_VISION_BRANCH
|
|
|
|
ARG PYTORCH_REPO
|
|
|
|
ARG PYTORCH_VISION_REPO
|
|
|
|
ARG FA_BRANCH
|
|
|
|
ARG FA_REPO
|
|
|
|
RUN git clone ${PYTORCH_REPO} pytorch
|
|
|
|
RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
|
|
|
pip install -r requirements.txt && git submodule update --init --recursive \
|
|
|
|
&& python3 tools/amd_build/build_amd.py \
|
|
|
|
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
|
|
|
|
&& pip install dist/*.whl
|
|
|
|
RUN git clone ${PYTORCH_VISION_REPO} vision
|
|
|
|
RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
|
|
|
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
|
|
|
&& pip install dist/*.whl
|
|
|
|
RUN git clone ${FA_REPO}
|
|
|
|
RUN . .venv/bin/activate && cd flash-attention \
|
|
|
|
&& git checkout ${FA_BRANCH} \
|
|
|
|
&& git submodule update --init \
|
|
|
|
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
|
|
|
|
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
|
|
|
&& cp /app/vision/dist/*.whl /app/install \
|
|
|
|
&& cp /app/flash-attention/dist/*.whl /app/install
|
|
|
|
|
|
|
|
FROM base AS final
|
|
|
|
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
|
|
|
dpkg -i /install/*deb \
|
|
|
|
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
|
|
|
|
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
|
|
|
|
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
|
|
|
|
dpkg -i /install/*deb \
|
|
|
|
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
|
|
|
|
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
|
|
|
|
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
|
|
|
. .venv/bin/activate && \
|
|
|
|
pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
|
|
|
. .venv/bin/activate && \
|
|
|
|
pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
|
|
|
. .venv/bin/activate && \
|
|
|
|
pip install /install/*.whl
|
|
|
|
|
|
|
|
ARG AITER_REPO
|
|
|
|
ARG AITER_BRANCH
|
|
|
|
RUN git clone --recursive ${AITER_REPO}
|
|
|
|
RUN . .venv/bin/activate && cd aiter \
|
|
|
|
&& git checkout ${AITER_BRANCH} \
|
|
|
|
&& git submodule update --init --recursive \
|
|
|
|
&& pip install -r requirements.txt \
|
|
|
|
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
|
|
|
|
|
|
|
|
# Remove any apt package lists left in the image.
# NOTE(review): the apt-get install step of the base stage already deletes
# /var/lib/apt/lists/* within its own layer, and a removal in a separate later
# layer cannot shrink earlier layers — confirm whether this step is still needed.
RUN rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
FROM final AS kernel-builder
|
MI300 compatibility (#1764)
Adds support for AMD Instinct MI300 in TGI.
Most changes are:
* Support PyTorch TunableOp to pick the GEMM/GEMV kernels for decoding
https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable.
TunableOp is disabled by default, and can be enabled with
`PYTORCH_TUNABLEOP_ENABLED=1`.
* Update ROCm dockerfile to PyTorch 2.3 (actually patched with changes
from https://github.com/pytorch/pytorch/pull/124362)
* Support SILU & Linear custom kernels contributed by AMD
* Update vLLM paged attention to https://github.com/fxmarty/rocm-vllm/,
branching out of a much more recent commit
https://github.com/ROCm/vllm/commit/3489ce7936c5de588916ae3047c44c23c0b0c308
* Support FA2 Triton kernel as recommended by AMD. Can be used by
specifying `ROCM_USE_FLASH_ATTN_V2_TRITON=1`.
* Update dockerfile to ROCm 6.1
By default, TunableOp tuning results are saved in `/data` (e.g.
`/data/tunableop_meta-llama-Llama-2-70b-chat-hf_tp1_rank0.csv`) to avoid
having to rerun the tuning on each `docker run`.
Example:
```
Validator,PT_VERSION,2.3.0
Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c
Validator,HIPBLASLT_VERSION,0.7.0-1549b021
Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_Half_TN,tn_8192_7_28672,Gemm_Rocblas_45475,0.132098
GemmTunableOp_Half_TN,tn_10240_4_8192,Gemm_Rocblas_45546,0.0484431
GemmTunableOp_Half_TN,tn_32000_6_8192,Default,0.149546
GemmTunableOp_Half_TN,tn_32000_3_8192,Gemm_Rocblas_45520,0.147119
GemmTunableOp_Half_TN,tn_8192_3_28672,Gemm_Rocblas_45475,0.132645
GemmTunableOp_Half_TN,tn_10240_3_8192,Gemm_Rocblas_45546,0.0482971
GemmTunableOp_Half_TN,tn_57344_5_8192,Gemm_Rocblas_45520,0.255694
GemmTunableOp_Half_TN,tn_10240_7_8192,Gemm_Rocblas_45517,0.0482522
GemmTunableOp_Half_TN,tn_8192_3_8192,Gemm_Rocblas_45546,0.0444671
GemmTunableOp_Half_TN,tn_8192_5_8192,Gemm_Rocblas_45546,0.0445834
GemmTunableOp_Half_TN,tn_57344_7_8192,Gemm_Rocblas_45520,0.25622
GemmTunableOp_Half_TN,tn_8192_2_28672,Gemm_Rocblas_45475,0.132122
GemmTunableOp_Half_TN,tn_8192_4_8192,Gemm_Rocblas_45517,0.0453191
GemmTunableOp_Half_TN,tn_10240_5_8192,Gemm_Rocblas_45517,0.0482514
GemmTunableOp_Half_TN,tn_8192_5_28672,Gemm_Rocblas_45542,0.133914
GemmTunableOp_Half_TN,tn_8192_2_8192,Gemm_Rocblas_45517,0.0446516
GemmTunableOp_Half_TN,tn_8192_1_28672,Gemm_Hipblaslt_TN_10814,0.131953
GemmTunableOp_Half_TN,tn_10240_2_8192,Gemm_Rocblas_45546,0.0481043
GemmTunableOp_Half_TN,tn_32000_4_8192,Gemm_Rocblas_45520,0.147497
GemmTunableOp_Half_TN,tn_8192_6_28672,Gemm_Rocblas_45529,0.134895
GemmTunableOp_Half_TN,tn_57344_2_8192,Gemm_Rocblas_45520,0.254716
GemmTunableOp_Half_TN,tn_57344_4_8192,Gemm_Rocblas_45520,0.255731
GemmTunableOp_Half_TN,tn_10240_6_8192,Gemm_Rocblas_45517,0.0484816
GemmTunableOp_Half_TN,tn_57344_3_8192,Gemm_Rocblas_45520,0.254701
GemmTunableOp_Half_TN,tn_8192_4_28672,Gemm_Rocblas_45475,0.132159
GemmTunableOp_Half_TN,tn_32000_2_8192,Default,0.147524
GemmTunableOp_Half_TN,tn_32000_5_8192,Default,0.147074
GemmTunableOp_Half_TN,tn_8192_6_8192,Gemm_Rocblas_45546,0.0454045
GemmTunableOp_Half_TN,tn_57344_6_8192,Gemm_Rocblas_45520,0.255582
GemmTunableOp_Half_TN,tn_32000_7_8192,Default,0.146705
GemmTunableOp_Half_TN,tn_8192_7_8192,Gemm_Rocblas_45546,0.0445489
```
---------
Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com>
2024-05-17 13:30:47 +00:00
|
|
|
# Build vllm kernels
|
2023-11-27 13:08:12 +00:00
|
|
|
FROM kernel-builder AS vllm-builder
|
|
|
|
|
|
|
|
COPY server/Makefile-vllm Makefile
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && pip install setuptools_scm
|
2023-11-27 13:08:12 +00:00
|
|
|
|
|
|
|
# Build specific version of vllm
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && make build-vllm-rocm
|
2023-11-27 13:08:12 +00:00
|
|
|
|
|
|
|
# Build Transformers CUDA kernels (gpt-neox and bloom)
|
2024-07-03 10:48:45 +00:00
|
|
|
FROM kernel-builder AS custom-kernels-builder
|
2023-11-27 13:08:12 +00:00
|
|
|
COPY server/custom_kernels/ .
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
|
2023-11-27 13:08:12 +00:00
|
|
|
|
2024-01-26 15:27:44 +00:00
|
|
|
# Build exllama kernels
|
2024-07-03 10:48:45 +00:00
|
|
|
FROM kernel-builder AS exllama-kernels-builder
|
2024-01-26 15:27:44 +00:00
|
|
|
COPY server/exllama_kernels/ .
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
|
2024-01-26 15:27:44 +00:00
|
|
|
|
|
|
|
# Build exllama v2 kernels
|
2024-07-03 10:48:45 +00:00
|
|
|
FROM kernel-builder AS exllamav2-kernels-builder
|
2024-01-26 15:27:44 +00:00
|
|
|
COPY server/exllamav2_kernels/ .
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
|
2024-01-26 15:27:44 +00:00
|
|
|
|
2025-01-15 06:08:58 +00:00
|
|
|
FROM kernel-builder AS marlin-kernels
|
|
|
|
ENV MARLIN_KERNELS_BRANCH=v0.3.6
|
|
|
|
ENV VLLM_TARGET_DEVICE=rocm
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \
|
2025-01-15 06:08:58 +00:00
|
|
|
cd marlin-kernels && \
|
|
|
|
git checkout ${MARLIN_KERNELS_BRANCH} && \
|
2025-03-26 16:13:55 +00:00
|
|
|
python3 setup.py bdist_wheel --dist-dir=dist
|
2025-01-15 06:08:58 +00:00
|
|
|
|
2024-12-18 11:44:42 +00:00
|
|
|
FROM kernel-builder AS moe-kernels
|
2025-01-31 10:40:00 +00:00
|
|
|
ENV MOE_KERNELS_BRANCH=v0.8.2
|
2024-12-18 11:44:42 +00:00
|
|
|
ENV VLLM_TARGET_DEVICE=rocm
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN . .venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \
|
2024-12-18 11:44:42 +00:00
|
|
|
cd moe-kernels && \
|
|
|
|
git checkout ${MOE_KERNELS_BRANCH} && \
|
2025-03-26 16:13:55 +00:00
|
|
|
python3 setup.py bdist_wheel --dist-dir=dist
|
2024-12-18 11:44:42 +00:00
|
|
|
|
2025-03-26 16:13:55 +00:00
|
|
|
FROM final AS base-copy
|
2023-11-27 13:08:12 +00:00
|
|
|
|
|
|
|
# Text Generation Inference base env
|
2024-08-09 12:25:44 +00:00
|
|
|
ENV HF_HOME=/data \
|
2023-11-27 13:08:12 +00:00
|
|
|
HF_HUB_ENABLE_HF_TRANSFER=1 \
|
|
|
|
PORT=80
|
|
|
|
|
2025-03-26 16:13:55 +00:00
|
|
|
ENV VIRTUAL_ENV=/app/.venv/
|
|
|
|
ENV PATH="$PATH:/app/.venv/bin/"
|
2025-03-28 16:17:13 +00:00
|
|
|
RUN uv pip install kernels
|
2024-12-18 11:44:42 +00:00
|
|
|
|
2023-11-27 13:08:12 +00:00
|
|
|
# Install server
|
|
|
|
# Copy the protobuf definitions and the Python server sources.
# `COPY server server` already includes server/Makefile, so it is not copied a second time.
COPY proto proto
COPY server server
|
|
|
|
RUN cd server && \
|
2025-03-26 16:13:55 +00:00
|
|
|
uv pip install grpcio-tools mypy-protobuf && \
|
2025-03-28 16:17:13 +00:00
|
|
|
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
|
|
|
|
make gen-server-raw && \
|
|
|
|
kernels download .
|
|
|
|
|
2025-02-06 11:28:24 +00:00
|
|
|
RUN cd server && \
|
|
|
|
pwd && \
|
|
|
|
text-generation-server --help
|
2023-11-27 13:08:12 +00:00
|
|
|
|
2025-03-26 16:13:55 +00:00
|
|
|
RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
|
|
|
|
uv pip install /install/*.whl
|
|
|
|
# Install the wheel built in the custom-kernels-builder stage.
# This step used to appear twice verbatim; a single install is sufficient.
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
|
|
|
|
uv pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
|
|
|
|
uv pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
|
|
|
|
uv pip install /install/*.whl
|
|
|
|
RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
|
|
|
|
uv pip install /install/*.whl
|
|
|
|
|
2025-03-28 16:17:13 +00:00
|
|
|
|
2023-11-27 13:08:12 +00:00
|
|
|
# Install benchmarker
|
2024-06-05 10:18:38 +00:00
|
|
|
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
|
2023-11-27 13:08:12 +00:00
|
|
|
# Install router
|
2024-06-05 10:18:38 +00:00
|
|
|
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
|
2023-11-27 13:08:12 +00:00
|
|
|
# Install launcher
|
2024-06-05 10:18:38 +00:00
|
|
|
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
|
2023-11-27 13:08:12 +00:00
|
|
|
|
|
|
|
# AWS Sagemaker compatible image
|
2024-07-03 10:48:45 +00:00
|
|
|
FROM base AS sagemaker
|
MI300 compatibility (#1764)
Adds support for AMD Instinct MI300 in TGI.
Most changes are:
* Support PyTorch TunableOp to pick the GEMM/GEMV kernels for decoding
https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable.
TunableOp is disabled by default, and can be enabled with
`PYTORCH_TUNABLEOP_ENABLED=1`.
* Update ROCm dockerfile to PyTorch 2.3 (actually patched with changes
from https://github.com/pytorch/pytorch/pull/124362)
* Support SILU & Linear custom kernels contributed by AMD
* Update vLLM paged attention to https://github.com/fxmarty/rocm-vllm/,
branching out of a much more recent commit
https://github.com/ROCm/vllm/commit/3489ce7936c5de588916ae3047c44c23c0b0c308
* Support FA2 Triton kernel as recommended by AMD. Can be used by
specifying `ROCM_USE_FLASH_ATTN_V2_TRITON=1`.
* Update dockerfile to ROCm 6.1
By default, TunableOp tuning results are saved in `/data` (e.g.
`/data/tunableop_meta-llama-Llama-2-70b-chat-hf_tp1_rank0.csv`) to avoid
having to rerun the tuning on each `docker run`.
Example:
```
Validator,PT_VERSION,2.3.0
Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c
Validator,HIPBLASLT_VERSION,0.7.0-1549b021
Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_Half_TN,tn_8192_7_28672,Gemm_Rocblas_45475,0.132098
GemmTunableOp_Half_TN,tn_10240_4_8192,Gemm_Rocblas_45546,0.0484431
GemmTunableOp_Half_TN,tn_32000_6_8192,Default,0.149546
GemmTunableOp_Half_TN,tn_32000_3_8192,Gemm_Rocblas_45520,0.147119
GemmTunableOp_Half_TN,tn_8192_3_28672,Gemm_Rocblas_45475,0.132645
GemmTunableOp_Half_TN,tn_10240_3_8192,Gemm_Rocblas_45546,0.0482971
GemmTunableOp_Half_TN,tn_57344_5_8192,Gemm_Rocblas_45520,0.255694
GemmTunableOp_Half_TN,tn_10240_7_8192,Gemm_Rocblas_45517,0.0482522
GemmTunableOp_Half_TN,tn_8192_3_8192,Gemm_Rocblas_45546,0.0444671
GemmTunableOp_Half_TN,tn_8192_5_8192,Gemm_Rocblas_45546,0.0445834
GemmTunableOp_Half_TN,tn_57344_7_8192,Gemm_Rocblas_45520,0.25622
GemmTunableOp_Half_TN,tn_8192_2_28672,Gemm_Rocblas_45475,0.132122
GemmTunableOp_Half_TN,tn_8192_4_8192,Gemm_Rocblas_45517,0.0453191
GemmTunableOp_Half_TN,tn_10240_5_8192,Gemm_Rocblas_45517,0.0482514
GemmTunableOp_Half_TN,tn_8192_5_28672,Gemm_Rocblas_45542,0.133914
GemmTunableOp_Half_TN,tn_8192_2_8192,Gemm_Rocblas_45517,0.0446516
GemmTunableOp_Half_TN,tn_8192_1_28672,Gemm_Hipblaslt_TN_10814,0.131953
GemmTunableOp_Half_TN,tn_10240_2_8192,Gemm_Rocblas_45546,0.0481043
GemmTunableOp_Half_TN,tn_32000_4_8192,Gemm_Rocblas_45520,0.147497
GemmTunableOp_Half_TN,tn_8192_6_28672,Gemm_Rocblas_45529,0.134895
GemmTunableOp_Half_TN,tn_57344_2_8192,Gemm_Rocblas_45520,0.254716
GemmTunableOp_Half_TN,tn_57344_4_8192,Gemm_Rocblas_45520,0.255731
GemmTunableOp_Half_TN,tn_10240_6_8192,Gemm_Rocblas_45517,0.0484816
GemmTunableOp_Half_TN,tn_57344_3_8192,Gemm_Rocblas_45520,0.254701
GemmTunableOp_Half_TN,tn_8192_4_28672,Gemm_Rocblas_45475,0.132159
GemmTunableOp_Half_TN,tn_32000_2_8192,Default,0.147524
GemmTunableOp_Half_TN,tn_32000_5_8192,Default,0.147074
GemmTunableOp_Half_TN,tn_8192_6_8192,Gemm_Rocblas_45546,0.0454045
GemmTunableOp_Half_TN,tn_57344_6_8192,Gemm_Rocblas_45520,0.255582
GemmTunableOp_Half_TN,tn_32000_7_8192,Default,0.146705
GemmTunableOp_Half_TN,tn_8192_7_8192,Gemm_Rocblas_45546,0.0445489
```
---------
Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com>
2024-05-17 13:30:47 +00:00
|
|
|
|
2023-11-27 13:08:12 +00:00
|
|
|
# Copy the SageMaker entrypoint and make it executable in the same layer.
# COPY --chmod needs BuildKit, which this file already requires for `RUN --mount`.
COPY --chmod=755 sagemaker-entrypoint.sh entrypoint.sh
|
|
|
|
|
|
|
|
ENTRYPOINT ["./entrypoint.sh"]
|
|
|
|
|
|
|
|
# Final image
|
|
|
|
FROM base-copy
|
|
|
|
|
2024-09-30 08:54:32 +00:00
|
|
|
# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
|
|
|
|
ENV HIP_FORCE_DEV_KERNARG=1
|
|
|
|
|
|
|
|
# On MI250 and MI300, flash attention performance with Triton FA is slightly better than with CK.
|
|
|
|
# However, Triton requires tuning for each prompt length, which is prohibitive.
|
|
|
|
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
|
|
|
|
ENV ROCM_USE_CUSTOM_PAGED_ATTN=1
|
|
|
|
ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0
|
|
|
|
ENV VLLM_MOE_PADDING=0
|
|
|
|
ENV ATTENTION=paged
|
2024-10-16 10:49:33 +00:00
|
|
|
ENV PREFIX_CACHING=0
|
|
|
|
ENV PREFILL_CHUNKING=0
|
2024-09-30 08:54:32 +00:00
|
|
|
ENV ROCM_USE_SKINNY_GEMM=1
|
2025-03-28 16:17:13 +00:00
|
|
|
ENV PYTORCH_TUNABLEOP_ENABLED=0
|
MI300 compatibility (#1764)
Adds support for AMD Instinct MI300 in TGI.
Most changes are:
* Support PyTorch TunableOp to pick the GEMM/GEMV kernels for decoding
https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable.
TunableOp is disabled by default, and can be enabled with
`PYTORCH_TUNABLEOP_ENABLED=1`.
* Update ROCm dockerfile to PyTorch 2.3 (actually patched with changes
from https://github.com/pytorch/pytorch/pull/124362)
* Support SILU & Linear custom kernels contributed by AMD
* Update vLLM paged attention to https://github.com/fxmarty/rocm-vllm/,
branching out of a much more recent commit
https://github.com/ROCm/vllm/commit/3489ce7936c5de588916ae3047c44c23c0b0c308
* Support FA2 Triton kernel as recommended by AMD. Can be used by
specifying `ROCM_USE_FLASH_ATTN_V2_TRITON=1`.
* Update dockerfile to ROCm 6.1
By default, TunableOp tuning results are saved in `/data` (e.g.
`/data/tunableop_meta-llama-Llama-2-70b-chat-hf_tp1_rank0.csv`) to avoid
having to rerun the tuning on each `docker run`.
Example:
```
Validator,PT_VERSION,2.3.0
Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c
Validator,HIPBLASLT_VERSION,0.7.0-1549b021
Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_Half_TN,tn_8192_7_28672,Gemm_Rocblas_45475,0.132098
GemmTunableOp_Half_TN,tn_10240_4_8192,Gemm_Rocblas_45546,0.0484431
GemmTunableOp_Half_TN,tn_32000_6_8192,Default,0.149546
GemmTunableOp_Half_TN,tn_32000_3_8192,Gemm_Rocblas_45520,0.147119
GemmTunableOp_Half_TN,tn_8192_3_28672,Gemm_Rocblas_45475,0.132645
GemmTunableOp_Half_TN,tn_10240_3_8192,Gemm_Rocblas_45546,0.0482971
GemmTunableOp_Half_TN,tn_57344_5_8192,Gemm_Rocblas_45520,0.255694
GemmTunableOp_Half_TN,tn_10240_7_8192,Gemm_Rocblas_45517,0.0482522
GemmTunableOp_Half_TN,tn_8192_3_8192,Gemm_Rocblas_45546,0.0444671
GemmTunableOp_Half_TN,tn_8192_5_8192,Gemm_Rocblas_45546,0.0445834
GemmTunableOp_Half_TN,tn_57344_7_8192,Gemm_Rocblas_45520,0.25622
GemmTunableOp_Half_TN,tn_8192_2_28672,Gemm_Rocblas_45475,0.132122
GemmTunableOp_Half_TN,tn_8192_4_8192,Gemm_Rocblas_45517,0.0453191
GemmTunableOp_Half_TN,tn_10240_5_8192,Gemm_Rocblas_45517,0.0482514
GemmTunableOp_Half_TN,tn_8192_5_28672,Gemm_Rocblas_45542,0.133914
GemmTunableOp_Half_TN,tn_8192_2_8192,Gemm_Rocblas_45517,0.0446516
GemmTunableOp_Half_TN,tn_8192_1_28672,Gemm_Hipblaslt_TN_10814,0.131953
GemmTunableOp_Half_TN,tn_10240_2_8192,Gemm_Rocblas_45546,0.0481043
GemmTunableOp_Half_TN,tn_32000_4_8192,Gemm_Rocblas_45520,0.147497
GemmTunableOp_Half_TN,tn_8192_6_28672,Gemm_Rocblas_45529,0.134895
GemmTunableOp_Half_TN,tn_57344_2_8192,Gemm_Rocblas_45520,0.254716
GemmTunableOp_Half_TN,tn_57344_4_8192,Gemm_Rocblas_45520,0.255731
GemmTunableOp_Half_TN,tn_10240_6_8192,Gemm_Rocblas_45517,0.0484816
GemmTunableOp_Half_TN,tn_57344_3_8192,Gemm_Rocblas_45520,0.254701
GemmTunableOp_Half_TN,tn_8192_4_28672,Gemm_Rocblas_45475,0.132159
GemmTunableOp_Half_TN,tn_32000_2_8192,Default,0.147524
GemmTunableOp_Half_TN,tn_32000_5_8192,Default,0.147074
GemmTunableOp_Half_TN,tn_8192_6_8192,Gemm_Rocblas_45546,0.0454045
GemmTunableOp_Half_TN,tn_57344_6_8192,Gemm_Rocblas_45520,0.255582
GemmTunableOp_Half_TN,tn_32000_7_8192,Default,0.146705
GemmTunableOp_Half_TN,tn_8192_7_8192,Gemm_Rocblas_45546,0.0445489
```
---------
Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com>
2024-05-17 13:30:47 +00:00
|
|
|
# Copy the container entrypoint and make it executable in the same layer.
# COPY --chmod needs BuildKit, which this file already requires for `RUN --mount`.
COPY --chmod=755 ./tgi-entrypoint.sh /tgi-entrypoint.sh
|
|
|
|
|
|
|
|
ENTRYPOINT ["/tgi-entrypoint.sh"]
|
2025-03-26 16:13:55 +00:00
|
|
|
# Append the lib directory of the uv-managed CPython install to the dynamic loader path.
# NOTE(review): the path hard-codes the 3.11.11 patch release — confirm it matches the
# interpreter that `uv python install` actually provides whenever the Python version changes.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
|
|
|
|
ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
|
|
|
|
# CMD ["--json-output"]
|