# Multi-stage build of the Text Generation Inference image for AMD ROCm.
#
# Stage overview:
#   chef / planner / builder   - compile the Rust binaries (benchmark, router, launcher)
#   base                       - ROCm dev image + Python 3.11 virtualenv in /app/.venv
#   build_*                    - build hipBLASLt, RCCL, Triton, amd_smi, PyTorch/vision/flash-attention
#   final                      - base + the artifacts of every build_* stage + aiter
#   kernel-builder and friends - build the extra kernel wheels (vllm, exllama, marlin, moe, ...)
#   base-copy                  - final + the Python server, kernel wheels and Rust binaries
#   sagemaker                  - base-copy with the SageMaker entrypoint
#   (unnamed last stage)       - default runtime image

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Produce the cargo-chef recipe from the workspace manifests and sources.
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev

# Install protoc 21.12 (binary and bundled include files) into /usr/local.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook the recipe before the sources are copied, so the dependency build is
# its own layer and is not invalidated by source-only changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen

# ROCm toolchain base shared by every build_* stage and by the runtime image.
FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base

# Pinned revisions / repositories of the components built from source below.
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
# No trailing ':' here: an empty LD_LIBRARY_PATH entry makes the dynamic loader
# search the current working directory, and the final stage appends to this value.
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

ARG PYTHON_VERSION=3.11

RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    git \
    ninja-build \
    cmake \
    software-properties-common \
    python3.11-dev \
    python3.11-venv && \
    rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
# WORKDIR is /app, so the virtualenv is created at /app/.venv.
RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
# Point at the virtualenv created above; every later stage inherits these values.
ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"
RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

# Build hipBLAS-common and hipBLASLt .deb packages.
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN . .venv/bin/activate && cd hipBLAS-common \
    && git checkout ${HIPBLAS_COMMON_BRANCH} \
    && mkdir build \
    && cd build \
    && cmake .. \
    && make package \
    && dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN . .venv/bin/activate && cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install

# Build the RCCL .deb packages.
FROM base AS build_rccl
ARG RCCL_BRANCH
ARG RCCL_REPO
RUN git clone ${RCCL_REPO}
RUN . .venv/bin/activate && cd rccl \
    && git checkout ${RCCL_BRANCH} \
    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install

# Build the Triton wheel.
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
RUN . .venv/bin/activate && cd triton \
    && git checkout ${TRITON_BRANCH} \
    && cd python \
    && python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install

# Build the amd_smi wheel from the sources shipped in the ROCm image.
FROM base AS build_amdsmi
RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install

# Build PyTorch, torchvision and flash-attention wheels.
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
# The freshly built torch wheel is installed right away because the vision
# build below runs in the same virtualenv.
RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \
    pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
RUN git clone ${FA_REPO}
RUN . .venv/bin/activate && cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install

# Assemble the ROCm + PyTorch environment from the build_* stages.
FROM base AS final
# The sed calls drop the hipblaslt / rccl entries from dependency lists in the
# dpkg status file (presumably so the locally built packages do not violate
# version constraints of the preinstalled ROCm packages - confirm before changing).
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl

ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
# aiter is installed in develop mode with kernels prebuilt for gfx942 only.
RUN . .venv/bin/activate && cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter

RUN rm -rf /var/lib/apt/lists/*

FROM final AS kernel-builder

# Build vllm kernels
FROM kernel-builder AS vllm-builder

COPY server/Makefile-vllm Makefile
RUN . .venv/bin/activate && pip install setuptools_scm

# Build specific version of vllm
RUN . .venv/bin/activate && make build-vllm-rocm

# Build Transformers CUDA kernels (gpt-neox and bloom)
FROM kernel-builder AS custom-kernels-builder
COPY server/custom_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

# Build exllama kernels
FROM kernel-builder AS exllama-kernels-builder
COPY server/exllama_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

# Build exllama v2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
COPY server/exllamav2_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

# Build marlin kernels
FROM kernel-builder AS marlin-kernels
ENV MARLIN_KERNELS_BRANCH=v0.3.6
ENV VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \
    cd marlin-kernels && \
    git checkout ${MARLIN_KERNELS_BRANCH} && \
    python3 setup.py bdist_wheel --dist-dir=dist

# Build moe kernels
FROM kernel-builder AS moe-kernels
ENV MOE_KERNELS_BRANCH=v0.8.2
ENV VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \
    cd moe-kernels && \
    git checkout ${MOE_KERNELS_BRANCH} && \
    python3 setup.py bdist_wheel --dist-dir=dist

FROM final AS base-copy

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# VIRTUAL_ENV and PATH already point at /app/.venv: they are set in the `base`
# stage and inherited here through `final`.

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    uv pip install grpcio-tools mypy-protobuf && \
    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
    make gen-server-raw
# Smoke test: fail the build if the server CLI cannot start.
RUN cd server && \
    pwd && \
    text-generation-server --help

# Install the kernel wheels produced by the builder stages.
RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
    uv pip install /install/*.whl

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
# Built on base-copy (not base): base is only the ROCm toolchain stage and
# contains neither the Python server nor the router/launcher binaries.
FROM base-copy AS sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
ENV HIP_FORCE_DEV_KERNARG=1

# On MI250 and MI300, performance for flash with Triton FA is slightly better than CK.
# However, Triton requires tuning for each prompt length, which is prohibitive.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
ENV ROCM_USE_CUSTOM_PAGED_ATTN=1
ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0
ENV VLLM_MOE_PADDING=0
ENV ATTENTION=paged
ENV PREFIX_CACHING=0
ENV PREFILL_CHUNKING=0
ENV ROCM_USE_SKINNY_GEMM=1

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# NOTE(review): this path hard-codes the CPython patch release (3.11.11) that
# `uv python install 3.11` is expected to provide with the pinned uv 0.5.31;
# re-check it whenever the uv image tag or PYTHON_VERSION changes.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
# CMD ["--json-output"]