mirror of
				https://github.com/huggingface/text-generation-inference.git
				synced 2025-10-20 20:35:24 +00:00 
			
		
		
		
	* update dockerfile * add updated makefile * fix docker * Lint. --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
		
			
				
	
	
		
			315 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			315 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
# Rust builder
# cargo-chef image with a pinned Rust toolchain; parent of the planner and
# builder stages.
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner: reads the workspace manifests and sources and writes recipe.json,
# which the builder stage copies in to pre-build the dependencies.
FROM chef AS planner
COPY Cargo.lock Cargo.toml rust-toolchain.toml ./
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
 | |
| 
 | |
# Builder: compiles the Rust workspace with the release-opt profile. Later
# stages copy the benchmark, router and launcher binaries out of
# /usr/src/target/release-opt.
FROM chef AS builder

# Python development headers (presumably required by crates that link against
# Python -- confirm). The apt lists are removed in the same layer that
# downloads them so they never persist in a layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3.11-dev \
    && rm -rf /var/lib/apt/lists/*

# Install protoc 21.12 (the binary and its bundled include files) under
# /usr/local. `-f` makes curl exit non-zero on an HTTP error instead of saving
# the error page under the archive's name.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Build only the dependencies described by the recipe; this layer is reused
# until recipe.json changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Declared after the dependency build so that changing them does not
# invalidate the cached dependency layer.
ARG GIT_SHA
ARG DOCKER_LABEL

# Full sources, then the actual build against the locked dependency set.
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen
 | |
| 
 | |
# ROCm base image: system build tools, a uv-managed Python virtualenv in
# /app/.venv and the pinned revisions/repositories used by the build_* stages.
FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base

# Pinned upstream revisions and repositories. The stages that consume them
# redeclare the matching ARG.
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
# No trailing ':' -- an empty LD_LIBRARY_PATH entry makes the dynamic loader
# search the current working directory for shared libraries.
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib
# GPU architectures to build for; exported as ENV so the RUN steps of the
# derived stages can read it.
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

ARG PYTHON_VERSION=3.11

# WORKDIR creates /app when it does not exist yet.
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        ninja-build \
        python3.11-dev \
        python3.11-venv \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
# `uv venv` creates .venv in the working directory, i.e. /app/.venv.
RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
# Point at the virtualenv created above (the working directory is /app, not
# /usr/src).
ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"

# Python-side build tooling shared by the build stages.
RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
 | |
| 
 | |
# Builds hipBLAS-common and hipBLASLt at their pinned revisions and gathers
# the resulting .deb packages in /app/install.
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION

# hipBLAS-common: package it and install the package into this stage before
# hipBLASLt is built.
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN . .venv/bin/activate && \
    cd hipBLAS-common && \
    git checkout ${HIPBLAS_COMMON_BRANCH} && \
    mkdir build && \
    cd build && \
    cmake .. && \
    make package && \
    dpkg -i ./*.deb

# hipBLASLt for the architectures listed in PYTORCH_ROCM_ARCH.
RUN git clone https://github.com/ROCm/hipBLASLt
RUN . .venv/bin/activate && \
    cd hipBLASLt && \
    git checkout ${HIPBLASLT_BRANCH} && \
    ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} && \
    cd build/release && \
    make package

# Single pick-up directory for the stage that installs these packages.
RUN mkdir -p /app/install && \
    cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
 | |
| 
 | |
# Builds RCCL at the pinned revision and places its .deb packages in
# /app/install.
FROM base AS build_rccl
ARG RCCL_BRANCH
ARG RCCL_REPO

RUN git clone ${RCCL_REPO}
RUN . .venv/bin/activate && \
    cd rccl && \
    git checkout ${RCCL_BRANCH} && \
    ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}

RUN mkdir -p /app/install && \
    cp /app/rccl/build/release/*.deb /app/install
 | |
| 
 | |
# Builds a Triton wheel from the pinned revision and places it in
# /app/install.
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO

RUN git clone ${TRITON_REPO}
RUN . .venv/bin/activate && \
    cd triton && \
    git checkout ${TRITON_BRANCH} && \
    cd python && \
    python3 setup.py bdist_wheel --dist-dir=dist

RUN mkdir -p /app/install && \
    cp /app/triton/python/dist/*.whl /app/install
 | |
| 
 | |
# Builds a wheel from the amd_smi sources under /opt/rocm/share/amd_smi and
# places it in /app/install.
FROM base AS build_amdsmi

RUN . .venv/bin/activate && \
    cd /opt/rocm/share/amd_smi && \
    pip wheel . --wheel-dir=dist

RUN mkdir -p /app/install && \
    cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
 | |
| 
 | |
# Builds PyTorch, torchvision and flash-attention wheels from their pinned
# revisions and places all of them in /app/install.
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO

# PyTorch: the wheel is also installed into this stage's venv because the
# vision and flash-attention builds below run in the same environment.
RUN git clone ${PYTORCH_REPO} pytorch
RUN . .venv/bin/activate && \
    cd pytorch && \
    git checkout ${PYTORCH_BRANCH} && \
    pip install -r requirements.txt && \
    git submodule update --init --recursive && \
    python3 tools/amd_build/build_amd.py && \
    CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist && \
    pip install dist/*.whl

# torchvision
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN . .venv/bin/activate && \
    cd vision && \
    git checkout ${PYTORCH_VISION_BRANCH} && \
    python3 setup.py bdist_wheel --dist-dir=dist && \
    pip install dist/*.whl

# flash-attention, built for the architectures in PYTORCH_ROCM_ARCH.
RUN git clone ${FA_REPO}
RUN . .venv/bin/activate && \
    cd flash-attention && \
    git checkout ${FA_BRANCH} && \
    git submodule update --init && \
    MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist

RUN mkdir -p /app/install && \
    cp /app/pytorch/dist/*.whl /app/install && \
    cp /app/vision/dist/*.whl /app/install && \
    cp /app/flash-attention/dist/*.whl /app/install
 | |
| 
 | |
# `final`: base plus the packages and wheels produced by the build_* stages,
# plus aiter. Each artifact directory is bind-mounted from its build stage, so
# the files themselves are not copied into a layer.
FROM base AS final

# Install the rebuilt hipBLASLt / hipBLAS-common packages, then edit
# /var/lib/dpkg/status to remove the hipblaslt(-dev) entries that precede
# hipcub-dev / hipfft in the recorded dependency lists.
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb && \
    sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status && \
    sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status

# Same procedure for RCCL: install, then remove the rccl(-dev) entries that
# precede rocalution in the recorded dependency lists.
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
    dpkg -i /install/*deb && \
    sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status && \
    sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status

# Python wheels: Triton, amdsmi, then PyTorch / vision / flash-attention.
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    . .venv/bin/activate \
    && pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    . .venv/bin/activate \
    && pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    . .venv/bin/activate \
    && pip install /install/*.whl

# aiter is installed in development mode from a checkout kept in the image,
# with kernels prebuilt for gfx942 only.
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN . .venv/bin/activate && \
    cd aiter && \
    git checkout ${AITER_BRANCH} && \
    git submodule update --init --recursive && \
    pip install -r requirements.txt && \
    PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && \
    pip show aiter

RUN rm -rf /var/lib/apt/lists/*
 | |
| 
 | |
# Common parent of every kernel build stage.
FROM final AS kernel-builder

# Build vllm kernels
FROM kernel-builder AS vllm-builder

COPY server/Makefile-vllm Makefile
RUN . .venv/bin/activate \
    && pip install setuptools_scm

# Build specific version of vllm
RUN . .venv/bin/activate \
    && make build-vllm-rocm
 | |
| 
 | |
# Build Transformers CUDA kernels (gpt-neox and bloom)
# The wheel is written to /app/dist.
FROM kernel-builder AS custom-kernels-builder
COPY server/custom_kernels/ .
RUN . .venv/bin/activate \
    && python3 setup.py bdist_wheel --dist-dir=dist
 | |
| 
 | |
# Build exllama kernels
# The wheel is written to /app/dist.
FROM kernel-builder AS exllama-kernels-builder
COPY server/exllama_kernels/ .
RUN . .venv/bin/activate \
    && python3 setup.py bdist_wheel --dist-dir=dist
 | |
| 
 | |
# Build exllama v2 kernels
# The wheel is written to /app/dist.
FROM kernel-builder AS exllamav2-kernels-builder
COPY server/exllamav2_kernels/ .
RUN . .venv/bin/activate \
    && python3 setup.py bdist_wheel --dist-dir=dist
 | |
| 
 | |
# Build the marlin-kernels wheel from the pinned tag; the wheel is written to
# /app/marlin-kernels/dist.
FROM kernel-builder AS marlin-kernels
ENV MARLIN_KERNELS_BRANCH=v0.3.6 \
    VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate \
    && git clone https://github.com/danieldk/marlin-kernels.git \
    && cd marlin-kernels \
    && git checkout ${MARLIN_KERNELS_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist
 | |
| 
 | |
# Build the moe-kernels wheel from the pinned tag; the wheel is written to
# /app/moe-kernels/dist.
FROM kernel-builder AS moe-kernels
ENV MOE_KERNELS_BRANCH=v0.8.2 \
    VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate \
    && git clone https://github.com/danieldk/moe-kernels.git \
    && cd moe-kernels \
    && git checkout ${MOE_KERNELS_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist
 | |
| 
 | |
# `base-copy`: the `final` stage plus the Python server, the kernel wheels and
# the Rust binaries. The default image at the end of the file derives from it.
FROM final AS base-copy

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    uv pip install grpcio-tools mypy-protobuf && \
    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
    make gen-server-raw
# Smoke test: the build fails here if the server entry point cannot be run.
RUN cd server && \
    pwd && \
    text-generation-server --help

# Kernel wheels, bind-mounted from their builder stages. Each builder is
# installed exactly once.
RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
    uv pip install /install/*.whl

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
 | |
| 
 | |
# AWS Sagemaker compatible image
# Derived from base-copy, the stage in which the server, router and launcher
# are installed; `base` only provides the ROCm toolchain and the virtualenv.
# NOTE(review): assumes sagemaker-entrypoint.sh starts one of those binaries --
# confirm against the script. The ENV settings of the default final stage are
# not inherited by this stage.
FROM base-copy AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
 | |
| 
 | |
# Final image
FROM base-copy

# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
ENV HIP_FORCE_DEV_KERNARG=1

# On MI250 and MI300, flash attention performs slightly better with Triton FA
# than with CK. However, Triton requires tuning for each prompt length, which
# is prohibitive.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0 \
    ROCM_USE_CUSTOM_PAGED_ATTN=1 \
    PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0 \
    VLLM_MOE_PADDING=0 \
    ATTENTION=paged \
    PREFIX_CACHING=0 \
    PREFILL_CHUNKING=0 \
    ROCM_USE_SKINNY_GEMM=1

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
# Adds the lib directory of a uv-managed CPython 3.11.11 to the loader path.
# NOTE(review): the path hard-codes the interpreter patch version -- confirm it
# matches what `uv python install` provides.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
# CMD ["--json-output"]
 |