mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-19 22:02:06 +00:00
* Patch rust release. * Trying to remove the rust-toolchain hardcoded in action. * Upgrade rust toolchain. * Put back the toolchain ? * Fix neuron dockerfile. * Move to the proper version of Rust. * 1.85 since the GH action doesn't respect the override. * Typo. * Fixing the github action. * Fixing docker llamacpp. * Fixing the github action. * Update clippy.
259 lines
9.4 KiB
Docker
259 lines
9.4 KiB
Docker
# Rust builder
# `chef` is the common cargo-chef base that the `planner` and `builder` stages start from.
FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef
WORKDIR /usr/src

# Cargo configuration variable selecting the sparse protocol for the crates.io registry.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
|
|
|
|
# Planner: copy the workspace sources and let cargo-chef write the dependency
# recipe (recipe.json) that the builder stage consumes.
FROM chef AS planner

# Workspace-level manifest, lock file and toolchain pin.
COPY Cargo.lock Cargo.toml rust-toolchain.toml ./

# Source directories copied into the planner.
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher

RUN cargo chef prepare --recipe-path recipe.json
|
|
|
|
# Builder: cook the dependency recipe first, then build the workspace with the
# `release-opt` profile.
FROM chef AS builder

# Python 3.11 development package; the apt lists are removed in the same layer,
# matching the other apt-get layers in this file.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3.11-dev \
    && rm -rf /var/lib/apt/lists/*

# Install protoc 21.12 (binary + include files) into /usr/local.
# `-f` makes curl fail on an HTTP error instead of saving the error page as the zip.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Build only the dependencies described by the planner's recipe, so this layer
# can be reused while the recipe is unchanged.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

# Declared after the dependency build so that changing them does not invalidate
# the cooked-dependencies layer above.
ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher

# --frozen: build strictly from the copied Cargo.lock.
RUN cargo build --profile release-opt --frozen
|
|
|
|
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
WORKDIR /usr/src/

# NOTE: When updating PyTorch version, beware to remove the `uv pip install nvidia-nccl-cu12==...` pin below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.6
ARG PYTHON_VERSION=3.11

# Keep in sync with `server/pyproject.toml`

# Automatically set by buildx
ARG TARGETPLATFORM

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*

# uv binaries come from the pinned 0.5.31 image.
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"

# Install the requested Python, create the virtual environment in the working
# directory (/usr/src/.venv) and install torch plus basic packaging tools into it.
RUN uv python install ${PYTHON_VERSION}
RUN uv venv --python ${PYTHON_VERSION} \
    && uv pip install torch==${PYTORCH_VERSION} pip setuptools packaging

ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"
|
|
|
|
# CUDA kernels builder image
# Common parent of the kernel build stages below: adds the build tools and the
# default CUDA architecture list on top of `pytorch-install`.
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        cmake \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build a specific version of flash attention, with the virtual environment activated.
RUN . .venv/bin/activate \
    && make build-flash-attention
|
|
|
|
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build a specific version of flash attention v2, with the virtual environment activated.
RUN . .venv/bin/activate \
    && make build-flash-attention-v2-cuda
|
|
|
|
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src

# The kernel sources (including setup.py) are copied straight into the working directory.
COPY server/exllama_kernels/ .

RUN . .venv/bin/activate \
    && python setup.py build
|
|
|
|
# Build exllamav2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src

# Makefile-exllamav2 is a single file, so the source path carries no trailing slash.
COPY server/Makefile-exllamav2 Makefile

# Run the `build-exllamav2` make target with the virtual environment activated.
RUN . .venv/bin/activate && make build-exllamav2
|
|
|
|
# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src

COPY server/Makefile-awq Makefile

# Run the `build-awq` make target with the virtual environment activated.
RUN . .venv/bin/activate \
    && make build-awq
|
|
|
|
# Build Lorax Punica kernels
FROM kernel-builder AS lorax-punica-builder
WORKDIR /usr/src

COPY server/Makefile-lorax-punica Makefile

# Run the `build-lorax-punica` make target with the virtual environment activated.
# TORCH_CUDA_ARCH_LIST is overridden for this one command only ("8.0;8.6+PTX").
RUN . .venv/bin/activate \
    && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
|
|
|
|
# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src

# The kernel sources (including setup.py) are copied straight into the working directory.
COPY server/custom_kernels/ .

# Build the custom kernels with the virtual environment activated.
RUN . .venv/bin/activate \
    && python setup.py build
|
|
|
|
# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src

COPY server/Makefile-selective-scan Makefile

# Run the `build-all` make target with the virtual environment activated.
RUN . .venv/bin/activate \
    && make build-all
|
|
|
|
# Build flashinfer
FROM kernel-builder AS flashinfer-builder
WORKDIR /usr/src

COPY server/Makefile-flashinfer Makefile

# Run the `install-flashinfer` make target with the virtual environment activated.
RUN . .venv/bin/activate \
    && make install-flashinfer
|
|
|
|
# Text Generation Inference base image
FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        libssl-dev \
        make \
    && rm -rf /var/lib/apt/lists/*

# Install uv with the upstream installer script, then add /root/.local/bin to PATH.
# NOTE(review): no uv version is pinned here, whereas the `pytorch-install` stage
# copies uv from the 0.5.31 image — confirm the mismatch is intended.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="$PATH:/root/.local/bin"

# Install flash-attention dependencies
# RUN pip install einops --no-cache-dir

# Copy env with PyTorch installed
COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv

# Install the matching Python with uv in this stage and point the environment
# variables at the copied virtual environment.
ENV PYTHON_VERSION=3.11
RUN uv python install ${PYTHON_VERSION}
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"
|
|
|
|
# Install server
# `COPY server server` already brings in server/Makefile, so it is not copied a second time.
COPY proto proto
COPY server server
ENV HF_KERNELS_CACHE=/kernels

# First pass: sync the locked dependencies without installing the project itself
# (--no-install-project), run the `gen-server-raw` make target, then run
# `hf-kernels download` on the server directory.
RUN cd server && \
    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project --active && \
    make gen-server-raw && \
    hf-kernels download .

# Second pass: sync again including the project, install the pinned NCCL wheel
# (see the NOTE next to PYTORCH_VERSION), and check that the
# `text-generation-server` entry point runs.
RUN cd server && \
    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \
    uv pip install nvidia-nccl-cu12==2.25.1 && \
    text-generation-server --help
|
|
|
|
# All kernel build outputs below are copied into the virtual environment's
# Python 3.11 site-packages directory.

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from mamba builder (mamba and causal-conv1d)
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages

# Copy the installed flashinfer package from the flashinfer builder
COPY --from=flashinfer-builder /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ /usr/src/.venv/lib/python3.11/site-packages/flashinfer/
|
|
|
|
|
|
# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
# Required to find libpython within the rust binaries
# NOTE(review): the comment above is not attached to any instruction here;
# presumably it refers to the LD_LIBRARY_PATH set in the final stage — confirm.

# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# Install the Rust binaries built by the `builder` stage into /usr/local/bin.
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
|
|
|
|
|
|
# AWS Sagemaker compatible image
FROM base AS sagemaker

# Copy the entrypoint into the working directory inherited from `base` (/usr/src)
# and make it executable in the same layer, instead of a separate `RUN chmod`.
COPY --chmod=0755 sagemaker-entrypoint.sh entrypoint.sh

# Absolute path, so the entrypoint resolves even if the container is started
# with a different working directory.
ENTRYPOINT ["/usr/src/entrypoint.sh"]
|
|
|
|
# Final image
FROM base

# Copy the entrypoint and make it executable in the same layer, instead of a
# separate `RUN chmod`.
COPY --chmod=0755 ./tgi-entrypoint.sh /tgi-entrypoint.sh

# Adds the uv-managed CPython lib directory to the dynamic loader search path.
# NOTE(review): the path hard-codes the 3.11.11 patch release, while the stages
# above only request Python 3.11 — confirm it matches what `uv python install`
# actually installs whenever uv or Python is updated.
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/"

ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]
|