doc: Update TRTLLM deployment doc. (#2960)

* doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI.

* doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI.

* fix: PR comments
Hugo Larcher 2025-01-30 18:04:42 +01:00 committed by GitHub
parent cb747b33da
commit 065aabb13d
3 changed files with 155 additions and 47 deletions


@@ -64,7 +64,7 @@ jobs:
             export runs_on="aws-g6-12xl-plus-priv-cache"
             export platform=""
             export extra_pytest=""
-            export target="nil"
+            export target=""
             ;;
           cuda-trtllm)
             export dockerfile="Dockerfile_trtllm"
@@ -74,7 +74,13 @@ jobs:
             export runs_on="ubuntu-latest"
             export platform=""
             export extra_pytest=""
-            export build_type="dev"
+            if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then
+              export build_type="release";
+              export target="";
+            else
+              export build_type="dev";
+              export target="ci-runtime";
+            fi
             ;;
           rocm)
             export dockerfile="Dockerfile_amd"
@@ -85,7 +91,7 @@
             export runs_on="ubuntu-latest"
             export platform=""
             export extra_pytest="-k test_flash_gemma_gptq_load"
-            export target="nil"
+            export target=""
             ;;
           intel-xpu)
             export dockerfile="Dockerfile_intel"
@@ -95,7 +101,7 @@
             export runs_on="ubuntu-latest"
             export platform="xpu"
             export extra_pytest=""
-            export target="nil"
+            export target=""
             ;;
           intel-cpu)
             export dockerfile="Dockerfile_intel"
@@ -106,7 +112,7 @@
             export runs_on="aws-highmemory-32-plus-priv"
             export platform="cpu"
             export extra_pytest="-k test_flash_gemma_simple"
-            export target="nil"
+            export target=""
             ;;
         esac
         echo $dockerfile
@@ -193,7 +199,7 @@ jobs:
             sccache_gha_enabled=on
             actions_cache_url=${{ env.ACTIONS_CACHE_URL }}
             actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
+          target: ${{ env.TARGET }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
           cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
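In the `cuda-trtllm` case above, the workflow now picks a release build of the default (final) image when it runs for a git tag, and otherwise keeps the faster dev build of the `ci-runtime` stage. A minimal sketch of that kind of `GITHUB_REF`-based switch, runnable locally by exporting `GITHUB_REF` yourself (note that bash only treats `refs/tags/*` as a glob when the right-hand side of `==` is unquoted):

```bash
#!/usr/bin/env bash
# Illustrative sketch only, not the workflow itself: pick build parameters
# based on whether the ref being built is a tag or a branch.
# GITHUB_REF is provided by GitHub Actions, e.g. "refs/tags/v3.0.2" or "refs/heads/main".
GITHUB_REF="${GITHUB_REF:-refs/heads/main}"

if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
    build_type="release"   # tagged release: build the full runtime image
    target=""              # empty target = the Dockerfile's default (final) stage
else
    build_type="dev"       # regular CI run: faster debug build
    target="ci-runtime"    # build only the CI-oriented stage
fi

echo "build_type=${build_type} target=${target}"
```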


@@ -123,15 +123,6 @@ COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
 COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
-FROM runtime
-LABEL co.huggingface.vendor="Hugging Face Inc."
-LABEL org.opencontainers.image.authors="hardware@hf.co"
-LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
-ENTRYPOINT ["./text-generation-launcher"]
-CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
 # This is used only for the CI/CD
 FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
@@ -152,3 +143,13 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 # Basically we copy from target/debug instead of target/release
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
+# This is the final image
+FROM runtime
+LABEL co.huggingface.vendor="Hugging Face Inc."
+LABEL org.opencontainers.image.authors="hardware@hf.co"
+LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
+ENTRYPOINT ["./text-generation-launcher"]
+CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
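With the `FROM runtime` final stage now at the end of `Dockerfile_trtllm`, a build without `--target` produces the release image, while CI can still stop at the `ci-runtime` stage. A hedged sketch of the two invocations (run from the repository root; the image tags and the trimmed-down `--build-arg` list are illustrative):

```bash
# Release-style image: no --target, so Docker builds the last stage in the file.
docker build \
    -f Dockerfile_trtllm \
    --build-arg build_type=release \
    -t tgi-trtllm:release .

# CI-style image: stop at the ci-runtime stage, which copies the debug binary.
docker build \
    -f Dockerfile_trtllm \
    --build-arg build_type=dev \
    --target ci-runtime \
    -t tgi-trtllm:ci .
```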


@@ -4,8 +4,13 @@ The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
 that uses NVIDIA's TensorRT library for inference acceleration.
 It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.
-To use the TRTLLM backend you need to compile `engines` for the models you want to use.
-Each `engine` must be compiled on the same GPU architecture that you will use for inference.
+To use the TRTLLM backend **you need to compile** `engines` for the models you want to use.
+Each `engine` must be compiled for a given set of:
+- GPU architecture that you will use for inference (e.g. A100, L40, etc.)
+- Maximum batch size
+- Maximum input length
+- Maximum output length
+- Maximum beams width
 ## Supported models
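The change above replaces the single "same GPU architecture" note with the full list of build-time limits an engine bakes in. To check what a particular engine was compiled with, you can inspect the `config.json` that TensorRT-LLM writes next to the engine files; a minimal sketch, assuming a pretty-printed config and recent key names (both can vary between TensorRT-LLM versions):

```bash
# Hypothetical path for illustration; point ENGINE_DIR at your compiled engine.
ENGINE_DIR="/tmp/engines/meta-llama/Llama-3.1-8B-Instruct"

# Each engine directory carries a config.json recording its build-time limits.
find "$ENGINE_DIR" -name config.json -exec \
    grep -HE '"max_(batch_size|input_len|seq_len|beam_width)"' {} \;
```

Requests that exceed these recorded limits cannot be served by changing runtime flags; the engine has to be recompiled.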
@@ -19,63 +24,159 @@ want to use.
 ```bash
 MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
-# Install huggingface_cli
-python -m pip install huggingface-cli[hf_transfer]
-# Login to the Hugging Face Hub
-huggingface-cli login
-# Create a directory to store the model
-mkdir -p /tmp/models/$MODEL_NAME
-# Create a directory to store the compiled engine
-mkdir -p /tmp/engines/$MODEL_NAME
-# Download the model
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME
+DESTINATION="/tmp/engines/$MODEL_NAME"
+HF_TOKEN="hf_xxx"
 # Compile the engine using Optimum-NVIDIA
-# This will create a compiled engine in the /tmp/engines/meta-llama/Llama-3.1-8B-Instruct
-# directory for 1 GPU
 docker run \
     --rm \
     -it \
     --gpus=1 \
-    -v /tmp/models/$MODEL_NAME:/model \
-    -v /tmp/engines/$MODEL_NAME:/engine \
-    huggingface/optimum-nvidia \
-    optimum-cli export trtllm \
+    --shm-size=1g \
+    -v "$DESTINATION":/engine \
+    -e HF_TOKEN=$HF_TOKEN \
+    -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+    huggingface/optimum-nvidia:v0.1.0b9-py310 \
+    bash -c "optimum-cli export trtllm \
     --tp=1 \
     --pp=1 \
-    --max-batch-size=128 \
+    --max-batch-size=64 \
     --max-input-length 4096 \
     --max-output-length 8192 \
     --max-beams-width=1 \
-    --destination /engine \
-    $MODEL_NAME
+    --destination /tmp/engine \
+    $MODEL_NAME && cp -rL /tmp/engine/* /engine/"
 ```
-Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
+Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory, in a subfolder named after the GPU used to compile the model.
 ## Using the TRTLLM backend
 Run TGI-TRTLLM Docker image with the compiled engine:
 ```bash
+MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
+DESTINATION="/tmp/engines/$MODEL_NAME"
+HF_TOKEN="hf_xxx"
 docker run \
     --gpus 1 \
+    --shm-size=1g \
     -it \
     --rm \
     -p 3000:3000 \
     -e MODEL=$MODEL_NAME \
     -e PORT=3000 \
-    -e HF_TOKEN='hf_XXX' \
-    -v /tmp/engines/$MODEL_NAME:/data \
+    -e HF_TOKEN=$HF_TOKEN \
+    -v "$DESTINATION"/<YOUR_GPU_ARCHITECTURE>/engines:/data \
     ghcr.io/huggingface/text-generation-inference:latest-trtllm \
-    --executor-worker executorWorker \
-    --model-id /data/$MODEL_NAME
+    --model-id /data/ \
+    --tokenizer-name $MODEL_NAME
 ```
 ## Development
-To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in
-`.devcontainer` directory.
+To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) with the following `.devcontainer.json` file:
+```json
+{
+    "name": "CUDA",
+    "build": {
+        "dockerfile": "Dockerfile_trtllm",
+        "context": ".."
+    },
+    "remoteEnv": {
+        "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
+        "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
+        "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
+    },
+    "customizations" : {
+        "jetbrains" : {
+            "backend" : "CLion"
+        }
+    }
+}
+```
+and `Dockerfile_trtllm`:
+```Dockerfile
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG build_type=release
+ARG ompi_version=4.1.7
+# CUDA dependent dependencies resolver stage
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    cmake \
+    curl \
+    gcc-14 \
+    g++-14 \
+    git \
+    git-lfs \
+    lld \
+    libssl-dev \
+    libucx-dev \
+    libasan8 \
+    libubsan1 \
+    ninja-build \
+    pkg-config \
+    pipx \
+    python3 \
+    python3-dev \
+    python3-setuptools \
+    tar \
+    wget --no-install-recommends && \
+    pipx ensurepath
+ENV TGI_INSTALL_PREFIX=/usr/local/tgi
+ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
+# Install OpenMPI
+FROM cuda-builder AS mpi-builder
+WORKDIR /opt/src/mpi
+ARG ompi_version
+ENV OMPI_VERSION=${ompi_version}
+ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
+ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
+    https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .
+RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
+    make -j all && \
+    make install && \
+    rm -rf ${OMPI_TARBALL_FILENAME}/..
+# Install TensorRT
+FROM cuda-builder AS trt-builder
+COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
+RUN chmod +x /opt/install_tensorrt.sh && \
+    /opt/install_tensorrt.sh
+# Build Backend
+FROM cuda-builder AS tgi-builder
+WORKDIR /usr/src/text-generation-inference
+# Scoped global args reuse
+ARG cuda_arch_list
+ARG build_type
+ARG sccache_gha_enabled
+ARG actions_cache_url
+ARG actions_runtime_token
+# Install Rust
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
+    chmod -R a+w /root/.rustup && \
+    chmod -R a+w /root/.cargo && \
+    cargo install sccache --locked
+ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
+ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
+ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"
+ENV USE_LLD_LINKER=ON
+ENV CUDA_ARCH_LIST=${cuda_arch_list}
+```