Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-07-20 06:40:19 +00:00
Give TensorRT-LLM a proper CI/CD 😍 (#2886)
* test(ctest) enable address sanitizer
* feat(trtllm): expose finish reason to Rust
* feat(trtllm): fix logits retrieval
* misc(ci): enable building tensorrt-llm
* misc(ci): update Rust action toolchain
* misc(ci): let's try to build the Dockerfile for trtllm # Conflicts: # Dockerfile_trtllm
* misc(ci): provide mechanism to cache inside container
* misc(ci): export aws creds as output of step
* misc(ci): let's try this way
* misc(ci): again
* misc(ci): again
* misc(ci): add debug profile
* misc(ci): add debug profile
* misc(ci): lets actually use sccache ...
* misc(ci): do not build with ssl enabled
* misc(ci): WAT
* misc(ci): WAT
* misc(ci): WAT
* misc(ci): WAT
* misc(ci): WAT
* misc(backend): test with TGI S3 conf
* misc(backend): test with TGI S3 conf
* misc(backend): once more?
* misc(backend): let's try with GHA
* misc(backend): missing env directive
* misc(backend): make sure to correctly set IS_GHA_BUILD=true in wf
* misc(backend): ok let's debug smtg
* misc(backend): WWWWWWWWWWWWWAAAAAAAA
* misc(backend): kthxbye retry s3
* misc(backend): use session token
* misc(backend): add more info
* misc(backend): lets try 1h30
* misc(backend): lets try 1h30
* misc(backend): increase to 2h
* misc(backend): lets try...
* misc(backend): lets try...
* misc(backend): let's build for ci-runtime
* misc(backend): let's add some more tooling
* misc(backend): add some tags
* misc(backend): disable Werror for now
* misc(backend): added automatic gha detection
* misc(backend): remove leak sanitizer which is included in asan
* misc(backend): forward env
* misc(backend): forward env
* misc(backend): let's try
* misc(backend): let's try
* misc(backend): again
* misc(backend): again
* misc(backend): again
* misc(backend): again
* misc(backend): again
* misc(backend): fix sscache -> sccache
* misc(backend): fix sscache -> sccache
* misc(backend): fix sscache -> sccache
* misc(backend): let's actually cache things now
* misc(backend): let's actually cache things now
* misc(backend): attempt to run the testS?
* misc(backend): attempt to run the tests?
* misc(backend): attempt to run the tests?
* change runner size
* fix: Correctly tag docker images (#2878)
* fix: Correctly tag docker images
* fix: Correctly tag docker images
* misc(llamacpp): maybe?
* misc(llamacpp): maybe?
* misc(llamacpp): maybe?
* misc(ci): gogogo
* misc(ci): gogogo
* misc(ci): gogogo
* misc(ci): gogogo
* misc(ci): gogogo
* misc(ci): gogogo
* misc(ci): go
* misc(ci): go
* misc(ci): go
* misc(ci): use bin folder
* misc(ci): make the wf callable for reuse
* misc(ci): make the wf callable for reuse (bis)
* misc(ci): make the wf callable for reuse (bis)
* misc(ci): give the wf a name
* Create test-trtllm.yml
* Update test-trtllm.yml
* Create build-trtllm2
* Rename build-trtllm2 to 1-build-trtllm2
* Rename test-trtllm.yml to 1-test-trtllm2.yml
* misc(ci): fw secrets
* Update 1-test-trtllm2.yml
* Rename 1-build-trtllm2 to 1-build-trtllm2.yml
* Update 1-test-trtllm2.yml
* misc(ci): use ci-build.yaml as main dispatcher
* Delete .github/workflows/1-test-trtllm2.yml
* Delete .github/workflows/1-build-trtllm2.yml
* misc(ci): rights?
* misc(ci): rights?
* misc(ci): once more?
* misc(ci): once more?
* misc(ci): baby more time?
* misc(ci): baby more time?
* misc(ci): try the permission above again?
* misc(ci): try the permission above again?
* misc(ci): try the permission scoped again?
* misc(ci): install tensorrt_llm_executor_static
* misc(ci): attempt to rebuild with sccache?
* misc(ci): run the tests on GPU instance
* misc(ci): let's actually setup sccache in the build.rs
* misc(ci): reintroduce variables
* misc(ci): enforce sccache
* misc(ci): correct right job name dependency
* misc(ci): detect dev profile for debug
* misc(ci): detect gha build
* misc(ci): detect gha build
* misc(ci): ok debug
* misc(ci): wtf
* misc(ci): wtf2
* misc(ci): wtf3
* misc(ci): use commit HEAD instead of merge commit for image id
* misc(ci): wtfinfini
* misc(ci): wtfinfini
* misc(ci): KAMEHAMEHA
* Merge TRTLLM in standard CI
* misc(ci): remove input machine
* misc(ci): missing id-token for AWS auth
* misc(ci): missing id-token for AWS auth
* misc(ci): missing id-token for AWS auth
* misc(ci): again...
* misc(ci): again...
* misc(ci): again...
* misc(ci): again...
* misc(ci): missing benchmark
* misc(ci): missing backends
* misc(ci): missing launcher
* misc(ci): give everything aws needs
* misc(ci): give everything aws needs
* misc(ci): fix warnings
* misc(ci): attempt to fix sccache not building trtllm
* misc(ci): attempt to fix sccache not building trtllm again

---------

Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>
Co-authored-by: Pauline Bailly-Masson <155966238+paulinebm@users.noreply.github.com>
This commit is contained in:
parent: b980848abf
commit: 17367438f3
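Most of the iteration recorded in the commit list is about wiring an S3-backed sccache into the container build: the workflow assumes an AWS role, exports short-lived credentials as step outputs, and forwards them to docker build as build args. For orientation before the diff, here is a minimal sketch of the environment contract sccache's S3 backend expects; the bucket and credential values are placeholders, only the region and key prefix match what this PR actually exports:

import os

# Variables sccache reads for S3-backed compilation caching.
# Values are illustrative placeholders, not the repository's real settings.
SCCACHE_S3_ENV = {
    "AWS_ACCESS_KEY_ID": "<short-lived key>",
    "AWS_SECRET_ACCESS_KEY": "<short-lived secret>",
    "AWS_SESSION_TOKEN": "<short-lived session token>",
    "SCCACHE_BUCKET": "<ci-cache-bucket>",
    "SCCACHE_REGION": "us-east-1",
    "SCCACHE_S3_KEY_PREFIX": "trtllm",
}

missing = [name for name in SCCACHE_S3_ENV if not os.environ.get(name)]
if missing:
    # Without a bucket configuration, sccache falls back to its local disk cache.
    print("S3 cache not fully configured, missing:", ", ".join(missing))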
.github/workflows/build.yaml (vendored) · 54 changes

@@ -31,15 +31,28 @@ jobs:
     group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
     cancel-in-progress: true
   runs-on:
-    group: aws-highmemory-32-plus-priv
+    group: aws-highmemory-64-plus-priv
   permissions:
     contents: write
     packages: write
+    id-token: write
   steps:
     - name: Checkout repository
       uses: actions/checkout@v4
     - name: Inject slug/short variables
       uses: rlespinasse/github-slug-action@v4.4.1
+    - name: Extract TensorRT-LLM version
+      run: |
+        echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV
+        echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}"
+    - name: "Configure AWS Credentials"
+      id: aws-creds
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-east-1
+        role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }}
+        role-duration-seconds: 7200
+        output-credentials: true
     - name: Construct harware variables
       shell: bash
       run: |
@@ -52,6 +65,7 @@ jobs:
             export runs_on="aws-g6-12xl-plus-priv-cache"
             export platform=""
             export extra_pytest=""
+            export target="nil"
             ;;
           cuda-trtllm)
             export dockerfile="Dockerfile_trtllm"
@@ -61,6 +75,10 @@ jobs:
             export runs_on="ubuntu-latest"
             export platform=""
             export extra_pytest=""
+            export target="ci-runtime"
+            export sccache_s3_key_prefix="trtllm"
+            export sccache_region="us-east-1"
+            export build_type="dev"
             ;;
           rocm)
             export dockerfile="Dockerfile_amd"
@@ -71,6 +89,7 @@ jobs:
             export runs_on="ubuntu-latest"
             export platform=""
             export extra_pytest="-k test_flash_gemma_gptq_load"
+            export target="nil"
             ;;
           intel-xpu)
             export dockerfile="Dockerfile_intel"
@@ -80,6 +99,7 @@ jobs:
             export runs_on="ubuntu-latest"
             export platform="xpu"
             export extra_pytest=""
+            export target="nil"
             ;;
           intel-cpu)
             export dockerfile="Dockerfile_intel"
@@ -90,6 +110,7 @@ jobs:
             export runs_on="aws-highmemory-32-plus-priv"
             export platform="cpu"
             export extra_pytest="-k test_flash_gemma_simple"
+            export target="nil"
             ;;
         esac
         echo $dockerfile
@@ -106,6 +127,10 @@ jobs:
         echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
         echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV
         echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
+        echo "TARGET=${target}" >> $GITHUB_ENV
+        echo "SCCACHE_S3_KEY_PREFIX=${sccache_s3_key_prefix}" >> $GITHUB_ENV
+        echo "SCCACHE_REGION=${sccache_region}" >> $GITHUB_ENV
+        echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV
     - name: Initialize Docker Buildx
       uses: docker/setup-buildx-action@v3
       with:
@@ -170,6 +195,14 @@ jobs:
           GIT_SHA=${{ env.GITHUB_SHA }}
           DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
           PLATFORM=${{ env.PLATFORM }}
+          build_type=${{ env.BUILD_TYPE }}
+          is_gha_build=true
+          aws_access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }}
+          aws_secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }}
+          aws_session_token=${{ steps.aws-creds.outputs.aws-session-token }}
+          sccache_bucket=${{ secrets.AWS_S3_BUCKET_GITHUB_TGI_TEST }}
+          sccache_s3_key_prefix=${{ env.SCCACHE_S3_KEY_PREFIX }}
+          sccache_region=${{ env.SCCACHE_REGION }}
         tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
         labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
         cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
@@ -215,3 +248,22 @@ jobs:
         echo $DOCKER_IMAGE
         docker pull $DOCKER_IMAGE
         pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+
+  backend_trtllm_cxx_tests:
+    needs: build-and-push
+    if: needs.build-and-push.outputs.label == '-trtllm'
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on:
+      group: aws-g6-12xl-plus-priv-cache
+    container:
+      image: ${{ needs.build-and-push.outputs.docker_image }}
+      credentials:
+        username: ${{ secrets.REGISTRY_USERNAME }}
+        password: ${{ secrets.REGISTRY_PASSWORD }}
+      options: --gpus all --shm-size=8g
+
+    steps:
+      - name: Run C++/CUDA tests
+        run: /usr/local/tgi/bin/tgi_trtllm_backend_tests
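The new "Extract TensorRT-LLM version" step above greps a 40-character pinned revision out of backends/trtllm/cmake/trtllm.cmake. An equivalent of that extraction in Python, shown only to make the regex's intent explicit (the character class [a-z,0-9] is copied verbatim from the workflow and technically admits commas as well as lowercase hex characters):

import re
from pathlib import Path

# Same pattern as the workflow's: grep -oP '([a-z,0-9]{40})'
PINNED_REVISION = re.compile(r"[a-z,0-9]{40}")

def tensorrt_llm_version(trtllm_cmake: Path) -> str | None:
    # Returns the first 40-character token, e.g. a pinned git commit SHA.
    match = PINNED_REVISION.search(trtllm_cmake.read_text())
    return match.group(0) if match else None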
.github/workflows/ci_build.yaml (vendored) · 1 change

@@ -42,6 +42,7 @@ jobs:
     permissions:
       contents: write
       packages: write
+      id-token: write
     with:
       hardware: ${{ matrix.hardware }}
       # https://github.com/actions/runner/issues/2206
Cargo.toml · 28 changes

@@ -1,21 +1,21 @@
 [workspace]
 members = [
     "benchmark",
     "backends/v2",
     "backends/v3",
     "backends/grpc-metadata",
     "backends/trtllm",
     "launcher",
     "router"
 ]
 default-members = [
     "benchmark",
     "backends/v2",
     "backends/v3",
     "backends/grpc-metadata",
     # "backends/trtllm",
     "launcher",
     "router"
 ]
 resolver = "2"
Dockerfile_trtllm

@@ -1,19 +1,7 @@
-ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
-ARG OMPI_VERSION="4.1.7rc1"
-
-# Build dependencies resolver stage
-FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef
-WORKDIR /usr/src/text-generation-inference/backends/trtllm
-
-FROM chef AS planner
-COPY Cargo.lock Cargo.lock
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY router router
-COPY benchmark/ benchmark/
-COPY backends/ backends/
-COPY launcher/ launcher/
-RUN cargo chef prepare --recipe-path recipe.json
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG ompi_version="4.1.7rc1"
+ARG build_type=release
+ARG is_gha_build=false
 
 # CUDA dependent dependencies resolver stage
 FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
@@ -26,8 +14,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     g++-14 \
     git \
     git-lfs \
+    lld \
     libssl-dev \
     libucx-dev \
+    libasan8 \
+    libubsan1 \
     ninja-build \
     pkg-config \
     pipx \
@@ -43,9 +34,9 @@ ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
 
 # Install OpenMPI
 FROM cuda-builder AS mpi-builder
-ARG OMPI_VERSION
+ARG ompi_version
 
-ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2"
+ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2"
 RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \
     mkdir /usr/src/mpi && \
     tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
@@ -65,34 +56,56 @@ RUN chmod +x /opt/install_tensorrt.sh && \
 FROM cuda-builder AS tgi-builder
 WORKDIR /usr/src/text-generation-inference
 
+# Scoped global args reuse
+ARG is_gha_build
+ARG build_type
+
 # Install Rust
+ENV PATH="/root/.cargo/bin:$PATH"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
     chmod -R a+w /root/.rustup && \
-    chmod -R a+w /root/.cargo
+    chmod -R a+w /root/.cargo && \
+    cargo install sccache --locked
 
-ENV PATH="/root/.cargo/bin:$PATH"
-RUN cargo install cargo-chef
+# SCCACHE Specifics args - before finding a better, more generic, way...
+ARG aws_access_key_id
+ARG aws_secret_access_key
+ARG aws_session_token
+ARG sccache_bucket
+ARG sccache_s3_key_prefix
+ARG sccache_region
 
-# Cache dependencies
-COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json .
-RUN cargo chef cook --release --recipe-path recipe.json
+ENV AWS_ACCESS_KEY_ID=$aws_access_key_id
+ENV AWS_SECRET_ACCESS_KEY=$aws_secret_access_key
+ENV AWS_SESSION_TOKEN=$aws_session_token
+ENV SCCACHE_BUCKET=$sccache_bucket
+ENV SCCACHE_S3_KEY_PREFIX=$sccache_s3_key_prefix
+ENV SCCACHE_REGION=$sccache_region
 
-# Build actual TGI
-ARG CUDA_ARCH_LIST
-ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH"
 ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
 ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH"
+ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH"
+
+ENV USE_LLD_LINKER=ON
+ENV CUDA_ARCH_LIST=${cuda_arch_list}
+ENV IS_GHA_BUILD=${is_gha_build}
 
 COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY router router
-COPY backends/trtllm backends/trtllm
+COPY backends backends
+COPY benchmark benchmark
+COPY launcher launcher
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 
 RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
-    cd backends/trtllm && \
-    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
+    python3 backends/trtllm/scripts/setup_sccache.py --is-gha-build ${is_gha_build} && \
+    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX \
+    RUSTC_WRAPPER=sccache \
+    cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
+    sccache --show-stats
 
 FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
 RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
@@ -116,6 +129,28 @@ FROM runtime
 
 LABEL co.huggingface.vendor="Hugging Face Inc."
 LABEL org.opencontainers.image.authors="hardware@hf.co"
+LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
 
 ENTRYPOINT ["./text-generation-launcher"]
 CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
+
+# This is used only for the CI/CD
+FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
+RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
+    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
+    pipx ensurepath && \
+    pipx install --include-deps transformers tokenizers
+
+WORKDIR /usr/local/tgi/bin
+
+ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
+ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
+
+COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
+COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
+COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
+
+# Basically we copy from target/debug instead of target/release
+COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
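The Dockerfile now ends with two final stages: the regular runtime image and a ci-runtime image that keeps the sanitizer runtimes (libasan8, libubsan1) and ships the binary from target/debug. This pairs with the workflow's target="ci-runtime" and build_type="dev" settings for cuda-trtllm, since cargo's dev profile writes its artifacts to target/debug. A summary table in Python; the dict is our editorial restatement, not code from the repository:

# Docker build target -> (cargo profile, artifact directory),
# as wired by the workflow and the Dockerfile stages above.
TARGET_TO_PROFILE = {
    "runtime": ("release", "target/release"),  # production image
    "ci-runtime": ("dev", "target/debug"),     # CI image with ASan/UBSan libraries
}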
backends/trtllm/CMakeLists.txt

@@ -1,11 +1,18 @@
 cmake_minimum_required(VERSION 3.20)
 
-if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
     find_program(CCACHE_EXECUTABLE "ccache")
     if (CCACHE_EXECUTABLE)
         message(STATUS "Using ccache")
-        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
+        set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}")
+        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}")
+        set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}")
     endif ()
+else ()
+    message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}")
 endif ()
 
 if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
@@ -21,28 +28,31 @@ include(CheckCXXCompilerFlag)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
 option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
+option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF)
 set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
-set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
+set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path rgo where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 
 # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
 find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+find_package(MPI REQUIRED)
 
 #### External dependencies ####
 include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 
-if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(TGI_TRTLLM_BACKEND_DEBUG ON)
     add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
-endif()
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
+endif ()
 
-# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
-check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
-if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
-    set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
-endif()
+if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD})
+    message(STATUS "Using lld linker")
+    add_link_options("-fuse-ld=lld")
+endif ()
 
 # Let's build TRTLLM as part of CMake
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
@@ -55,51 +65,68 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
         # $<INSTALL_INTERFACE:csrc>
 )
 target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
 target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
-if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
-else()
-    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
-endif ()
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
 
 # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
-install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
-install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+install(TARGETS tgi_trtllm_backend_impl)
+install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
+install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
+if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
+    install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
+endif ()
 
 #### Unit Tests ####
-if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
+if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
     message(STATUS "Building tests")
+    option(TGI_TRTLLM_BACKEND_ENABLE_ASAN "Enable AddressSanitizer")
+    option(TGI_TRTLLM_BACKEND_ENABLE_UBSAN "Enable UndefinedSanitizer")
+
     FetchContent_Declare(
             Catch2
             URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
     )
     FetchContent_MakeAvailable(Catch2)
 
+    # This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
+    check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
+    if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
+        message(STATUS "Enabling non-NVRO detection")
+        target_compile_options(tgi_trtllm_backend_impl "-Wnvro")
+    endif ()
+
+    cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
+    message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")
+
     add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
+
+    # target_compile_options(tgi_trtllm_backend_tests PRIVATE -Werror)
+    target_link_directories(tgi_trtllm_backend_tests PRIVATE "${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")
     target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
     target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
     target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
     target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
 
-    if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
-    else()
-        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
+    if (${TGI_TRTLLM_BACKEND_ENABLE_ASAN})
+        message(STATUS "Enabled AddressSanitizer")
+        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=address)
     endif ()
 
-    if(CMAKE_BUILD_TYPE MATCHES "Debug")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
-        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
-    endif()
+    if (${TGI_TRTLLM_BACKEND_ENABLE_UBSAN})
+        message(STATUS "Enabled UndefinedSanitizer")
+        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined)
+    endif ()
 
-    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
-    include(CTest)
-    include(Catch)
-    catch_discover_tests(tgi_trtllm_backend_tests)
+    install(TARGETS tgi_trtllm_backend_tests)
+
+    # list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
+    # include(CTest)
+    # include(Catch)
+    # catch_discover_tests(tgi_trtllm_backend_tests)
 endif ()
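CMakeLists.txt now honours a compiler launcher supplied from outside (for instance the sccache wrapper that build.rs re-exports) for C, C++ and CUDA alike, and only falls back to ccache discovery when none is given. The selection order, restated as a small Python sketch; the function is ours, for illustration only:

def compiler_launcher(user_launcher: str | None, ccache_on_path: bool) -> str | None:
    # Mirrors the CMake block above: an explicit CMAKE_CXX_COMPILER_LAUNCHER
    # wins and is reused for C and CUDA; otherwise ccache is used if found.
    if user_launcher is not None:
        return user_launcher
    return "ccache" if ccache_on_path else None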
backends/trtllm/build.rs

@@ -3,6 +3,7 @@ use pkg_config;
 use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
+use std::sync::LazyLock;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
@@ -12,12 +13,20 @@ const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
 const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");
 
+const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
+    option_env!("IS_GHA_BUILD").map_or(false, |value| match value.to_lowercase().as_str() {
+        "on" => true,
+        "true" => true,
+        "1" => true,
+        _ => false,
+    })
+});
+
 // Dependencies
-const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"];
+const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
 const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
     ("dylib", "tensorrt_llm"),
-    ("static", "tensorrt_llm_executor_static"),
     ("dylib", "tensorrt_llm_nvrtc_wrapper"),
     ("dylib", "nvinfer_plugin_tensorrt_llm"),
     ("dylib", "decoder_attention"),
@@ -32,6 +41,48 @@ macro_rules! probe {
     };
 }
 
+fn get_compiler_flag(
+    switch: bool,
+    true_case: &'static str,
+    false_case: &'static str,
+) -> &'static str {
+    match switch {
+        true => true_case,
+        false => false_case,
+    }
+}
+
+fn get_library_architecture() -> &'static str {
+    let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
+    let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+    let env = env::var("CARGO_CFG_TARGET_ENV").unwrap();
+
+    match os.as_str() {
+        "linux" => {
+            if env != "gnu" {
+                panic!("unsupported linux ABI {env}, only 'gnu' is supported")
+            }
+
+            match arch.as_str() {
+                "x86_64" => "x86_64-linux-gnu",
+                "aarch64" => "aarch64-linux-gnu",
+                _ => panic!("unsupported linux architecture {arch}"),
+            }
+        }
+        "windows" => {
+            if env != "msvc" {
+                panic!("unsupported windows ABI {env}, only 'msvc' is supported")
+            }
+
+            match arch.as_str() {
+                "x86_64" => "x86_64-windows-msvc",
+                _ => panic!("unsupported windows architecture {arch}"),
+            }
+        }
+        _ => panic!("unsupported OS {os}"),
+    }
+}
+
 fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
     // Build the backend implementation through CMake
     let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
@@ -54,10 +105,45 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
         .env("OPT_LEVEL", opt_level)
         .define("CMAKE_INSTALL_PREFIX", &install_path)
         .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
-        .define("Python3_ROOT_DIR", "../venv")
+        .define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture())
         .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
+        .define(
+            "TGI_TRTLLM_BACKEND_DEBUG",
+            get_compiler_flag(is_debug, "ON", "OFF"),
+        )
         .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);
+
+    if is_debug || *IS_GHA_BUILD {
+        config.define("TGI_TRTLLM_BACKEND_BUILD_TESTS", "ON");
+    }
+
+    if option_env!("USE_LLD_LINKER").is_some() {
+        println!("cargo:warning=Using lld linker");
+        config.define("TGI_TRTLLM_BACKEND_BUILD_USE_LLD", "ON");
+    }
+
+    if (is_debug && option_env!("ENABLE_ASAN").is_some()) || *IS_GHA_BUILD {
+        println!("cargo:warning=Enabling Address Sanitizer");
+        config.define("TGI_TRTLLM_BACKEND_ENABLE_ASAN", "ON");
+    }
+
+    if (is_debug && option_env!("ENABLE_UBSAN").is_some()) || *IS_GHA_BUILD {
+        println!("cargo:warning=Enabling Undefined Sanitizer");
+        config.define("TGI_TRTLLM_BACKEND_ENABLE_UBSAN", "ON");
+    }
+
+    if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") {
+        config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler);
+    }
+
+    if let Some(wrapper) = option_env!("RUSTC_WRAPPER") {
+        println!("cargo:warning=Using caching tool: {wrapper}");
+
+        env::set_var("CMAKE_C_COMPILER_LAUNCHER", wrapper);
+        env::set_var("CMAKE_CXX_COMPILER_LAUNCHER", wrapper);
+        env::set_var("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper);
+    }
+
     // Allow to override which Python to use ...
     if let Some(python3) = option_env!("Python3_EXECUTABLE") {
         config.define("Python3_EXECUTABLE", python3);
@@ -78,23 +164,18 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
     }
 
     // Emit linkage information from the artifacts we just built
-    let install_lib_path = install_path.join("lib");
+    for path in ["lib", "lib64"] {
+        let install_lib_path = install_path.join(path);
         println!(
             r"cargo:warning=Adding link search path: {}",
             install_lib_path.display()
         );
         println!(r"cargo:rustc-link-search={}", install_lib_path.display());
+    }
     (PathBuf::from(install_path), deps_folder)
 }
 
 fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
-    let ndebug = match is_debug {
-        true => "1",
-        false => "0",
-    };
-
     CFG.include_prefix = "backends/trtllm";
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
@@ -106,7 +187,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
         .include("/usr/local/tensorrt/include")
         .include("csrc/")
         .file("csrc/ffi.hpp")
-        .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
+        .define(
+            "TGI_TRTLLM_BACKEND_DEBUG",
+            get_compiler_flag(is_debug, "ON", "OFF"),
+        )
         .compile("tgi_trtllm_backend");
 
     println!("cargo:rerun-if-changed=CMakeLists.txt");
@@ -125,6 +209,7 @@ fn main() {
     let build_profile = env::var("PROFILE").unwrap();
     let (is_debug, opt_level) = match build_profile.as_ref() {
         "debug" => (true, "0"),
+        "dev" => (true, "0"),
         _ => (false, "3"),
     };
 
@@ -161,7 +246,5 @@ fn main() {
     });
 
     // Backend
-    BACKEND_DEPS.iter().for_each(|name| {
-        println!("cargo:rustc-link-lib=static={}", name);
-    });
+    println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS);
 }
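build.rs now derives its CMake switches from the environment: a truthy IS_GHA_BUILD turns the test suite and both sanitizers on, and any wrapper named in RUSTC_WRAPPER is re-exported as the C, C++ and CUDA compiler launcher. The truthy spellings it accepts ("on", "true", "1") are the same ones setup_sccache.py parses further below; a short Python restatement of that shared convention (the helper name is ours):

import os

def is_truthy_env(name: str) -> bool:
    # Accepted spellings match build.rs's IS_GHA_BUILD parsing: on / true / 1.
    return os.environ.get(name, "").lower() in {"on", "true", "1"}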
backends/trtllm/cmake/spdlog.cmake

@@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF)
 
 # Define the level at which SPDLOG_ compilation level is defined
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
 else ()
-    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
 endif ()
 
 fetchcontent_declare(
         spdlog
         # DOWNLOAD_EXTRACT_TIMESTAMP
-        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz
 )
 fetchcontent_makeavailable(spdlog)
backends/trtllm/cmake/trtllm.cmake

@@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 set(ENABLE_UCX OFF)
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
     set(FAST_BUILD ON)
-    set(NVTX_DISABLE OFF)
+    set(NVTX_DISABLE ON)
+    set(INDEX_RANGE_CHECK ON)
 else ()
     set(FAST_BUILD OFF)
     set(FAST_MATH ON)
-    set(NVTX_DISABLE ON)
+    set(NVTX_DISABLE OFF)
+    set(INDEX_RANGE_CHECK OFF)
 endif ()
 
 find_package(Python3 REQUIRED Interpreter)
backends/trtllm/csrc/backend.cpp

@@ -1,7 +1,6 @@
 #include <ranges>
 
 #include <nlohmann/json.hpp>
-#include <spdlog/spdlog.h>
 
 #include "backend.hpp"
 #include "hardware.hpp"
@@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm {
         if (world_size > 1) {
             SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
             mode = tle::CommunicationMode::kORCHESTRATOR;
-            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr,
+                                                                             true);
         } else {
             SPDLOG_INFO("Detected single engine deployment, using leader mode");
         }
@@ -44,21 +44,22 @@ namespace huggingface::tgi::backends::trtllm {
     }
 
     backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
             : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
 
     size_t backend_t::num_tokens_ready() const noexcept {
         return executor_.getNumResponsesReady();
     }
 
     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
-        SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
-        return executor_.enqueueRequest(tle::Request {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
+                      const sampling_params_t s_params) noexcept {
+        SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
+        return executor_.enqueueRequest(tle::Request{
                 {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
-                static_cast<tle::SizeType32>(generation_params.max_new_tokens),
+                static_cast<tle::SizeType32>(g_params.max_new_tokens),
                 true,
-                (tle::SamplingConfig) sampling_params,
-                tle::OutputConfig { /* returnLogProbs= */ true },
+                (tle::SamplingConfig) s_params,
+                tle::OutputConfig{ /* returnLogProbs= */ true},
                 std::nullopt,
                 std::nullopt,
                 std::nullopt,
|
@ -28,20 +28,62 @@ namespace huggingface::tgi::backends::trtllm {
|
|||||||
|
|
||||||
#include "backends/trtllm/src/lib.rs.h"
|
#include "backends/trtllm/src/lib.rs.h"
|
||||||
|
|
||||||
|
|
||||||
namespace huggingface::tgi::backends::trtllm {
|
namespace huggingface::tgi::backends::trtllm {
|
||||||
std::once_flag backend_initialized_flag;
|
std::once_flag backend_initialized_flag;
|
||||||
|
|
||||||
|
constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason reason) noexcept {
|
||||||
|
switch (reason) {
|
||||||
|
case tle::FinishReason::kNOT_FINISHED:
|
||||||
|
return finish_reason_t::kNOT_FINISHED;
|
||||||
|
case tle::FinishReason::kSTOP_WORDS:
|
||||||
|
return finish_reason_t::kSTOP_WORDS;
|
||||||
|
case tle::FinishReason::kEND_ID:
|
||||||
|
return finish_reason_t::kEND_ID;
|
||||||
|
case tle::FinishReason::kLENGTH:
|
||||||
|
return finish_reason_t::kLENGTH;
|
||||||
|
default:
|
||||||
|
std::unreachable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static auto as_generation_step = [](const tle::Response &r) {
|
||||||
|
const auto reqId = r.getRequestId();
|
||||||
|
if (!r.hasError()) [[likely]] {
|
||||||
|
const auto result = r.getResult();
|
||||||
|
const auto logits = result.logProbs.value()[0];
|
||||||
|
return generation_step_t{
|
||||||
|
reqId,
|
||||||
|
static_cast<uint32_t>(result.outputTokenIds[0][0]),
|
||||||
|
logits.back(),
|
||||||
|
result.isFinal,
|
||||||
|
as_finish_reason_t(result.finishReasons[0]),
|
||||||
|
false,
|
||||||
|
std::string()
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return generation_step_t{
|
||||||
|
reqId,
|
||||||
|
0,
|
||||||
|
0.0,
|
||||||
|
true,
|
||||||
|
finish_reason_t::kNOT_FINISHED,
|
||||||
|
true,
|
||||||
|
std::move(r.getErrorMsg())
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
class tensorrt_llm_backend_t {
|
class tensorrt_llm_backend_t {
|
||||||
private:
|
private:
|
||||||
backend_t inner_;
|
backend_t inner_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
|
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
|
||||||
: inner_(engine_folder, executor_worker_path) {}
|
: inner_(engine_folder, executor_worker_path) {}
|
||||||
|
|
||||||
size_t num_tokens_ready() const noexcept {
|
size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
|
||||||
return inner_.num_tokens_ready();
|
|
||||||
}
|
|
||||||
|
|
||||||
request_id_t submit(
|
request_id_t submit(
|
||||||
rust::Slice<const uint32_t> tokens,
|
rust::Slice<const uint32_t> tokens,
|
||||||
@ -59,13 +101,13 @@ namespace huggingface::tgi::backends::trtllm {
|
|||||||
// Submit the request to the executor and get back a potential request_id used to track request status
|
// Submit the request to the executor and get back a potential request_id used to track request status
|
||||||
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
|
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
|
||||||
const auto maybe_request_id = inner_.submit(
|
const auto maybe_request_id = inner_.submit(
|
||||||
signed_tokens,
|
signed_tokens,
|
||||||
{max_new_tokens},
|
{max_new_tokens},
|
||||||
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
|
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
|
||||||
);
|
);
|
||||||
|
|
||||||
// If we do have a value, let's return the request_id
|
// If we do have a value, let's return the request_id
|
||||||
if(maybe_request_id.has_value()) [[likely]] {
|
if (maybe_request_id.has_value()) [[likely]] {
|
||||||
return *maybe_request_id;
|
return *maybe_request_id;
|
||||||
} else {
|
} else {
|
||||||
SPDLOG_WARN("[FFI] Failed to submit request to the executor");
|
SPDLOG_WARN("[FFI] Failed to submit request to the executor");
|
||||||
@ -74,61 +116,45 @@ namespace huggingface::tgi::backends::trtllm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
|
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
|
||||||
if(num_tokens_ready() > 0) [[likely]] {
|
if (num_tokens_ready() > 0) [[likely]] {
|
||||||
const auto responses = inner_.pull_tokens();
|
const auto responses = inner_.pull_tokens();
|
||||||
|
|
||||||
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
|
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
|
||||||
// Transform tle::Response to GenerationStep
|
|
||||||
auto steps = std::make_unique<std::vector<generation_step_t>>();
|
// Transform tle::Response to generation_step_t
|
||||||
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
|
#ifdef __cpp_lib_ranges_to_container
|
||||||
const auto reqId = r.getRequestId();
|
auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
|
||||||
if (!r.hasError()) [[likely]] {
|
#else
|
||||||
const auto result = r.getResult();
|
auto steps = std::vector<generation_step_t>();
|
||||||
return generation_step_t{
|
steps.reserve(responses.size());
|
||||||
reqId,
|
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
|
||||||
static_cast<uint32_t>(result.outputTokenIds[0][0]),
|
#endif
|
||||||
result.logProbs.value()[0][0],
|
return std::make_unique<std::vector<generation_step_t>>(steps);
|
||||||
result.isFinal,
|
|
||||||
false,
|
|
||||||
std::string()
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
return generation_step_t{
|
|
||||||
reqId,
|
|
||||||
0,
|
|
||||||
0.0,
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
std::move(r.getErrorMsg())
|
|
||||||
};
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return steps;
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
return std::make_unique<std::vector<generation_step_t>>();
|
return std::make_unique<std::vector<generation_step_t>>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cancel(request_id_t requestId) noexcept {
|
void cancel(request_id_t request_id) noexcept {
|
||||||
SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId);
|
SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id);
|
||||||
inner_.cancel(requestId);
|
inner_.cancel(request_id);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void initialize_logging() {
|
void initialize_logging() {
|
||||||
#ifndef TGI_TRTLLM_BACKEND_DEBUG
|
#ifndef TGI_TRTLLM_BACKEND_DEBUG
|
||||||
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
||||||
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
||||||
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
||||||
return std::tolower(c);
|
return std::tolower(c);
|
||||||
});
|
});
|
||||||
|
|
||||||
if (log_level == "debug")
|
if (log_level == "debug")
|
||||||
spdlog::set_level(spdlog::level::debug);
|
spdlog::set_level(spdlog::level::debug);
|
||||||
else
|
else
|
||||||
spdlog::set_level(spdlog::level::info);
|
spdlog::set_level(spdlog::level::info);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
spdlog::set_level(spdlog::level::debug);
|
spdlog::set_level(spdlog::level::debug);
|
||||||
#endif
|
#endif
|
||||||
@@ -151,11 +177,14 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }
 
-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t>
+    create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
-                std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
-                std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
+                std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
+                                      std::filesystem::path::format::auto_format),
+                std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
+                                      std::filesystem::path::format::auto_format)
         );
     }
 }
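The only change here is line wrapping, but the conversion itself is worth noting: a rust::Str borrows a Rust &str and is not guaranteed to be NUL-terminated, which is why both arguments go through std::string_view(begin, end) before being handed to std::filesystem::path.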

backends/trtllm/scripts/setup_sccache.py (new file)
@@ -0,0 +1,49 @@
+from argparse import ArgumentParser
+
+AWS_S3_CACHING_VARIABLES = {
+    "AWS_ACCESS_KEY_ID": "aws_access_key_id",
+    "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
+    "AWS_SESSION_TOKEN": "aws_session_token",
+    "SCCACHE_REGION": "s3_region",
+    "SCCACHE_BUCKET": "s3_bucket_name",
+}
+
+ALL_CACHING_STORAGE_VARIABLES = {
+    "AWS_S3_CACHING_VARIABLES"
+}
+
+
+def setup_sccache_locally():
+    from os import environ
+
+    print("Setting up Local Caching Layer")
+    for target in ALL_CACHING_STORAGE_VARIABLES:
+        for envvar in globals()[target].keys():
+            if envvar in environ:
+                print(f"Deleted {envvar} from environment variables")
+                del environ[envvar]
+
+
+def setup_sccache_for_s3():
+    from os import environ
+
+    print("Setting up AWS S3 Caching Layer")
+    for envvar in AWS_S3_CACHING_VARIABLES.keys():
+        if envvar not in environ or not environ[envvar]:
+            print(f"Missing definition for environment variable {envvar}")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("TensorRT-LLM Build Caching Setup")
+
+    parser.add_argument("--is-gha-build", type=str, default="FALSE",
+                        help="Indicate if the build is from Github Actions")
+
+    # Parse args
+    args = parser.parse_args()
+    args.is_gha_build = args.is_gha_build.lower() in {"on", "true", "1"}
+
+    if args.is_gha_build:
+        setup_sccache_for_s3()
+    else:
+        setup_sccache_locally()
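When the build runs on GitHub Actions (--is-gha-build=true, matching the IS_GHA_BUILD flag the workflow commits set), the script only warns about missing sccache S3 variables; for local builds it strips those variables from the environment so sccache falls back to its default on-disk cache.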
@@ -6,6 +6,26 @@ mod utils;
 
 #[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
+    #[cxx_name = "finish_reason_t"]
+    #[derive(Debug, Clone, Copy)]
+    pub enum FinishReason {
+        /// The request is not finished.
+        #[cxx_name = "kNOT_FINISHED"]
+        NotFinished = 0u8,
+
+        /// The request finished because the end id was generated.
+        #[cxx_name = "kEND_ID"]
+        EndTokenId = 1u8,
+
+        /// The request finished because a stop word was generated.
+        #[cxx_name = "kSTOP_WORDS"]
+        StopWords = 2u8,
+
+        /// The request finished because the maximum number of tokens was reached.
+        #[cxx_name = "kLENGTH"]
+        MaxLength = 3u8,
+    }
+
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
     #[cxx_name = "generation_step_t"]
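The kNOT_FINISHED/kEND_ID/kSTOP_WORDS/kLENGTH names mirror the finish-reason constants of TensorRT-LLM's executor API, so the C++ side can forward the executor value through the bridge unchanged. One subtlety of cxx shared enums: on the Rust side they are lowered to a struct wrapping the discriminant rather than a native enum, roughly like the sketch below (illustrative only, not the exact generated code), which is why any match on FinishReason needs a catch-all arm.

// Rough shape of what cxx generates for a shared enum (illustrative):
// a transparent wrapper over the u8 discriminant, with one associated
// constant per declared variant. Values outside 0..=3 stay representable.
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FinishReason {
    pub repr: u8,
}

#[allow(non_upper_case_globals)]
impl FinishReason {
    pub const NotFinished: Self = Self { repr: 0 };
    pub const EndTokenId: Self = Self { repr: 1 };
    pub const StopWords: Self = Self { repr: 2 };
    pub const MaxLength: Self = Self { repr: 3 };
}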
@@ -15,6 +35,7 @@ mod ffi {
         token_id: u32,
         log_prob: f32,
         is_final: bool,
+        finish_reason: FinishReason,
         has_error: bool,
         error_msg: String,
     }
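With finish_reason threaded through, the Rust side can route a step on its terminal state without re-deriving it from token ids. A minimal sketch under stated assumptions: the shared struct is exposed to Rust as GenerationStep, it also carries the request_id set first in the C++ constructor (not visible in this hunk), and route_step is a hypothetical helper, not part of the diff.

fn route_step(step: &ffi::GenerationStep) {
    if step.has_error {
        // The C++ error branch sets has_error and moves the message across.
        eprintln!("request {} failed: {}", step.request_id, step.error_msg);
    } else if step.is_final {
        // finish_reason is only meaningful once the step is final.
        println!("request {} finished: {:?}", step.request_id, step.finish_reason);
    }
}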
@@ -66,3 +87,17 @@ mod ffi {
         fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
     }
 }
+
+use ffi::FinishReason;
+use text_generation_router::FinishReason as InferFinishReason;
+
+impl From<FinishReason> for InferFinishReason {
+    fn from(reason: FinishReason) -> Self {
+        match reason {
+            FinishReason::StopWords => InferFinishReason::StopSequence,
+            FinishReason::MaxLength => InferFinishReason::Length,
+            FinishReason::EndTokenId => InferFinishReason::EndOfSequenceToken,
+            _ => panic!("Cannot convert {reason:?} to text_generation_router::FinishReason"),
+        }
+    }
+}
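Note the conversion panics on NotFinished (and on any future discriminant coming over the FFI), so callers presumably only invoke it once a step is final. A hedged helper illustrating that contract, reusing the aliases imported above; finish_reason_of and the GenerationStep field names are assumptions, not part of the diff:

fn finish_reason_of(step: &ffi::GenerationStep) -> Option<InferFinishReason> {
    // Only convert terminal, non-errored steps; anything else has no
    // meaningful router-side finish reason yet.
    if step.is_final && !step.has_error {
        Some(InferFinishReason::from(step.finish_reason))
    } else {
        None
    }
}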
@@ -11,7 +11,7 @@ use text_generation_router::server::{
     get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer,
 };
 use text_generation_router::usage_stats::UsageStatsLevel;
-use text_generation_router::{server, HubTokenizerConfig, Tokenizer};
+use text_generation_router::{server, Tokenizer};
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -69,7 +69,7 @@ struct Args {
 
 async fn get_tokenizer(
     tokenizer_name: &str,
-    tokenizer_config_path: Option<&str>,
+    _tokenizer_config_path: Option<&str>,
     revision: Option<&str>,
 ) -> Option<Tokenizer> {
     // Parse Huggingface hub token
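Prefixing the parameter with an underscore silences the unused-variable lint while keeping get_tokenizer's signature (and thus the CLI surface) unchanged now that HubTokenizerConfig is no longer consumed here.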
@@ -8,13 +8,13 @@
 
 #include "backend.hpp"
 
-
-
 using namespace huggingface::tgi::backends::trtllm;
 
 TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
 {
-    const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}};
+    const json config_j = {{"temperature", 0.6},
+                           {"top_p", 0.95},
+                           {"eos_token_id", {1, 2, 3}}};
     const auto generation_config = generation_config_t(config_j);
 
     REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6));
@@ -24,8 +24,9 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
     REQUIRE_FALSE(generation_config.stop_words.empty());
     REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
 
-    for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
-    {
+    for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1},
+                                                                                                        {2},
+                                                                                                        {3}})) {
         // Currently we do not support multi-tokens stop words
         REQUIRE(lhs.size() == 1);
         REQUIRE(rhs.size() == 1);
@@ -35,7 +36,7 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
 
 TEST_CASE("parse generation_config.json default", "[generation_config_t]")
 {
-    const json config_j = {{"eos_token_id", {1,2,3}}};
+    const json config_j = {{"eos_token_id", {1, 2, 3}}};
     const auto generation_config = generation_config_t(config_j);
 
     REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
@@ -44,8 +45,9 @@ TEST_CASE("parse generation_config.json default", "[generation_config_t]")
     REQUIRE_FALSE(generation_config.stop_words.empty());
     REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
 
-    for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
-    {
+    for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1},
+                                                                                                        {2},
+                                                                                                        {3}})) {
         // Currently we do not support multi-tokens stop words
         REQUIRE(lhs.size() == 1);
         REQUIRE(rhs.size() == 1);
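Both test edits are purely cosmetic reflows; the comparison itself still leans on std::views::zip, which needs C++23 support, so the test target presumably builds with that standard.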