Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)

Commit ac67673788: Merge branch 'main' into flash_decoding

.devcontainer/Dockerfile_trtllm (new file, 75 lines)
@@ -0,0 +1,75 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.7rc1"

# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
WORKDIR /usr/src/text-generation-inference/backends/trtllm

FROM chef AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt update && apt install -y \
    build-essential \
    cmake \
    curl \
    gcc-14 \
    g++-14 \
    git \
    git-lfs \
    libssl-dev \
    libucx-dev \
    ninja-build \
    pkg-config \
    pipx \
    python3 \
    python3-dev \
    python3-setuptools \
    tar \
    wget && \
    pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt

# Install OpenMPI
FROM cuda-builder AS mpi-builder
ARG OMPI_VERSION

ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2"
RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \
    mkdir /usr/src/mpi && \
    tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
    cd /usr/src/mpi && \
    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
    make -j all && \
    make install && \
    rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"

# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
    /opt/install_tensorrt.sh

# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
    chmod -R a+w /root/.rustup && \
    chmod -R a+w /root/.cargo

ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef

COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi

ENV MPI_HOME=/usr/local/mpi
.devcontainer/devcontainer.json (new file, 19 lines)
@@ -0,0 +1,19 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/cpp
{
    "name": "CUDA",
    "build": {
        "dockerfile": "Dockerfile_trtllm",
        "context": ".."
    },
    "remoteEnv": {
        "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
        "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
        "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
    },
    "customizations" : {
        "jetbrains" : {
            "backend" : "CLion"
        }
    }
}
.github/workflows/build.yaml (vendored, 12 changed lines)
@@ -8,6 +8,7 @@ on:
description: Hardware
# options:
#  - cuda
#  - cuda-trtllm
#  - rocm
#  - intel
required: true
@@ -52,6 +53,15 @@ jobs:
export platform=""
export extra_pytest=""
;;
cuda-trtllm)
export dockerfile="Dockerfile_trtllm"
export label_extension="-trtllm"
export docker_volume="/mnt/cache"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform=""
export extra_pytest=""
;;
rocm)
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
@@ -137,7 +147,7 @@ jobs:
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=auto
latest=false
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
.github/workflows/ci_build.yaml (vendored, 2 changed lines)
@@ -37,7 +37,7 @@ jobs:
# fail-fast is true by default
fail-fast: false
matrix:
hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"]
hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"]
uses: ./.github/workflows/build.yaml # calls the one above ^
permissions:
contents: write
Cargo.lock (generated, 69 changed lines)
@@ -2850,20 +2850,6 @@ dependencies = [
 "urlencoding",
]

[[package]]
name = "opentelemetry"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
dependencies = [
 "futures-core",
 "futures-sink",
 "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
]

[[package]]
name = "opentelemetry-otlp"
version = "0.13.0"
@@ -2963,24 +2949,6 @@ dependencies = [
 "thiserror",
]

[[package]]
name = "opentelemetry_sdk"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
dependencies = [
 "async-trait",
 "futures-channel",
 "futures-executor",
 "futures-util",
 "glob",
 "once_cell",
 "opentelemetry 0.24.0",
 "percent-encoding",
 "rand",
 "thiserror",
]

[[package]]
name = "option-ext"
version = "0.2.0"
@@ -4367,9 +4335,8 @@ dependencies = [

[[package]]
name = "text-generation-backends-trtllm"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "async-stream",
 "async-trait",
 "clap 4.5.21",
 "cmake",
@@ -4377,21 +4344,19 @@ dependencies = [
 "cxx-build",
 "hashbrown 0.14.5",
 "hf-hub",
 "log",
 "pkg-config",
 "pyo3",
 "text-generation-router",
 "thiserror",
 "tokenizers",
 "tokio",
 "tokio-stream",
 "tracing",
 "tracing-opentelemetry 0.25.0",
 "tracing-subscriber",
]

[[package]]
name = "text-generation-benchmark"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "average",
 "clap 4.5.21",
@@ -4411,7 +4376,7 @@ dependencies = [

[[package]]
name = "text-generation-client"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -4429,7 +4394,7 @@ dependencies = [

[[package]]
name = "text-generation-launcher"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "clap 4.5.21",
 "ctrlc",
@@ -4450,7 +4415,7 @@ dependencies = [

[[package]]
name = "text-generation-router"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "anyhow",
 "async-stream",
@@ -4501,7 +4466,7 @@ dependencies = [

[[package]]
name = "text-generation-router-v2"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -4550,7 +4515,7 @@ dependencies = [

[[package]]
name = "text-generation-router-v3"
version = "3.0.1-dev0"
version = "3.0.2-dev0"
dependencies = [
 "async-stream",
 "async-trait",
@@ -5086,24 +5051,6 @@ dependencies = [
 "web-time 0.2.4",
]

[[package]]
name = "tracing-opentelemetry"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
dependencies = [
 "js-sys",
 "once_cell",
 "opentelemetry 0.24.0",
 "opentelemetry_sdk 0.24.1",
 "smallvec",
 "tracing",
 "tracing-core",
 "tracing-log 0.2.0",
 "tracing-subscriber",
 "web-time 1.1.0",
]

[[package]]
name = "tracing-opentelemetry-instrumentation-sdk"
version = "0.16.0"
Cargo.toml
@@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
version = "3.0.1-dev0"
version = "3.0.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
Dockerfile_amd
@@ -234,6 +234,7 @@ FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

COPY server/Makefile-vllm Makefile
RUN pip install setuptools_scm

# Build specific version of vllm
RUN make build-vllm-rocm
@@ -267,6 +268,15 @@ COPY server/exllamav2_kernels/ .

RUN python setup.py build

FROM kernel-builder AS moe-kernels
WORKDIR /usr/src
ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd
ENV VLLM_TARGET_DEVICE=rocm
RUN git clone https://github.com/danieldk/moe-kernels.git && \
    cd moe-kernels && \
    git checkout ${MOE_KERNELS_BRANCH} && \
    python setup.py install

FROM install_deps AS base-copy

# Text Generation Inference base env
@@ -289,6 +299,9 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages

# Copy build artifacts from moe kernels
COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages

# Install server
COPY proto proto
COPY server server
Dockerfile_intel
@@ -97,10 +97,10 @@ ENV HF_HOME=/data \

WORKDIR /usr/src
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir

RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
Dockerfile_trtllm
@@ -1,5 +1,5 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.6"
ARG OMPI_VERSION="4.1.7rc1"

# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
@@ -10,7 +10,7 @@ COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
@@ -18,18 +18,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    build-essential \
    cmake \
    curl \
    gcc \
    g++ \
    gcc-14 \
    g++-14 \
    git \
    git-lfs \
    libssl-dev \
    libucx-dev \
    ninja-build \
    pkg-config \
    pipx \
    python3 \
    python3-dev \
    python3-setuptools \
    tar \
    wget
    wget && \
    pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
@@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
    cd backends/trtllm && \
    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release

FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
    python3 -m pip install transformers tokenizers
    pipx ensurepath && \
    pipx install --include-deps transformers tokenizers

WORKDIR /usr/local/tgi/bin

ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""
README.md (18 changed lines)
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -196,14 +196,26 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T

You can also opt to install `text-generation-inference` locally.

First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda`:
First clone the repository and change directoy into it:

```shell
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference
```

Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda` or `python venv`:

```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

#using conda
conda create -n text-generation-inference python=3.11
conda activate text-generation-inference

#using pyton venv
python3 -m venv .venv
source .venv/bin/activate
```

You may also need to install Protoc.

(Binary image file not shown; size changed from 201 KiB to 209 KiB.)
backends/trtllm/CMakeLists.txt
@@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
endif ()

project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 23)

include(FetchContent)
include(ExternalProject)
include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
@@ -29,11 +30,20 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)

#### External dependencies ####
include(cmake/fmt.cmake)
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
endif()

# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
    set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
endif()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")

@@ -41,15 +51,21 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
    # $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)

if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
    target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
endif ()

# This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
@@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    message(STATUS "Building tests")
    FetchContent_Declare(
        Catch2
        GIT_REPOSITORY https://github.com/catchorg/Catch2
        GIT_TAG v3.6.0
        URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
    )
    FetchContent_MakeAvailable(Catch2)

    # add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
    # target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
    add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
    target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
    target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
    target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
    target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)

    if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
    else()
        target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
    endif ()

    if(CMAKE_BUILD_TYPE MATCHES "Debug")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
    endif()

    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
    include(CTest)
    include(Catch)
    # catch_discover_tests(tgi_trtllm_backend_tests)
    catch_discover_tests(tgi_trtllm_backend_tests)
endif ()
backends/trtllm/Cargo.toml
@@ -7,20 +7,21 @@ homepage.workspace = true

[dependencies]
async-trait = "0.1"
async-stream = "0.3"
#async-stream = "0.3"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.14"
hf-hub = { workspace = true }
log = { version = "0.4", features = [] }
#log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { workspace = true }
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
thiserror = "1.0.63"
tracing = "0.1"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
#tracing-opentelemetry = "0.25"
#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
pyo3 = { workspace = true }

[build-dependencies]
cmake = "0.1"
backends/trtllm/build.rs
@@ -4,7 +4,7 @@ use std::env;
use std::env::consts::ARCH;
use std::path::{absolute, PathBuf};

const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.6";
const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -43,7 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
        install_path = absolute(out_dir).expect("cannot happen").join(install_path);
    }

    let _ = cmake::Config::new(".")
    let mut config = cmake::Config::new(".");
    config
        .uses_cxx11()
        .generator("Ninja")
        .profile(match is_debug {
@@ -53,9 +54,16 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
        .env("OPT_LEVEL", opt_level)
        .define("CMAKE_INSTALL_PREFIX", &install_path)
        .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
        .define("Python3_ROOT_DIR", "../venv")
        .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
        .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
        .build();
        .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);

    // Allow to override which Python to use ...
    if let Some(python3) = option_env!("Python3_EXECUTABLE") {
        config.define("Python3_EXECUTABLE", python3);
    }

    config.build();

    // Additional transitive CMake dependencies
    let deps_folder = out_dir.join("build").join("_deps");
@@ -90,26 +98,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
    CFG.include_prefix = "backends/trtllm";
    cxx_build::bridge("src/lib.rs")
        .static_flag(true)
        .include(deps_folder.join("fmt-src").join("include"))
        .std("c++23")
        .include(deps_folder.join("spdlog-src").join("include"))
        .include(deps_folder.join("json-src").join("include"))
        .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
        .include("/usr/local/cuda/include")
        .include("/usr/local/tensorrt/include")
        .file("src/ffi.cpp")
        .std("c++20")
        .define("NDEBUG", ndebug)
        .include("csrc/")
        .file("csrc/ffi.hpp")
        .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
        .compile("tgi_trtllm_backend");

    println!("cargo:rerun-if-changed=CMakeLists.txt");
    println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
    println!("cargo:rerun-if-changed=cmake/json.cmake");
    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
    println!("cargo:rerun-if-changed=include/backend.h");
    println!("cargo:rerun-if-changed=lib/backend.cpp");
    println!("cargo:rerun-if-changed=include/ffi.h");
    println!("cargo:rerun-if-changed=src/ffi.cpp");
    println!("cargo:rerun-if-changed=csrc/backend.hpp");
    println!("cargo:rerun-if-changed=csrc/backend.cpp");
    println!("cargo:rerun-if-changed=csrc/hardware.hpp");
    println!("cargo:rerun-if-changed=csrc/ffi.hpp");
}

fn main() {
backends/trtllm/cmake/fmt.cmake (deleted)
@@ -1,6 +0,0 @@
FetchContent_Declare(
    fmt
    DOWNLOAD_EXTRACT_TIMESTAMP
    URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
)
FetchContent_MakeAvailable(fmt)
backends/trtllm/cmake/json.cmake
@@ -1,6 +1,6 @@
fetchcontent_declare(
    json
    DOWNLOAD_EXTRACT_TIMESTAMP
    URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
    # DOWNLOAD_EXTRACT_TIMESTAMP
    URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
)
fetchcontent_makeavailable(json)
backends/trtllm/cmake/spdlog.cmake
@@ -1,6 +1,6 @@
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL ON)
set(SPDLOG_FMT_EXTERNAL OFF)

# Define the level at which SPDLOG_ compilation level is defined
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@@ -11,7 +11,7 @@ endif ()

fetchcontent_declare(
    spdlog
    DOWNLOAD_EXTRACT_TIMESTAMP
    # DOWNLOAD_EXTRACT_TIMESTAMP
    URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
)
fetchcontent_makeavailable(spdlog)
backends/trtllm/cmake/trtllm.cmake
@@ -11,6 +11,7 @@ set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})

message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")

set(ENABLE_UCX OFF)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
    set(FAST_BUILD ON)
    set(NVTX_DISABLE OFF)
@@ -20,11 +21,13 @@ else ()
    set(NVTX_DISABLE ON)
endif ()

find_package(Python3 REQUIRED Interpreter)

fetchcontent_declare(
    trtllm
    GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
    GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
    GIT_SHALLOW FALSE
    GIT_REPOSITORY https://github.com/huggingface/TensorRT-LLM.git
    GIT_TAG 1bb9ca4688805444f203647674bac1d7219d0579
    GIT_SHALLOW ON
    DOWNLOAD_EXTRACT_TIMESTAMP
)
fetchcontent_makeavailable(trtllm)
backends/trtllm/csrc/backend.cpp (new file, 79 lines)
@@ -0,0 +1,79 @@
#include <ranges>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>

#include "backend.hpp"
#include "hardware.hpp"

namespace huggingface::tgi::backends::trtllm {
    tle::ParallelConfig backend_workspace_t::parallel_config() const {
        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();

        auto mode = tle::CommunicationMode::kLEADER;
        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;

        if (world_size > 1) {
            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
            mode = tle::CommunicationMode::kORCHESTRATOR;
            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
        } else {
            SPDLOG_INFO("Detected single engine deployment, using leader mode");
        }

        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
    }

    tle::ExecutorConfig backend_workspace_t::executor_config() const {
        // Retrieve the compute capabilities to enable some options at runtime
        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

        // Allocate the config
        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);

        // Set the parallel config as inferred
        executor_config.setParallelConfig(parallel_config());

        // Define some configuration variables
        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
        return executor_config;
    }

    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
        : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }

    std::expected<request_id_t, backend_error_t>
    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
        SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
        return executor_.enqueueRequest(tle::Request {
            {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
            static_cast<tle::SizeType32>(generation_params.max_new_tokens),
            true,
            (tle::SamplingConfig) sampling_params,
            tle::OutputConfig { /* returnLogProbs= */ true },
            std::nullopt,
            std::nullopt,
            std::nullopt,
            std::nullopt,
            workspace.generation_config().stop_words
        });
    }

    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }

    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
}
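For readers unfamiliar with the JSON-pointer lookup used in `parallel_config()` above, the following standalone sketch shows the same nlohmann/json access pattern against a hypothetical, simplified config; the inline JSON document here is illustrative only and not the actual TensorRT-LLM engine `config.json` from the commit.

```cpp
// Illustrative sketch only: reading world_size with a JSON pointer, mirroring
// the "/pretrained_config/mapping/world_size"_json_pointer lookup above.
#include <cstddef>
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;

    // Hypothetical, minimal stand-in for an engine config.json.
    const json config = json::parse(R"({
        "pretrained_config": { "mapping": { "world_size": 2 } }
    })");

    // JSON pointer access, as used in backend_workspace_t::parallel_config().
    const auto world_size =
        config["/pretrained_config/mapping/world_size"_json_pointer].get<std::size_t>();

    // world_size > 1 indicates a sharded engine; the backend then switches
    // from leader mode to orchestrator mode.
    std::cout << "world_size=" << world_size << '\n';
    return 0;
}
```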
backends/trtllm/csrc/backend.hpp (new file, 231 lines)
@@ -0,0 +1,231 @@
#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM

#include <cmath>
#include <cstdint>
#include <expected>
#include <fstream>
#include <list>
#include <span>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>

#include <tensorrt_llm/executor/executor.h>

namespace huggingface::tgi::backends::trtllm {
    namespace tle = tensorrt_llm::executor;
    using json = nlohmann::json;
    using request_id_t = uint64_t;
    using token_id_t = tle::TokenIdType;

    /**
     * Represent the parameters used for generation
     */
    struct generation_params_t {
        uint32_t max_new_tokens;
    };

    /**
     * Represent the parameters used to sample tokens from the logit distribution
     */
    struct sampling_params_t {
        uint32_t top_k;
        float_t top_p;
        float_t repetition_penalty;
        float_t frequency_penalty;
        float_t temperature;
        uint64_t seed;

        constexpr explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig{
                1,
                top_k,
                top_p,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                seed,
                temperature,
                std::nullopt,
                std::nullopt,
                repetition_penalty,
                std::nullopt,
                frequency_penalty,
                std::nullopt
            };
        }
    };

    /**
     * Represent possible values from transformers generation `generation_config.json`.
     * It usually stores default sampling parameters to use, such as top_p, temperature, etc.
     */
    struct generation_config_t {
        float_t top_p;
        float_t temperature;
        std::list<std::vector<int32_t>> stop_words;

        constexpr explicit generation_config_t(const json &config) :
            top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
            if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
                const auto &eos_token_id = config["/eos_token_id"_json_pointer];
                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
                    stop_words.emplace_back(1, token_id.template get<int32_t>());
                });

                SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
            }
        }
    };

    /**
     * Helper class representing various items which are stored within the TensorRT-LLM engines folder and
     * can be retrieved at runtime
     */
    class backend_workspace_t {
    private:
        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
            std::ifstream config_f(path);
            return json::parse(config_f);
        };

        std::filesystem::path engines_folder_;
        std::filesystem::path executor_worker_path_;
        json config_;
        generation_config_t generation_config_;

    public:
        backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) :
            engines_folder_(engines_folder),
            executor_worker_path_(executor_worker_path),
            config_(as_json(engines_folder / "config.json")),
            generation_config_(as_json(engines_folder / "generation_config.json")) {};

        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :
            engines_folder_(engines_folder),
            executor_worker_path_(executor_worker_path),
            config_(as_json(engines_folder / "config.json")),
            generation_config_(as_json(engines_folder / "generation_config.json")) {};

        /**
         * Path to the folder containing the TensorRT-LLM engines
         * @return local filesystem path to the folder
         */
        [[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }

        /**
         * Hugging Face transformers' generated `generation_config_t` mapping information stored in the
         * `generation_config.json` holding default generation parameters.
         * @return `generation_config_t`
         */
        [[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }

        /**
         * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
         * @return `tensorrt_llm::executor::ParallelConfig` instance
         */
        [[nodiscard]] tle::ParallelConfig parallel_config() const;

        /**
         * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor`
         * @return `tensorrt_llm::executor::ExecutorConfig` instance
         */
        [[nodiscard]] tle::ExecutorConfig executor_config() const;
    };

    /**
     * Error raised by the underlying backend implementation
     */
    enum backend_error_t {
        EXECUTOR_NOT_READY = 3,
        EXECUTOR_SCHEDULING_FAILED = 4,
    };

    /**
     * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
     * - schedule new request
     * - pull status of submitted request(s)
     * - cancel submitted request(s)
     */
    class backend_t {
    private:
        backend_workspace_t workspace;
        tle::Executor executor_;

    public:
        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);

        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
            : backend_t(engines_folder, executor_worker_path) {};

        /**
         * Submit a new request to the executor
         * @param token_ids
         * @param generation_params
         * @param sampling_params
         * @return Either newly submitted request's id or the error why it failed to submit
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
               sampling_params_t sampling_params) noexcept;

        /**
         * Query the number of tokens available across all in-flight generations
         * @return
         */
        [[nodiscard("Pulling out the number of tokens")]]
        size_t num_tokens_ready() const noexcept;

        /**
         * Pull out newly generated tokens from the executor
         * @return
         */
        [[nodiscard("")]]
        std::vector<tle::Response> pull_tokens() noexcept;

        /**
         * Cancel the specified request on the executor' set
         * @param request_id Request's Identifier to remove from the in-flight executor
         */
        void cancel(request_id_t) noexcept;
    };

    /**
     * Create a TensorRT-LLM executor from a workspace
     */
    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
        return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                workspace.executor_config()};
    };
}

/**
 * Helper structures to define formatting strategies for various types in the backend
 */
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {
    auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c,
                format_context &ctx) const -> format_context::iterator {
        return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
    }
};

template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
    auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c,
                format_context &ctx) const -> format_context::iterator {
        return fmt::format_to(
            ctx.out(),
            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

#endif
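As an aid to reviewing the new API surface declared above, here is a minimal usage sketch of `backend_t`. It is not code from the commit: the engine and executorWorker paths, the prompt token ids and the sampling values are placeholders, error handling is reduced to a single check, and the busy-wait is only for illustration.

```cpp
// Illustrative sketch only: driving the new backend_t API end to end.
// Assumes a valid engines folder (containing config.json and
// generation_config.json) and the TensorRT-LLM executorWorker binary.
#include <filesystem>
#include <span>
#include <vector>
#include "backend.hpp"

using namespace huggingface::tgi::backends::trtllm;

int run_once() {
    backend_t backend{
        std::filesystem::path("/path/to/engines"),       // placeholder path
        std::filesystem::path("/path/to/executorWorker")  // placeholder path
    };

    const std::vector<token_id_t> prompt = {1, 2, 3, 4};  // placeholder token ids
    const generation_params_t gen{/* max_new_tokens = */ 16};
    const sampling_params_t sampling{/* top_k = */ 10, /* top_p = */ 0.95f,
                                     /* repetition_penalty = */ 1.0f,
                                     /* frequency_penalty = */ 0.0f,
                                     /* temperature = */ 0.7f,
                                     /* seed = */ 42};

    // submit() returns std::expected: either a request id or a backend_error_t.
    const auto maybe_id = backend.submit(std::span(prompt), gen, sampling);
    if (!maybe_id.has_value()) {
        return static_cast<int>(maybe_id.error());
    }

    // Poll until the executor reports ready tokens, then drain the responses.
    while (backend.num_tokens_ready() == 0) { /* wait or yield */ }
    for (const auto &response : backend.pull_tokens()) {
        (void) response;  // inspect the tle::Response (tokens, log probs, ...) here
    }

    backend.cancel(*maybe_id);  // optional early cancellation of the request
    return 0;
}
```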
backends/trtllm/csrc/ffi.hpp (new file, 162 lines)
@@ -0,0 +1,162 @@
#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI

#include <memory>
#include <thread>

#include <nvml.h>
#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>

#include <spdlog/spdlog.h>

#include <backend.hpp>
#include <hardware.hpp>

namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
}

namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t;
}

#include "backends/trtllm/src/lib.rs.h"

namespace huggingface::tgi::backends::trtllm {
    std::once_flag backend_initialized_flag;

    class tensorrt_llm_backend_t {
    private:
        backend_t inner_;

    public:
        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
            : inner_(engine_folder, executor_worker_path) {}

        size_t num_tokens_ready() const noexcept {
            return inner_.num_tokens_ready();
        }

        request_id_t submit(
            rust::Slice<const uint32_t> tokens,
            uint32_t max_new_tokens,
            uint32_t top_k,
            float_t top_p,
            float_t temperature,
            float_t repetition_penalty,
            float_t frequency_penalty,
            uint64_t seed
        ) {
            // This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
            SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

            // Submit the request to the executor and get back a potential request_id used to track request status
            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
            const auto maybe_request_id = inner_.submit(
                signed_tokens,
                {max_new_tokens},
                {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
            );

            // If we do have a value, let's return the request_id
            if(maybe_request_id.has_value()) [[likely]] {
                return *maybe_request_id;
            } else {
                SPDLOG_WARN("[FFI] Failed to submit request to the executor");
                return maybe_request_id.error();
            }
        }

        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
            if(num_tokens_ready() > 0) [[likely]] {
                const auto responses = inner_.pull_tokens();

                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
                // Transform tle::Response to GenerationStep
                auto steps = std::make_unique<std::vector<generation_step_t>>();
                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
                    const auto reqId = r.getRequestId();
                    if (!r.hasError()) [[likely]] {
                        const auto result = r.getResult();
                        return generation_step_t{
                            reqId,
                            static_cast<uint32_t>(result.outputTokenIds[0][0]),
                            result.logProbs.value()[0][0],
                            result.isFinal,
                            false,
                            std::string()
                        };
                    } else {
                        return generation_step_t{
                            reqId,
                            0,
                            0.0,
                            true,
                            true,
                            std::move(r.getErrorMsg())
                        };
                    }
                });
                return steps;

            } else {
                return std::make_unique<std::vector<generation_step_t>>();
            }
        }

        void cancel(request_id_t requestId) noexcept {
            SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId);
            inner_.cancel(requestId);
        }
    };

    void initialize_logging() {
#ifndef TGI_TRTLLM_BACKEND_DEBUG
        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
                return std::tolower(c);
            });

            if (log_level == "debug")
                spdlog::set_level(spdlog::level::debug);
            else
                spdlog::set_level(spdlog::level::info);
        }
#else
        spdlog::set_level(spdlog::level::debug);
#endif
    }

    void initialize_tensorrt_llm_backend() {
        SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version());

        // Initialize everyone
        initialize_logging();
        nvmlInit_v2();
        initTrtLlmPlugins();

        const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
        if (numGpus.has_value()) {
            SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus);
        } else {
            SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system");
            // todo: throw
        }
    }

    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
        std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
        return std::make_unique<tensorrt_llm_backend_t>(
            std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
            std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
        );
    }
}
#endif
backends/trtllm/csrc/hardware.hpp (new file, 81 lines)
@@ -0,0 +1,81 @@
#ifndef TGI_HARDWARE_CUDA
#define TGI_HARDWARE_CUDA
#include <cstdint>
#include <optional>

#include <nvml.h>

namespace huggingface::tgi::hardware::cuda {
    static constexpr auto VOLTA = std::make_tuple(7u, 0u);
    static constexpr auto TURING = std::make_tuple(7u, 5u);
    static constexpr auto AMPERE = std::make_tuple(8u, 0u);
    static constexpr auto HOPPER = std::make_tuple(9u, 0u);
    static constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u);

    /**
     * Get the number of GPUs on the local machine
     * @return std::nullopt if no device is available, otherwise >= 1
     */
    inline std::optional<size_t> get_device_count() {
        uint32_t numGpus = 0;
        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
            return numGpus;
        }
        return std::nullopt;
    }

    /**
     * Store information about the version of the CUDA Compute Capabilities detected on the device
     */
    struct compute_capabilities_t {
        int32_t major;
        int32_t minor;

        compute_capabilities_t(): compute_capabilities_t(0) {}
        explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) {
            nvmlDevice_t device;
            if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) {
                nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
            }
        };
        compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {}

        /**
         * Evaluate if the underlying capabilities is at least greater or equals to the provided 2-tuple (major, minor)
         * @param sm Architecture version (major, minor)
         * @return True if greater or equals to the underlying compute capabilities
         */
        [[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32_t> sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; }

        /**
         * Check if the capabilities match at least Volta architecture (sm_70)
         * @return true if at least Volta (>= sm_70), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_volta() const { return is_at_least(VOLTA); }

        /**
         * Check if the capabilities match at least Turing architecture (sm_75)
         * @return true if at least Turing (>= sm_75), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); }

        /**
         * Check if the capabilities match at least Ampere architecture (sm_80)
         * @return true if at least Ampere (>= sm_80), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); }

        /**
         * Check if the capabilities match at least Ada Lovelace architecture (sm_89)
         * @return true if at least Ada Lovelace (>= sm_89), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); }

        /**
         * Check if the capabilities match at least Hopper architecture (sm_90)
         * @return true if at least Hopper (>= sm_90), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
    };
}
#endif
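A small, hypothetical sketch of how the new `compute_capabilities_t` helpers can be consumed; device index 0 and the printed messages are placeholders, and NVML is assumed to have been initialized beforehand (for example via `nvmlInit_v2()`, as `ffi.hpp` does during backend initialization).

```cpp
// Illustrative sketch only: querying device count and compute capability
// with the helpers from csrc/hardware.hpp. Assumes nvmlInit_v2() was called.
#include <cstdio>
#include <nvml.h>
#include "hardware.hpp"

using namespace huggingface::tgi::hardware::cuda;

void report_gpus() {
    if (const auto count = get_device_count(); count.has_value()) {
        std::printf("Detected %zu GPU(s)\n", *count);

        // compute_capabilities_t(0) queries device 0 through NVML;
        // the is_at_least_* helpers compare (major, minor) tuples lexicographically.
        const compute_capabilities_t caps(0);
        if (caps.is_at_least_ampere()) {
            // Mirrors executor_config(), which only enables chunked context on >= sm_80.
            std::printf("sm_%d%d: chunked context can be enabled\n", caps.major, caps.minor);
        }
    } else {
        std::printf("No NVIDIA GPU detected\n");
    }
}
```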
backends/trtllm/include/backend.h (deleted, 144 lines)
@@ -1,144 +0,0 @@
//
// Created by Morgan Funtowicz on 6/30/24.
//

#ifndef TGI_TRTLLM_BACKEND_H
#define TGI_TRTLLM_BACKEND_H

#include <array>
#include <cmath>
#include <filesystem>
#include <span>
#include <vector>

#include <nlohmann/json.hpp>

#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/executor/executor.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>

using json = nlohmann::json;
namespace tle = tensorrt_llm::executor;

#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)

namespace huggingface::tgi::backends {
    using RequestId = tle::IdType;
    using TokenId = tle::TokenIdType;

    const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
        "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
    constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
        "Submitting inference [{}] to the executor ({:d} already in-flight)");
    constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
        "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");

    /**
     * Initialize all the components required by TRTLLM.
     * It is required to call this function before attempting to load any engine
     */
    void InitializeBackend();

    /**
     * Initialize logging mechanism
     */
    void InitializeLogging();

    /**
     *
     * @param config TensorRT-LLM configuration object
     * @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
     * @return
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

    /**
     *
     * @param worldSize
     * @param workerPath
     * @return
     */
    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;

    /**
     * Get the sampling configuration from the parameters provided by TGI
     * @param topK
     * @param topP
     * @param temperature
     * @param repetition_penalty
     * @param frequency_penalty
     * @param seed
     * @return
     */
    tle::SamplingConfig GetSamplingConfig(
        uint32_t topK,
        float_t topP,
        float_t temperature,
        float_t repetition_penalty,
        float_t frequency_penalty,
        uint64_t seed
    ) noexcept;

    /**
     * Attempt to retrieve the
     * @param generationConfigPath
     * @return
     */
    std::optional<std::list<std::vector<TokenId>>>
    GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;

    /**
     *
     */
    class TensorRtLlmBackend {
    private:
        const json config;
        tle::Executor executor;

        /** Frequently accessed variables cached here **/
        uint32_t maxNumTokens;
        std::list<std::vector<TokenId>> stopWords;

    public:
        explicit TensorRtLlmBackend(
            const std::filesystem::path &engineFolder,
            const std::filesystem::path &executorWorker
        );

        /**
         * Query the executor for the number of token available for pulling
         * @return
         */
        [[nodiscard]] size_t NumResponsesReady() const;

        /**
         * Submit a new generation task to the executor
         * @param tokens
         * @param topK
         * @param topP
         * @param temperature
         * @param repetitionPenalty
         * @param frequencyPenalty
         * @param seed
         * @return Request id related to this generation for reference
         */
        [[nodiscard]] RequestId Submit(
            const std::vector<TokenId> &tokens,
            uint32_t maxNewTokens,
            int32_t topK,
            float_t topP,
            float_t temperature,
            float_t repetitionPenalty,
            float_t frequencyPenalty,
            uint64_t seed
        );

        [[nodiscard]] std::vector<tle::Response> PullNewTokens();
    };
}

#endif //TGI_TRTLLM_BACKEND_H
@ -1,75 +0,0 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/11/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_FFI_H
|
||||
#define TGI_TRTLLM_BACKEND_FFI_H
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include "backend.h"
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
class TensorRtLlmBackendImpl;
|
||||
}
|
||||
|
||||
// Template to support returning error from TllmException back to Rust in a Result<>
|
||||
#include <tensorrt_llm/common/tllmException.h>
|
||||
|
||||
namespace rust::behavior {
|
||||
template<typename Try, typename Fail>
|
||||
static void trycatch(Try &&func, Fail &&fail) noexcept try {
|
||||
func();
|
||||
} catch (tensorrt_llm::common::TllmException &e) {
|
||||
fail(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
#include "backends/trtllm/src/lib.rs.h"
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
|
||||
class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
|
||||
public:
|
||||
/***
|
||||
*
|
||||
* @param engineFolder
|
||||
* @param executorWorker
|
||||
*/
|
||||
TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
|
||||
|
||||
/***
|
||||
*
|
||||
* @param tokens
|
||||
* @param maxNewTokens
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
|
||||
uint64_t
|
||||
Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
|
||||
int32_t topK, float_t topP, float_t temperature,
|
||||
float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
|
||||
|
||||
/***
|
||||
* Pull the generation steps produced by the executor since the last call
|
||||
* @return
|
||||
*/
|
||||
std::unique_ptr<std::vector<GenerationStep>> PullTokens();
|
||||
};
|
||||
|
||||
/***
|
||||
* Create a new TensorRT-LLM backend instance from the provided engine folder and executor worker
|
||||
* @param engineFolder
|
||||
* @return
|
||||
*/
|
||||
std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_FFI_H
|
@ -1,59 +0,0 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/23/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
#define TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <fmt/base.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
namespace huggingface::hardware::cuda {
|
||||
|
||||
#define AMPERE_SM_MAJOR 8
|
||||
#define HOPPER_SM_MAJOR 9
|
||||
|
||||
/**
|
||||
* Store information about the version of the CUDA Compute Capabilities detected on the device
|
||||
*/
|
||||
struct CudaComputeCapabilities {
|
||||
int32_t major;
|
||||
int32_t minor;
|
||||
|
||||
[[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
|
||||
|
||||
[[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
|
||||
};
|
||||
|
||||
CudaComputeCapabilities GetCudaComputeCapabilities() {
|
||||
// Get the compute capabilities of the current hardware
|
||||
nvmlDevice_t device;
|
||||
CudaComputeCapabilities capabilities{0, 0};
|
||||
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
|
||||
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
|
||||
if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
|
||||
SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
|
||||
}
|
||||
}
|
||||
|
||||
return capabilities;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of GPUs detected. If detection fails, return std::nullopt
|
||||
* @return
|
||||
*/
|
||||
std::optional<size_t> GetNumDevices() {
|
||||
uint32_t numGpus = 0;
|
||||
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
|
||||
return std::optional(numGpus);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
|
@ -1,203 +0,0 @@
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
|
||||
#include <fmt/ranges.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "hardware.h"
|
||||
|
||||
|
||||
void huggingface::tgi::backends::InitializeLogging() {
|
||||
#ifdef NDEBUG
|
||||
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
||||
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
||||
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
|
||||
if (log_level == "debug")
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
else
|
||||
spdlog::set_level(spdlog::level::info);
|
||||
}
|
||||
#else
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
#endif
|
||||
}
|
||||
|
||||
void huggingface::tgi::backends::InitializeBackend() {
|
||||
SPDLOG_INFO("Initializing Backend...");
|
||||
nvmlInit_v2();
|
||||
initTrtLlmPlugins();
|
||||
|
||||
InitializeLogging();
|
||||
|
||||
SPDLOG_INFO("Backend Executor Version: {}", tle::version());
|
||||
const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
|
||||
if (numGpus.has_value()) {
|
||||
SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
|
||||
} else {
|
||||
SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]]
|
||||
tle::ParallelConfig
|
||||
huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
|
||||
auto mode = tle::CommunicationMode::kLEADER;
|
||||
std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
|
||||
|
||||
if (worldSize > 1) {
|
||||
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
|
||||
mode = tle::CommunicationMode::kORCHESTRATOR;
|
||||
orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
|
||||
} else {
|
||||
SPDLOG_INFO("Detected single engine deployment, using leader mode");
|
||||
}
|
||||
|
||||
return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
|
||||
}
|
||||
|
||||
[[nodiscard]]
|
||||
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
|
||||
tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
|
||||
|
||||
// Retrieve the compute capabilities to enable some options at runtime
|
||||
const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
|
||||
|
||||
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
|
||||
const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
|
||||
execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
|
||||
|
||||
// Define some configuration variables
|
||||
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
|
||||
execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
|
||||
execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
|
||||
return execConfig;
|
||||
}
|
||||
|
||||
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
|
||||
const uint32_t topK,
|
||||
const float_t topP,
|
||||
const float_t temperature,
|
||||
const float_t repetition_penalty,
|
||||
const float_t frequency_penalty,
|
||||
const uint64_t seed) noexcept {
|
||||
|
||||
return tle::SamplingConfig(
|
||||
1, // TGI only uses a single beam
|
||||
topK,
|
||||
topP,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
seed,
|
||||
temperature,
|
||||
temperature,
|
||||
std::nullopt,
|
||||
repetition_penalty,
|
||||
std::nullopt,
|
||||
frequency_penalty
|
||||
);
|
||||
}
|
||||
|
||||
std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
|
||||
huggingface::tgi::backends::GetStopWordsFromConfig(
|
||||
const std::filesystem::path &generationConfigPath) noexcept {
|
||||
if (exists(generationConfigPath)) {
|
||||
const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
|
||||
if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
|
||||
SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
|
||||
std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords(eosTokenIds.size());
|
||||
|
||||
const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
|
||||
return {tokenIdObj.template get<tle::TokenIdType>()};
|
||||
};
|
||||
|
||||
std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
|
||||
return stopWords;
|
||||
} else {
|
||||
SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
|
||||
}
|
||||
} else {
|
||||
SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
|
||||
const std::filesystem::path &enginesFolder,
|
||||
const std::filesystem::path &executorWorker
|
||||
) :
|
||||
config(json::parse(std::ifstream(enginesFolder / "config.json"))),
|
||||
executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
|
||||
GetExecutorConfig(config, executorWorker.string())) {
|
||||
|
||||
SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get<std::string_view>());
|
||||
|
||||
// Ensure we have enough GPUs on the system
|
||||
const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
|
||||
const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
|
||||
if (numGpus < worldSize) {
|
||||
SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
|
||||
// todo : raise exception to catch on rust side
|
||||
}
|
||||
|
||||
// Cache variables
|
||||
maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
|
||||
|
||||
// Attempt to discover stopWords from the generation_config.json
|
||||
const auto generationConfigPath = enginesFolder / "generation_config.json";
|
||||
stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
|
||||
}
|
||||
|
||||
[[nodiscard("Returned number of requests needs to be consumed")]]
|
||||
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
|
||||
#ifdef NDEBUG
|
||||
return executor.getNumResponsesReady();
|
||||
#else
|
||||
const auto numResponses = executor.getNumResponsesReady();
|
||||
if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
|
||||
return numResponses;
|
||||
#endif
|
||||
}
|
||||
|
||||
[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
|
||||
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
|
||||
const std::vector<tle::TokenIdType> &tokens,
|
||||
const uint32_t maxNewTokens,
|
||||
const int32_t topK,
|
||||
const float_t topP,
|
||||
const float_t temperature,
|
||||
const float_t repetitionPenalty,
|
||||
const float_t frequencyPenalty,
|
||||
const uint64_t seed
|
||||
) {
|
||||
const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
|
||||
#ifndef NDEBUG
|
||||
{
|
||||
const auto &iterations = executor.getLatestIterationStats();
|
||||
const auto &lastIteration = iterations.front();
|
||||
|
||||
SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
|
||||
SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
|
||||
SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
|
||||
|
||||
// Build the request
|
||||
auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
|
||||
request.setStopWords(stopWords);
|
||||
|
||||
// Submit to the executor for batching
|
||||
return executor.enqueueRequest(request);
|
||||
}
|
||||
|
||||
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
|
||||
return executor.awaitResponses();
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
|
||||
set -ex
|
||||
|
||||
TRT_VER_BASE="10.4.0"
|
||||
TRT_VER_BASE="10.6.0"
|
||||
TRT_VER_FULL="${TRT_VER_BASE}.26"
|
||||
CUDA_VER="12.6"
|
||||
CUDNN_VER="9.5.0.50-1"
|
||||
|
@ -1,89 +0,0 @@
|
||||
//
|
||||
// Created by mfuntowicz on 6/30/24.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <exception>
|
||||
#include <filesystem>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <iterator>
|
||||
#include <ranges>
|
||||
#include <vector>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
#include "backends/trtllm/include/ffi.h"
|
||||
|
||||
|
||||
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
|
||||
const std::string_view &engineFolder,
|
||||
const std::string_view &executorWorker
|
||||
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
|
||||
|
||||
|
||||
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
|
||||
rust::Slice<const uint32_t> tokens,
|
||||
uint32_t maxNewTokens,
|
||||
int32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed) {
|
||||
|
||||
// This will copy all the items from the initial slice
|
||||
std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
|
||||
return TensorRtLlmBackend::Submit(
|
||||
std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
|
||||
}
|
||||
|
||||
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
|
||||
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
|
||||
const auto responses = TensorRtLlmBackend::PullNewTokens();
|
||||
|
||||
auto steps = std::make_unique<std::vector<GenerationStep>>();
|
||||
steps->reserve(responses.size());
|
||||
|
||||
#ifndef NDEBUG
|
||||
SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
|
||||
#endif
|
||||
|
||||
// Transform tle::Response to GenerationStep
|
||||
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
|
||||
const auto reqId = r.getRequestId();
|
||||
if (!r.hasError()) {
|
||||
const auto result = r.getResult();
|
||||
return GenerationStep{
|
||||
reqId,
|
||||
static_cast<uint32_t>(result.outputTokenIds[0][0]),
|
||||
result.logProbs.value()[0][0],
|
||||
result.isFinal,
|
||||
false,
|
||||
std::string()
|
||||
};
|
||||
} else {
|
||||
return GenerationStep{
|
||||
reqId,
|
||||
0,
|
||||
0.0,
|
||||
true,
|
||||
true,
|
||||
std::move(r.getErrorMsg())
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
return steps;
|
||||
}
|
||||
|
||||
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
|
||||
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
|
||||
SPDLOG_INFO("Creating TensorRT-LLM Backend");
|
||||
// Unconditionally call this to initialize and discover TRTLLM plugins
|
||||
InitializeBackend();
|
||||
|
||||
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
|
||||
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
|
||||
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
|
||||
}
|
@ -4,10 +4,11 @@ pub mod errors;
|
||||
mod looper;
|
||||
mod utils;
|
||||
|
||||
#[cxx::bridge(namespace = "huggingface::tgi::backends")]
|
||||
#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
|
||||
mod ffi {
|
||||
/// Struct used as shared type between rust and C++ to represent the result
|
||||
/// of a single decoding iteration
|
||||
#[cxx_name = "generation_step_t"]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GenerationStep {
|
||||
request_id: u64,
|
||||
@ -19,9 +20,10 @@ mod ffi {
|
||||
}
|
||||
|
||||
unsafe extern "C++" {
|
||||
include!("backends/trtllm/src/ffi.cpp");
|
||||
include!("backends/trtllm/csrc/ffi.hpp");
|
||||
|
||||
/// Represent an instance of the underlying TensorRT-LLM backend
|
||||
#[cxx_name = "tensorrt_llm_backend_t"]
|
||||
type TensorRtLlmBackendImpl;
|
||||
|
||||
/// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
|
||||
@ -38,21 +40,18 @@ mod ffi {
|
||||
/// ```
|
||||
///
|
||||
/// ```
|
||||
#[rust_name = "create_tensorrt_llm_backend"]
|
||||
fn CreateTensorRtLlmBackend(
|
||||
fn create_backend_from_engine_folder(
|
||||
engine_folder: &str,
|
||||
executor_worker: &str,
|
||||
) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
|
||||
|
||||
#[rust_name = "num_responses_ready"]
|
||||
fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
|
||||
fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
|
||||
|
||||
#[rust_name = "submit"]
|
||||
fn Submit(
|
||||
fn submit(
|
||||
self: Pin<&mut TensorRtLlmBackendImpl>,
|
||||
tokens: &[u32],
|
||||
max_new_tokens: u32,
|
||||
top_k: i32,
|
||||
top_k: u32,
|
||||
top_p: f32,
|
||||
temperature: f32,
|
||||
repetition_penalty: f32,
|
||||
@ -60,9 +59,10 @@ mod ffi {
|
||||
seed: u64,
|
||||
) -> Result<u64>;
|
||||
|
||||
#[rust_name = "pull_tokens"]
|
||||
fn PullTokens(
|
||||
fn pull_tokens(
|
||||
self: Pin<&mut TensorRtLlmBackendImpl>,
|
||||
) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
|
||||
|
||||
fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,13 @@
|
||||
use std::hint;
|
||||
use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use cxx::UniquePtr;
|
||||
use hashbrown::HashMap;
|
||||
use std::hint;
|
||||
use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use tokenizers::Tokenizer;
|
||||
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
|
||||
use tokio::sync::TryAcquireError;
|
||||
use tokio::task::{spawn_blocking, JoinHandle};
|
||||
use tokio::task::spawn_blocking;
|
||||
use tokio::time::Instant;
|
||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||
use tracing::{debug, error, warn};
|
||||
@ -22,7 +21,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
|
||||
use text_generation_router::{FinishReason, Token};
|
||||
|
||||
use crate::errors::TensorRtLlmBackendError;
|
||||
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
|
||||
use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
|
||||
use crate::utils::first_line;
|
||||
|
||||
type InferResult<T> = Result<T, InferError>;
|
||||
@ -30,9 +29,10 @@ type InferResult<T> = Result<T, InferError>;
|
||||
/// Wraps the request along with the channel used to stream the decoded tokens back to the client
|
||||
struct GenerationContext {
|
||||
request: ValidGenerateRequest,
|
||||
streamer: UnboundedSender<InferResult<InferStreamResponse>>,
|
||||
tokens: Vec<u32>,
|
||||
start: Option<Instant>,
|
||||
queued: Instant,
|
||||
streamer: UnboundedSender<InferResult<InferStreamResponse>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
@ -58,31 +58,22 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens
|
||||
struct DecodedTokenContext {
|
||||
token: DecodedToken,
|
||||
start: Option<Instant>,
|
||||
queued: Instant,
|
||||
channel: UnboundedSender<InferResult<InferStreamResponse>>,
|
||||
}
|
||||
|
||||
fn executor_status_looper(
|
||||
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
|
||||
max_inflight_requests: usize,
|
||||
mut waiting_requests: UnboundedReceiver<GenerationContext>,
|
||||
post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
|
||||
tokenizer: Tokenizer,
|
||||
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
|
||||
mut backlog: UnboundedReceiver<GenerationContext>,
|
||||
) {
|
||||
// Track the tuple (request_id, stream) for each request
|
||||
let mut in_flights =
|
||||
HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
|
||||
|
||||
// TODO: Does it need a spin-loop?
|
||||
'scheduler: loop {
|
||||
// Is there any request pending to be scheduled?
|
||||
let awaiting_requests = waiting_requests.len();
|
||||
let awaiting_requests = backlog.len();
|
||||
for _ in 0..awaiting_requests {
|
||||
// Retrieve all the requests
|
||||
if let Some(mut ctx) = waiting_requests.blocking_recv() {
|
||||
if let Some(ctx) = backlog.blocking_recv() {
|
||||
// Submit all the request to the executor and move the context to the in-flight tracker
|
||||
let request = &ctx.request;
|
||||
let generation_params = &request.parameters;
|
||||
@ -93,7 +84,7 @@ fn executor_status_looper(
|
||||
match backend.pin_mut().submit(
|
||||
&input_ids.unwrap(), // This is checked beforehand in validate()
|
||||
stopping_params.max_new_tokens,
|
||||
generation_params.top_k as i32,
|
||||
generation_params.top_k,
|
||||
generation_params.top_p,
|
||||
generation_params.temperature,
|
||||
generation_params.repetition_penalty,
|
||||
@ -103,7 +94,6 @@ fn executor_status_looper(
|
||||
Ok(request_id) => {
|
||||
// Insert the context linked to the generated request id in the tracker
|
||||
debug!("[in-flight] Added {}", request_id);
|
||||
ctx.start = Some(Instant::now());
|
||||
in_flights.insert(request_id, ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
@ -117,29 +107,43 @@ fn executor_status_looper(
|
||||
}
|
||||
}
|
||||
};
|
||||
} else {
|
||||
break 'scheduler;
|
||||
}
|
||||
}
|
||||
|
||||
if backend.num_responses_ready() > 0 {
|
||||
match backend.pin_mut().pull_tokens() {
|
||||
if backend.num_tokens_ready() > 0 {
|
||||
let mut backend = backend.pin_mut();
|
||||
match backend.as_mut().pull_tokens() {
|
||||
Ok(responses) => {
|
||||
// Iterate through all the decoded token
|
||||
for step in responses.deref() {
|
||||
if let Some(ctx) = in_flights.get(&step.request_id) {
|
||||
// Remove from tracked requests
|
||||
let parcel =
|
||||
DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
|
||||
token: dt,
|
||||
start: ctx.start,
|
||||
queued: ctx.queued,
|
||||
channel: ctx.streamer.clone(),
|
||||
});
|
||||
if let Some(ctx) = in_flights.get_mut(&step.request_id) {
|
||||
// Update the starting timestamp if not set
|
||||
// This value might not be the actual real starting time of the request
|
||||
// on the executor side - Need to expose more info from the executor to
|
||||
// retrieve this value
|
||||
// TODO : Expose actual real starting time for a request on FFI layer
|
||||
if ctx.start.is_none() {
|
||||
ctx.start = Some(Instant::now());
|
||||
}
|
||||
|
||||
// Submit the work to the post_processor
|
||||
let posted = post_processor_sender.send((step.request_id, parcel));
|
||||
// Try to map the generation step to a DecodedToken
|
||||
let response = match DecodedToken::try_from(step) {
|
||||
Ok(decoded_token) => {
|
||||
post_process_decoded_token(&tokenizer, ctx, decoded_token)
|
||||
}
|
||||
Err(err) => Err(err),
|
||||
};
|
||||
|
||||
if posted.is_err() || step.is_final {
|
||||
debug!("Removing {}", step.request_id);
|
||||
// Attempt to send back the response to the client
|
||||
if let Err(_) = ctx.streamer.send(response) {
|
||||
// Client has dropped, remove from tracked requests
|
||||
debug!(
|
||||
"Client dropped - removing request {} from tracked requests",
|
||||
step.request_id
|
||||
);
|
||||
backend.as_mut().cancel(step.request_id);
|
||||
let _ = in_flights.remove(&step.request_id);
|
||||
}
|
||||
} else {
|
||||
@ -159,54 +163,36 @@ fn executor_status_looper(
|
||||
}
|
||||
}
|
||||
|
||||
fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
|
||||
tokenizer: Tokenizer,
|
||||
max_inflight_requests: usize,
|
||||
mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
|
||||
) {
|
||||
let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
|
||||
|
||||
'post_processor: loop {
|
||||
if decoded_tokens.is_closed() {
|
||||
warn!("Post processor IPC is closed, loop will exit now.");
|
||||
break 'post_processor;
|
||||
}
|
||||
|
||||
if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
|
||||
match decoded {
|
||||
Ok(ctx) => {
|
||||
states
|
||||
.entry(request_id)
|
||||
.and_modify(|s| s.push(*&ctx.token.id))
|
||||
.or_insert_with(|| {
|
||||
let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
|
||||
state.push(*&ctx.token.id);
|
||||
state
|
||||
});
|
||||
|
||||
let out = match tokenizer.decode(&[ctx.token.id], false) {
|
||||
fn post_process_decoded_token(
|
||||
tokenizer: &Tokenizer,
|
||||
ctx: &mut GenerationContext,
|
||||
decoded_token: DecodedToken,
|
||||
) -> InferResult<InferStreamResponse> {
|
||||
match tokenizer.decode(&[decoded_token.id], false) {
|
||||
Ok(text) => {
|
||||
let is_special =
|
||||
tokenizer.get_added_vocabulary().is_special_token(&text);
|
||||
let is_special = tokenizer.get_added_vocabulary().is_special_token(&text);
|
||||
let token = Token {
|
||||
id: ctx.token.id,
|
||||
id: decoded_token.id,
|
||||
text,
|
||||
logprob: ctx.token.log_prob,
|
||||
logprob: decoded_token.log_prob,
|
||||
special: is_special,
|
||||
};
|
||||
|
||||
let out = if !ctx.token.is_final {
|
||||
// Append the token to the tracked generated tokens
|
||||
ctx.tokens.push(token.id);
|
||||
|
||||
// Map the correct response depending on the step is final or not
|
||||
let out = if !decoded_token.is_final {
|
||||
InferStreamResponse::Intermediate {
|
||||
token,
|
||||
top_tokens: vec![],
|
||||
}
|
||||
} else {
|
||||
let tokens = states.remove(&request_id).unwrap();
|
||||
let text = tokenizer.decode(&tokens, true);
|
||||
let text = tokenizer.decode(&ctx.tokens, true);
|
||||
let generated_text = GeneratedText {
|
||||
text: text.unwrap(),
|
||||
generated_tokens: tokens.len() as u32,
|
||||
finish_reason: FinishReason::EndOfSequenceToken,
|
||||
generated_tokens: ctx.tokens.len() as u32,
|
||||
finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason
|
||||
seed: None,
|
||||
};
|
||||
|
||||
@ -222,17 +208,6 @@ fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
|
||||
Ok(out)
|
||||
}
|
||||
Err(err) => Err(GenerationError(err.to_string())),
|
||||
};
|
||||
|
||||
if let Err(_) = ctx.channel.send(out) {
|
||||
warn!("Failed to send decoded token back to the user")
|
||||
}
|
||||
}
|
||||
Err(_err) => {
|
||||
todo!("what do we do?")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -277,11 +252,7 @@ fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
|
||||
|
||||
unsafe impl Send for TensorRtLlmBackendImpl {}
|
||||
|
||||
pub struct TensorRtLlmBackendV2 {
|
||||
executor_looper: JoinHandle<()>,
|
||||
post_processor_looper: JoinHandle<()>,
|
||||
executor: UnboundedSender<GenerationContext>,
|
||||
}
|
||||
pub struct TensorRtLlmBackendV2(UnboundedSender<GenerationContext>);
|
||||
|
||||
impl TensorRtLlmBackendV2 {
|
||||
pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
|
||||
@ -295,32 +266,17 @@ impl TensorRtLlmBackendV2 {
|
||||
|
||||
// Allocate the IPC layer to communicate with the backend
|
||||
let (executor_sender, executor_receiver) = unbounded_channel();
|
||||
let (post_processor_sender, post_processor_receiver) = unbounded_channel();
|
||||
|
||||
// Create the FFI backend
|
||||
let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
|
||||
let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
|
||||
.map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
|
||||
|
||||
// Executor looper is responsible for scheduling and pulling request states at a regular interval
|
||||
let executor_looper = spawn_blocking(move || {
|
||||
executor_status_looper(
|
||||
backend,
|
||||
max_inflight_requests,
|
||||
executor_receiver,
|
||||
post_processor_sender,
|
||||
)
|
||||
spawn_blocking(move || {
|
||||
executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
|
||||
});
|
||||
|
||||
// Post processor looper is responsible for receiving batches of tokens, decoding them and sending them back to the user
|
||||
let post_processor_looper = spawn_blocking(move || {
|
||||
post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
|
||||
});
|
||||
|
||||
Ok(TensorRtLlmBackendV2 {
|
||||
executor_looper,
|
||||
post_processor_looper,
|
||||
executor: executor_sender,
|
||||
})
|
||||
Ok(TensorRtLlmBackendV2(executor_sender))
|
||||
}
|
||||
|
||||
fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
|
||||
@ -354,20 +310,21 @@ impl TensorRtLlmBackendV2 {
|
||||
impl Backend for TensorRtLlmBackendV2 {
|
||||
fn schedule(
|
||||
&self,
|
||||
inner: ValidGenerateRequest,
|
||||
request: ValidGenerateRequest,
|
||||
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
|
||||
Self::validate(&inner)?;
|
||||
Self::validate(&request)?;
|
||||
|
||||
// Open-up the stream to send tokens
|
||||
let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
|
||||
|
||||
// Send the context to the executor for scheduling
|
||||
let queued = Instant::now();
|
||||
match self.executor.send(GenerationContext {
|
||||
request: inner,
|
||||
match self.0.send(GenerationContext {
|
||||
request,
|
||||
streamer,
|
||||
tokens: Vec::with_capacity(256),
|
||||
start: None,
|
||||
queued,
|
||||
streamer,
|
||||
}) {
|
||||
Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
|
||||
Err(_) => Err(GenerationError(
|
||||
@ -377,6 +334,6 @@ impl Backend for TensorRtLlmBackendV2 {
|
||||
}
|
||||
|
||||
async fn health(&self, _: bool) -> bool {
|
||||
!self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
|
||||
true
|
||||
}
|
||||
}
|
||||
|
@ -3,14 +3,15 @@ use std::path::{Path, PathBuf};
|
||||
use clap::Parser;
|
||||
use hf_hub::api::tokio::{Api, ApiBuilder};
|
||||
use hf_hub::{Cache, Repo, RepoType};
|
||||
use tokenizers::Tokenizer;
|
||||
use tracing::info;
|
||||
|
||||
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
|
||||
use text_generation_backends_trtllm::TensorRtLlmBackendV2;
|
||||
use text_generation_router::server::get_base_tokenizer;
|
||||
use text_generation_router::server::{
|
||||
get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer,
|
||||
};
|
||||
use text_generation_router::usage_stats::UsageStatsLevel;
|
||||
use text_generation_router::{server, HubTokenizerConfig};
|
||||
use text_generation_router::{server, HubTokenizerConfig, Tokenizer};
|
||||
|
||||
/// App Configuration
|
||||
#[derive(Parser, Debug)]
|
||||
@ -61,7 +62,7 @@ struct Args {
|
||||
#[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
|
||||
executor_worker: PathBuf,
|
||||
#[clap(default_value = "on", long, env)]
|
||||
usage_stats: usage_stats::UsageStatsLevel,
|
||||
usage_stats: UsageStatsLevel,
|
||||
#[clap(default_value = "2000000", long, env)]
|
||||
payload_limit: usize,
|
||||
}
|
||||
@ -126,18 +127,18 @@ async fn get_tokenizer(
|
||||
|
||||
// Load tokenizer and model info
|
||||
let (
|
||||
tokenizer_filename,
|
||||
_config_filename,
|
||||
tokenizer_config_filename,
|
||||
config_filename,
|
||||
_tokenizer_config_filename,
|
||||
_preprocessor_config_filename,
|
||||
_processor_config_filename,
|
||||
_model_info,
|
||||
) = match api {
|
||||
Type::None => (
|
||||
Some(local_path.join("tokenizer.json")),
|
||||
Some(local_path.join("config.json")),
|
||||
Some(local_path.join("tokenizer_config.json")),
|
||||
Some(local_path.join("preprocessor_config.json")),
|
||||
Some(local_path.join("processor_config.json")),
|
||||
None,
|
||||
),
|
||||
Type::Api(api) => {
|
||||
let api_repo = api.repo(Repo::with_revision(
|
||||
@ -146,21 +147,23 @@ async fn get_tokenizer(
|
||||
revision.unwrap_or_else(|| "main").to_string(),
|
||||
));
|
||||
|
||||
let tokenizer_filename = match api_repo.get("tokenizer.json").await {
|
||||
Ok(tokenizer_filename) => Some(tokenizer_filename),
|
||||
Err(_) => get_base_tokenizer(&api, &api_repo).await,
|
||||
};
|
||||
let config_filename = api_repo.get("config.json").await.ok();
|
||||
let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
|
||||
let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
|
||||
let processor_config_filename = api_repo.get("processor_config.json").await.ok();
|
||||
|
||||
let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
|
||||
Some(model_info)
|
||||
} else {
|
||||
tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
|
||||
None
|
||||
};
|
||||
(
|
||||
tokenizer_filename,
|
||||
config_filename,
|
||||
tokenizer_config_filename,
|
||||
preprocessor_config_filename,
|
||||
processor_config_filename,
|
||||
model_info,
|
||||
)
|
||||
}
|
||||
Type::Cache(cache) => {
|
||||
@ -170,24 +173,55 @@ async fn get_tokenizer(
|
||||
revision.clone().unwrap_or_else(|| "main").to_string(),
|
||||
));
|
||||
(
|
||||
repo.get("tokenizer.json"),
|
||||
repo.get("config.json"),
|
||||
repo.get("tokenizer_config.json"),
|
||||
repo.get("preprocessor_config.json"),
|
||||
repo.get("processor_config.json"),
|
||||
None,
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
// Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
|
||||
let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
|
||||
{
|
||||
HubTokenizerConfig::from_file(filename)
|
||||
// let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
|
||||
// {
|
||||
// HubTokenizerConfig::from_file(filename)
|
||||
// } else {
|
||||
// tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
|
||||
// };
|
||||
|
||||
// let tokenizer_config = tokenizer_config.unwrap_or_else(|| {
|
||||
// tracing::warn!("Could not find tokenizer config locally and no API specified");
|
||||
// HubTokenizerConfig::default()
|
||||
// });
|
||||
|
||||
let tokenizer: Tokenizer = {
|
||||
use pyo3::prelude::*;
|
||||
pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||
py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?;
|
||||
Ok(())
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
tracing::error!("Failed to import python tokenizer {err}");
|
||||
})
|
||||
.or_else(|err| {
|
||||
let out = legacy_tokenizer_handle(config_filename.as_ref());
|
||||
out.ok_or(err)
|
||||
})
|
||||
.expect("We cannot load a tokenizer");
|
||||
let filename = "out/tokenizer.json";
|
||||
if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
|
||||
Tokenizer::Rust(tok)
|
||||
} else {
|
||||
tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
|
||||
Tokenizer::Python {
|
||||
tokenizer_name: tokenizer_name.to_string(),
|
||||
revision: revision.map(|revision| revision.to_string()),
|
||||
trust_remote_code: false,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
|
||||
Some(tokenizer)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@ -258,14 +292,18 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
|
||||
}
|
||||
|
||||
// Create the backend
|
||||
let tokenizer = get_tokenizer(
|
||||
match get_tokenizer(
|
||||
&tokenizer_name,
|
||||
tokenizer_config_path.as_deref(),
|
||||
revision.as_deref(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to retrieve tokenizer implementation");
|
||||
|
||||
.expect("Failed to retrieve tokenizer implementation")
|
||||
{
|
||||
Tokenizer::Python { .. } => Err(TensorRtLlmBackendError::Tokenizer(
|
||||
"Failed to retrieve Rust based tokenizer".to_string(),
|
||||
)),
|
||||
Tokenizer::Rust(tokenizer) => {
|
||||
info!("Successfully retrieved tokenizer {}", &tokenizer_name);
|
||||
let backend = TensorRtLlmBackendV2::new(
|
||||
tokenizer,
|
||||
@ -305,3 +343,5 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +0,0 @@
|
||||
//
|
||||
// Created by mfuntowicz on 7/2/24.
|
||||
//
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include "../include/backend.h"
|
||||
|
||||
TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
|
||||
const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
|
||||
const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
|
||||
|
||||
spdlog::info("Loading config from: {}", absolute(engines).string());
|
||||
huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
|
||||
}
|
152
backends/trtllm/tests/test_backend.cpp
Normal file
@ -0,0 +1,152 @@
|
||||
//
|
||||
// Created by mfuntowicz on 12/3/24.
|
||||
//
|
||||
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <tensorrt_llm/executor/executor.h>
|
||||
|
||||
#include "backend.hpp"
|
||||
|
||||
|
||||
|
||||
using namespace huggingface::tgi::backends::trtllm;
|
||||
|
||||
TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
|
||||
{
|
||||
const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}};
|
||||
const auto generation_config = generation_config_t(config_j);
|
||||
|
||||
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6));
|
||||
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(0.95, 1e-6));
|
||||
|
||||
// Stop words
|
||||
REQUIRE_FALSE(generation_config.stop_words.empty());
|
||||
REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
|
||||
|
||||
for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
|
||||
{
|
||||
// Currently we do not support multi-token stop words
|
||||
REQUIRE(lhs.size() == 1);
|
||||
REQUIRE(rhs.size() == 1);
|
||||
REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("parse generation_config.json default", "[generation_config_t]")
|
||||
{
|
||||
const json config_j = {{"eos_token_id", {1,2,3}}};
|
||||
const auto generation_config = generation_config_t(config_j);
|
||||
|
||||
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
|
||||
REQUIRE_FALSE(generation_config.stop_words.empty());
|
||||
REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
|
||||
|
||||
for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
|
||||
{
|
||||
// Currently we do not support multi-token stop words
|
||||
REQUIRE(lhs.size() == 1);
|
||||
REQUIRE(rhs.size() == 1);
|
||||
REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("parse generation_config.json empty", "[generation_config_t]")
|
||||
{
|
||||
const json config_j = {{"eos_token_id", {}}};
|
||||
const auto generation_config = generation_config_t(config_j);
|
||||
|
||||
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
|
||||
REQUIRE(generation_config.stop_words.empty());
|
||||
|
||||
const json config_j2 = {};
|
||||
const auto generation_config2 = generation_config_t(config_j2);
|
||||
|
||||
REQUIRE_THAT(generation_config2.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
REQUIRE_THAT(generation_config2.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
|
||||
|
||||
REQUIRE(generation_config2.stop_words.empty());
|
||||
}
|
||||
|
||||
TEST_CASE("parallel_config single", "[backend_workspace_t]")
|
||||
{
|
||||
// Generate temporary folder
|
||||
const auto tmp_p = std::filesystem::temp_directory_path();
|
||||
const auto config_p = tmp_p / "config.json";
|
||||
const auto generation_config_p = tmp_p / "generation_config.json";
|
||||
|
||||
// Generate content
|
||||
std::ofstream o_config(config_p);
|
||||
o_config << R"({"pretrained_config": {"mapping": {"world_size": 2}}})"_json;
|
||||
o_config.close();
|
||||
|
||||
std::ofstream o_generation_config(generation_config_p);
|
||||
o_generation_config << R"({"eos_token_id": []})"_json;
|
||||
o_generation_config.close();
|
||||
|
||||
const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
|
||||
const auto parallel = workspace.parallel_config();
|
||||
REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kORCHESTRATOR);
|
||||
REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);
|
||||
|
||||
std::filesystem::remove(config_p);
|
||||
std::filesystem::remove(generation_config_p);
|
||||
}
|
||||
|
||||
TEST_CASE("parallel_config multi", "[backend_workspace_t]")
|
||||
{
|
||||
// Generate temporary folder
|
||||
const auto tmp_p = std::filesystem::temp_directory_path();
|
||||
const auto config_p = tmp_p / "config.json";
|
||||
const auto generation_config_p = tmp_p / "generation_config.json";
|
||||
|
||||
// Generate content
|
||||
std::ofstream o_config(config_p);
|
||||
o_config << R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json;
|
||||
o_config.close();
|
||||
|
||||
std::ofstream o_generation_config(generation_config_p);
|
||||
o_generation_config << R"({"eos_token_id": []})"_json;
|
||||
o_generation_config.close();
|
||||
|
||||
const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
|
||||
const auto parallel = workspace.parallel_config();
|
||||
REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kLEADER);
|
||||
REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);
|
||||
|
||||
std::filesystem::remove(config_p);
|
||||
std::filesystem::remove(generation_config_p);
|
||||
}
|
||||
|
||||
TEST_CASE("executor_config", "[backend_workspace_t]")
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]")
|
||||
{
|
||||
const sampling_params_t params = {40, 0.95, 0.9, 1.0, 0.6, 2014};
|
||||
const auto config = static_cast<tle::SamplingConfig>(params);
|
||||
|
||||
REQUIRE(config.getTopK().has_value());
|
||||
REQUIRE(config.getTopK().value() == params.top_k);
|
||||
|
||||
REQUIRE(config.getSeed().has_value());
|
||||
REQUIRE(config.getSeed().value() == params.seed);
|
||||
|
||||
REQUIRE(config.getTopP().has_value());
|
||||
REQUIRE_THAT(*config.getTopP(), Catch::Matchers::WithinAbs(params.top_p, 1e-6f));
|
||||
|
||||
REQUIRE(config.getRepetitionPenalty().has_value());
|
||||
REQUIRE_THAT(*config.getRepetitionPenalty(), Catch::Matchers::WithinAbs(params.repetition_penalty, 1e-6f));
|
||||
|
||||
REQUIRE(config.getFrequencyPenalty().has_value());
|
||||
REQUIRE_THAT(*config.getFrequencyPenalty(), Catch::Matchers::WithinAbs(params.frequency_penalty, 1e-6f));
|
||||
|
||||
REQUIRE(config.getTemperature().has_value());
|
||||
REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
|
||||
}
|
82
backends/trtllm/tests/test_hardware.cpp
Normal file
@ -0,0 +1,82 @@
|
||||
//
|
||||
// Created by mfuntowicz on 11/16/24.
|
||||
//
|
||||
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include "../csrc/hardware.hpp"
|
||||
|
||||
using namespace huggingface::tgi::hardware::cuda;
|
||||
|
||||
TEST_CASE("is_at_least_<arch>") {
|
||||
const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
|
||||
REQUIRE(VOLTA_CAPABILITIES.is_at_least_volta());
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_turing());
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ampere());
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ada_lovelace());
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_hopper());
|
||||
|
||||
const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
|
||||
REQUIRE(TURING_CAPABILITIES.is_at_least_volta());
|
||||
REQUIRE(TURING_CAPABILITIES.is_at_least_turing());
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ampere());
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ada_lovelace());
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_hopper());
|
||||
|
||||
const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least_volta());
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least_turing());
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least_ampere());
|
||||
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_ada_lovelace());
|
||||
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_hopper());
|
||||
|
||||
const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_volta());
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_turing());
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ampere());
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ada_lovelace());
|
||||
REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least_hopper());
|
||||
|
||||
const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0);
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least_volta());
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least_turing());
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least_ampere());
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least_ada_lovelace());
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least_hopper());
|
||||
}
|
||||
|
||||
TEST_CASE("is_at_least") {
|
||||
const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
|
||||
REQUIRE(VOLTA_CAPABILITIES.is_at_least(VOLTA));
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(TURING));
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(AMPERE));
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(HOPPER));
|
||||
|
||||
const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
|
||||
REQUIRE(TURING_CAPABILITIES.is_at_least(VOLTA));
|
||||
REQUIRE(TURING_CAPABILITIES.is_at_least(TURING));
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(AMPERE));
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(HOPPER));
|
||||
|
||||
const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least(VOLTA));
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least(TURING));
|
||||
REQUIRE(AMPERE_CAPABILITIES.is_at_least(AMPERE));
|
||||
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(HOPPER));
|
||||
|
||||
const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(VOLTA));
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(TURING));
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(AMPERE));
|
||||
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||
REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least(HOPPER));
|
||||
|
||||
const static auto HOPPER_CAPABILITIES = compute_capabilities_t (9, 0);
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(VOLTA));
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(TURING));
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
|
||||
}
|
@ -104,6 +104,10 @@ impl Backend for BackendV2 {
|
||||
}
|
||||
.is_ok()
|
||||
}
|
||||
|
||||
fn start_health(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Batching logic
|
||||
|
@ -111,6 +111,10 @@ impl Backend for BackendV3 {
|
||||
}
|
||||
.is_ok()
|
||||
}
|
||||
|
||||
fn start_health(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Batching logic
|
||||
|
@ -217,8 +217,8 @@ impl Health for ShardedClient {
|
||||
input_chunks: Some(Input {
|
||||
chunks: vec![Chunk::Text("liveness".into()).into()],
|
||||
}),
|
||||
truncate: 10,
|
||||
add_special_tokens: true,
|
||||
truncate: 1,
|
||||
add_special_tokens: false,
|
||||
prefill_logprobs: false,
|
||||
parameters: Some(NextTokenChooserParameters {
|
||||
temperature: 1.0,
|
||||
@ -241,7 +241,7 @@ impl Health for ShardedClient {
|
||||
top_n_tokens: 0,
|
||||
// Block 0 is reserved for health checks
|
||||
blocks: vec![0],
|
||||
slots: (0..16).collect(),
|
||||
slots: vec![0],
|
||||
cache_len: 0,
|
||||
adapter_id: None,
|
||||
chunk_len: None,
|
||||
|
@ -10,7 +10,7 @@
|
||||
"name": "Apache 2.0",
|
||||
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
||||
},
|
||||
"version": "3.0.1-dev0"
|
||||
"version": "3.0.2-dev0"
|
||||
},
|
||||
"paths": {
|
||||
"/": {
|
||||
|
@ -17,6 +17,8 @@
|
||||
title: Using TGI with Intel GPUs
|
||||
- local: installation
|
||||
title: Installation from source
|
||||
- local: multi_backend_support
|
||||
title: Multi-backend support
|
||||
|
||||
- local: architecture
|
||||
title: Internal Architecture
|
||||
@ -45,6 +47,10 @@
|
||||
- local: basic_tutorials/train_medusa
|
||||
title: Train Medusa
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- local: backends/trtllm
|
||||
title: TensorRT-LLM
|
||||
title: Backends
|
||||
- sections:
|
||||
- local: reference/launcher
|
||||
title: All TGI CLI options
|
||||
|
@ -9,8 +9,10 @@ A high-level architecture diagram can be seen here:
|
||||
This diagram shows the following separate components:
|
||||
|
||||
- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
|
||||
- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
|
||||
- **The launcher** is a helper that is able to launch one or several model servers (if the model is sharded), and it launches the router with compatible arguments.
|
||||
- **The model server**, responsible for receiving the gRPC requests and for running the inference on the model. If the model is sharded across multiple accelerators (e.g. multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
|
||||
|
||||
Note that for other backends (e.g. TRTLLM) the model server and launcher are specific to the backend.
|
||||
|
||||
The router and the model server can run on two different machines; they do not need to be deployed together.
|
||||
|
||||
|
81
docs/source/backends/trtllm.md
Normal file
@ -0,0 +1,81 @@
|
||||
# TensorRT-LLM backend
|
||||
|
||||
The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
|
||||
that uses NVIDIA's TensorRT library for inference acceleration.
|
||||
It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.
|
||||
|
||||
To use the TRTLLM backend you need to compile `engines` for the models you want to use.
|
||||
Each `engine` must be compiled on the same GPU architecture that you will use for inference.
|
||||
|
||||
## Supported models
|
||||
|
||||
Check the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) to see which models are
|
||||
supported.
|
||||
|
||||
## Compiling engines
|
||||
|
||||
You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you
|
||||
want to use.
|
||||
|
||||
```bash
|
||||
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
# Install the Hugging Face Hub CLI
|
||||
python -m pip install "huggingface_hub[hf_transfer]"
|
||||
|
||||
# Login to the Hugging Face Hub
|
||||
huggingface-cli login
|
||||
|
||||
# Create a directory to store the model
|
||||
mkdir -p /tmp/models/$MODEL_NAME
|
||||
|
||||
# Create a directory to store the compiled engine
|
||||
mkdir -p /tmp/engines/$MODEL_NAME
|
||||
|
||||
# Download the model
|
||||
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME
|
||||
|
||||
# Compile the engine using Optimum-NVIDIA
|
||||
docker run \
|
||||
--rm \
|
||||
-it \
|
||||
--gpus=1 \
|
||||
-v /tmp/models/$MODEL_NAME:/model \
|
||||
-v /tmp/engines/$MODEL_NAME:/engine \
|
||||
huggingface/optimum-nvidia \
|
||||
optimum-cli export trtllm \
|
||||
--tp=1 \
|
||||
--pp=1 \
|
||||
--max-batch-size=128 \
|
||||
--max-input-length 4096 \
|
||||
--max-output-length 8192 \
|
||||
--max-beams-width=1 \
|
||||
--destination /engine \
|
||||
$MODEL_NAME
|
||||
```
|
||||
|
||||
Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
|
||||
|
||||
## Using the TRTLLM backend
|
||||
|
||||
Run TGI-TRTLLM Docker image with the compiled engine:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
--gpus 1 \
|
||||
-it \
|
||||
--rm \
|
||||
-p 3000:3000 \
|
||||
-e MODEL=$MODEL_NAME \
|
||||
-e PORT=3000 \
|
||||
-e HF_TOKEN='hf_XXX' \
|
||||
-v /tmp/engines/$MODEL_NAME:/data \
|
||||
ghcr.io/huggingface/text-generation-inference:latest-trtllm \
|
||||
--executor-worker executorWorker \
|
||||
--model-id /data/$MODEL_NAME
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
To develop the TRTLLM backend, you can use [dev containers](https://containers.dev/) located in
|
||||
the `.devcontainer` directory.
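
As a rough sketch of how this can be driven from the command line (assuming the `@devcontainers/cli` npm package and that the backend crate is named `text-generation-backends-trtllm`; adjust to your environment):

```bash
# Hypothetical workflow using the devcontainers CLI from the repository root.
# The container is described by .devcontainer/devcontainer.json and Dockerfile_trtllm.
npm install -g @devcontainers/cli

# Build the image and start the dev container
devcontainer up --workspace-folder .

# Run a command inside it, e.g. build the TRTLLM backend crate
devcontainer exec --workspace-folder . cargo build -p text-generation-backends-trtllm
```

You can also open the same folder directly from an IDE that supports dev containers instead of using the CLI.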
|
@ -19,6 +19,6 @@ docker run --gpus all \
|
||||
--shm-size 1g \
|
||||
-e HF_TOKEN=$token \
|
||||
-p 8080:80 \
|
||||
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 \
|
||||
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 \
|
||||
--model-id $model
|
||||
```
|
||||
|
@ -80,7 +80,7 @@ Raw results
|
||||
|
||||
| | | | | |
|
||||
|---|---|---|---|---|
|
||||
|2nd run ||**TGI v3**|**vLLM**|**Amount of req**|
|
||||
|2nd run ||**TGI v3** (time in s)|**vLLM** (s)|**Amount of req**|
|
||||
|**Llama 3.1 8b**|Small test - L4 - 8B|17.5|19.9|200|
|
||||
|**Llama 3.1 8b**|Long test* - L4 - 8B|53|57|10|
|
||||
|**Llama 3.1 8b**|Small test - 4xL4 - 8B|4.8|6|200|
|
||||
@ -88,7 +88,7 @@ Raw results
|
||||
|**Llama 3.1 70b**|Small test - 8XH100 - 70B|6.2|7.4|200|
|
||||
|**Llama 3.1 70b**|Long test - 8H100 - 70B|2|27.5|20|
|
||||
||||||
|
||||
|1st run ||TGI|vLLM|Amount of req|
|
||||
|1st run ||TGI (s)|vLLM (s)|Amount of req|
|
||||
|**Llama 3.1 8b**|Small test - L4|19.9|19.9|200|
|
||||
|**Llama 3.1 8b**|Long test (10) - L4|49.8|55|10|
|
||||
|**Llama 3.1 8b**|Small test - 4xL4|13|12.6|200|
|
||||
|
@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes
```

4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`) or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes-nf4
```

You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and about 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to either run an already GPTQ-quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize gptq
```

Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.0.0-rocm \
ghcr.io/huggingface/text-generation-inference:3.0.1-rocm \
--model-id $model
```

@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.0.0-intel-xpu \
ghcr.io/huggingface/text-generation-inference:3.0.1-intel-xpu \
--model-id $model --cuda-graphs 0
```

@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.0.0-intel-cpu \
ghcr.io/huggingface/text-generation-inference:3.0.1-intel-cpu \
--model-id $model --cuda-graphs 0
```

@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.0.0 \
ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```
13
docs/source/multi_backend_support.md
Normal file
13
docs/source/multi_backend_support.md
Normal file
@ -0,0 +1,13 @@
# Multi-backend support

TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
With multi-backend support, you can choose the backend that best suits your needs,
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
TGI remains consistent across backends, allowing you to switch between them seamlessly.

**Supported backends:**
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
It utilizes specialized optimizations and custom kernels for enhanced performance.
However, it requires a model-specific compilation step for each GPU architecture.
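Because the HTTP API stays the same across backends, client code does not need to change when the serving backend does. The sketch below (not part of the diff) illustrates this with `huggingface_hub.InferenceClient`; the endpoint URL and prompt are placeholder assumptions, not values taken from the documentation above.

```python
# Minimal sketch: the same client code works regardless of which TGI backend
# (CUDA, TRT-LLM, ...) is serving the model, since the HTTP API is identical.
# Assumes a TGI instance is already listening on http://localhost:8080.
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:8080")

# Stream generated tokens from the server as they are produced.
for token in client.text_generation(
    "What is Deep Learning?",
    max_new_tokens=64,
    stream=True,
):
    print(token, end="", flush=True)
```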
@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.0.0 \
ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```
@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

```bash
docker run ghcr.io/huggingface/text-generation-inference:3.0.0 --help
docker run ghcr.io/huggingface/text-generation-inference:3.0.1 --help
```

</Tip>
@ -163,7 +163,7 @@ hub = {

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.0"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.1"),
env=hub,
role=role,
)
@ -33,6 +33,13 @@ pub trait Backend {
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError>;

async fn health(&self, current_health: bool) -> bool;

/// The state of the health on startup
/// Typically false, or true if the backend includes
/// a warmup phase.
fn start_health(&self) -> bool {
false
}
}

/// Inference struct
@ -75,7 +82,7 @@ impl Infer {
let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));

// Backend health
let backend_health = Arc::new(AtomicBool::new(false));
let backend_health = Arc::new(AtomicBool::new(backend.start_health()));

Self {
validation,
@ -459,7 +459,7 @@ pub struct CompletionRequest {
pub prompt: Prompt,

/// The maximum number of tokens that can be generated in the chat completion.
#[serde(default)]
#[serde(default, alias = "max_completion_tokens")]
#[schema(default = "1024", example = "32")]
pub max_tokens: Option<u32>,
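With the serde alias above, an OpenAI-style payload that sends `max_completion_tokens` deserializes into the same `max_tokens` field. The sketch below is illustrative only: it assumes a TGI server reachable on localhost:8080 that exposes the OpenAI-compatible `/v1/completions` route; the model name and prompt are placeholders.

```python
# Hypothetical request against a locally running TGI server (the URL is an
# assumption). After this change, both payloads should populate the same
# `max_tokens` field server-side thanks to the serde alias.
import requests

url = "http://localhost:8080/v1/completions"
base = {"model": "tgi", "prompt": "What is Deep Learning?"}

legacy = requests.post(url, json={**base, "max_tokens": 32})
aliased = requests.post(url, json={**base, "max_completion_tokens": 32})

print(legacy.status_code, aliased.status_code)
```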
@ -1593,7 +1593,7 @@ pub fn schema() -> ApiDoc {
ApiDoc
}

fn py_resolve_tokenizer(
pub fn py_resolve_tokenizer(
py: pyo3::Python,
tokenizer_name: &str,
revision: Option<&str>,
@ -1619,7 +1619,7 @@ fn py_resolve_tokenizer(
Ok(())
}

fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
// XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
// and state-spaces/mamba-130m
tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");
@ -1,4 +1,4 @@
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
commit_rocm := de990cd12537f78f74e40b5c8ee1a62d63d734dd

build-vllm-rocm:
if [ ! -d 'vllm' ]; then \
@ -219,7 +219,9 @@ def paged_reshape_and_cache(
raise ImportError(
f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
)
ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
ops.reshape_and_cache(
key, value, key_cache, value_cache, slots, "auto", 1.0, 1.0
)
elif SYSTEM == "ipex":
import intel_extension_for_pytorch as ipex
@ -6,26 +6,42 @@ from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import Seqlen
from text_generation_server.utils.log import log_master
from loguru import logger
import vllm._custom_ops as ops

major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5

_PARTITION_SIZE_V1V2 = 512
_PARTITION_SIZE_V1V2 = 1024
_PARTITION_SIZE_CUSTOM = 256

_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
_ON_MI250_MI300 = any(
arch in _GPU_ARCH for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"]
)

use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"}
ENGINE = "triton" if use_triton else "ck"

use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0"
try:
if use_rocm_custom_paged_attn:
from vllm._custom_C import paged_attention_custom
except ImportError as e:
log_master(
logger.info,
f"Custom Paged Attention not available. Complete error: {e}",

def _use_rocm_custom_paged_attention(
qtype: torch.dtype,
head_size: int,
block_size: int,
gqa_ratio: int,
max_seq_len: int,
) -> bool:
# rocm custom page attention not support on navi (gfx1*)
return (
use_rocm_custom_paged_attn
and _ON_MI250_MI300
and (qtype == torch.half or qtype == torch.bfloat16)
and (head_size == 64 or head_size == 128)
and (block_size == 16 or block_size == 32)
and (gqa_ratio >= 1 and gqa_ratio <= 16)
and max_seq_len <= 131072
)
use_rocm_custom_paged_attn = False
def paged_attention(
@ -66,13 +82,8 @@ def paged_attention(

num_kv_heads = kv_cache.key.shape[1]
gqa_ratio = num_heads // num_kv_heads
use_custom = (
use_rocm_custom_paged_attn
and (query.dtype == torch.half or query.dtype == torch.bfloat16)
and (head_size == 128 or head_size == 64)
and (block_size == 16 or block_size == 32)
and (gqa_ratio >= 1 and gqa_ratio <= 16)
and max_s <= 32768
use_custom = _use_rocm_custom_paged_attention(
query.dtype, head_size, block_size, gqa_ratio, max_s
)

if not use_custom:
@ -90,8 +101,6 @@ def paged_attention(
# V1 to avoid the overhead of reduction. Also, if the number of
# sequences or heads is large, we use V1 since there is enough work
# to parallelize.
import vllm._custom_ops as ops

use_v1 = (
max_s <= 8192
and (max_num_partitions == 1 or num_seqs * num_heads > 512)
@ -103,7 +112,7 @@
query,
kv_cache.key,
kv_cache.value,
kv_head_mapping,
num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
@ -112,6 +121,7 @@
None,
"auto",
1.0,
1.0,
)
else:
# Run PagedAttention V2.
@ -137,7 +147,7 @@
query,
kv_cache.key,
kv_cache.value,
kv_head_mapping,
num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
@ -146,9 +156,10 @@
None,
"auto",
1.0,
1.0,
)
else:
paged_attention_custom(
ops.paged_attention_rocm(
out,
exp_sums,
max_logits,
@ -164,6 +175,10 @@
max_s,
None,
"auto",
1.0,
1.0,
None,
_PARTITION_SIZE,
)

return out
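For orientation, here is a small self-contained sketch (not part of the diff) of how the V1-vs-V2 dispatch above plays out for example shapes, using the `_PARTITION_SIZE_V1V2 = 1024` constant introduced in this hunk. It assumes `max_num_partitions` is the ceil-division of the longest sequence by the partition size, which is how such counts are typically computed; the shapes are made up.

```python
# Illustrative sketch only: reproduces the V1/V2 heuristic from the hunk above
# in plain Python so the effect of _PARTITION_SIZE_V1V2 = 1024 is easy to see.
_PARTITION_SIZE_V1V2 = 1024

def pick_kernel(max_s: int, num_seqs: int, num_heads: int) -> str:
    # Assumed: the longest sequence is split into ceil(max_s / partition_size) partitions.
    max_num_partitions = (max_s + _PARTITION_SIZE_V1V2 - 1) // _PARTITION_SIZE_V1V2
    use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
    return "V1" if use_v1 else "V2"

print(pick_kernel(max_s=512, num_seqs=4, num_heads=32))    # single partition -> V1
print(pick_kernel(max_s=4096, num_seqs=64, num_heads=32))  # plenty of parallel work -> V1
print(pick_kernel(max_s=16384, num_seqs=1, num_heads=8))   # long sequence -> V2
```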
@ -72,7 +72,7 @@ if SYSTEM == "cuda":
return normed_hidden_states, residual

elif SYSTEM == "rocm":
from vllm._C import ops
import vllm._custom_ops as ops

class FastLayerNorm(nn.LayerNorm):
def forward(self, hidden_states, residual=None):
@ -121,6 +121,27 @@ class FastRMSNorm(nn.Module):
residual is not None,
)
return out, residual if residual is not None else hidden_states
elif SYSTEM == "rocm":
# We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
if residual is not None:
ops.fused_add_rms_norm(
hidden_states,
residual,
self.weight.data,
self.variance_epsilon,
)
return hidden_states, residual

residual = hidden_states

out = torch.empty_like(hidden_states)
ops.rms_norm(
out,
hidden_states,
self.weight.data,
self.variance_epsilon,
)
return out, residual
elif hidden_states.shape[-1] > 8192:
if residual is not None:
hidden_states += residual
@ -164,20 +185,6 @@ class FastRMSNorm(nn.Module):
res = hidden_states

return normed_hidden_states, res
elif SYSTEM == "rocm":
# We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
if residual is not None:
hidden_states += residual
residual = hidden_states

out = torch.empty_like(hidden_states)
ops.rms_norm(
out,
hidden_states,
self.weight.data,
self.variance_epsilon,
)
return out, residual
else:
raise ValueError(
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
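As an aside (not part of the diff), a plain-PyTorch reference of the math the fused kernels above compute; this is a sketch for orientation, not the vLLM implementation itself.

```python
# Reference-only sketch:
# rms_norm(x, w, eps)              -> w * x / sqrt(mean(x^2) + eps)
# fused_add_rms_norm(x, r, w, eps) -> adds the residual first, then normalizes.
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight

def fused_add_rms_norm_ref(x, residual, weight, eps):
    x = x + residual  # residual add happens before normalization
    return rms_norm_ref(x, weight, eps), x  # second value feeds the next residual

x = torch.randn(2, 8, dtype=torch.float16)
w = torch.ones(8, dtype=torch.float16)
print(rms_norm_ref(x, w, 1e-6).shape)  # torch.Size([2, 8])
```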
@ -11,10 +11,10 @@ if SYSTEM == "rocm":

if ROCM_USE_SKINNY_GEMM:
try:
from vllm import _custom_C
import vllm._custom_ops as ops
except Exception as e:
raise ImportError(
f"Could not load `vllm._custom_C` for ROCm skinny gemm. Full error: {e}"
f"Could not load `vllm._custom_ops` for ROCm skinny gemm. Full error: {e}"
)

@ -95,12 +95,12 @@ class FastLinearROCm(torch.nn.Module):
out = torch.empty(
inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device
)
_custom_C.wvSpltK(weight, inp, out, n, self.cu_count)
ops.wvSpltK(weight, inp, out, n, self.cu_count)
elif m % 4 == 0 and n == 1 and k <= 8192:
out = torch.empty(
inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device
)
_custom_C.LLMM1(weight, inp, out, 4)
ops.LLMM1(weight, inp, out, 4)
else:
out = F.linear(inp, weight)
@ -24,10 +24,7 @@ from text_generation_server.utils.weights import (
UnquantizedWeight,
)

if SYSTEM == "rocm":
from .fused_moe_rocm import grouped_topk
from vllm.model_executor.layers.fused_moe import fused_topk
elif SYSTEM == "ipex":
if SYSTEM == "ipex":
from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
else:
from moe_kernels.fused_moe import fused_topk, grouped_topk
@ -1,52 +0,0 @@
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import torch
import torch.distributed


# TODO: Remove the functions once moe_kernel are built for ROCM
def grouped_topk(
hidden_states: torch.Tensor,
gating_output: torch.Tensor,
topk: int,
renormalize: bool,
num_expert_group: int = 0,
topk_group: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
scores = torch.softmax(gating_output, dim=-1)
num_token = scores.shape[0]
group_scores = (
scores.view(num_token, num_expert_group, -1).max(dim=-1).values
) # [n, n_group]
group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
1
] # [n, top_k_group]
group_mask = torch.zeros_like(group_scores) # [n, n_group]
group_mask.scatter_(1, group_idx, 1) # [n, n_group]
score_mask = (
group_mask.unsqueeze(-1)
.expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
.reshape(num_token, -1)
) # [n, e]
tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)

if renormalize:
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

return topk_weights, topk_ids
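To make the contract of the helper being removed here (and now provided by `moe_kernels`) concrete, a toy call with made-up shapes is sketched below; it assumes the `grouped_topk` definition shown above is in scope.

```python
# Illustrative call with made-up shapes (not from the diff): 4 tokens routed over
# 8 experts arranged in 4 groups, keeping the 2 best groups, then the top-2
# experts overall. Assumes grouped_topk (defined above) is importable.
import torch

num_tokens, num_experts, hidden = 4, 8, 16
hidden_states = torch.randn(num_tokens, hidden)
gating_output = torch.randn(num_tokens, num_experts)

weights, ids = grouped_topk(
    hidden_states,
    gating_output,
    topk=2,
    renormalize=True,
    num_expert_group=4,
    topk_group=2,
)
print(weights.shape, ids.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
print(weights.sum(dim=-1))       # ~1.0 per token after renormalization
```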
@ -6,9 +6,7 @@ import torch.nn as nn
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.weights import UnquantizedWeight, Weights

if SYSTEM == "rocm":
from vllm.model_executor.layers.fused_moe import fused_moe
elif SYSTEM == "ipex":
if SYSTEM == "ipex":
from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
else:
from moe_kernels.fused_moe import fused_moe
@ -7,7 +7,7 @@ from text_generation_server.utils.import_utils import SYSTEM
if SYSTEM == "cuda":
import rotary_emb
elif SYSTEM == "rocm":
from vllm._C import ops
import vllm._custom_ops as ops
elif SYSTEM == "ipex":
import intel_extension_for_pytorch as ipex

@ -75,7 +75,7 @@ class CohereRotary(PositionRotaryEmbedding):

rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
elif SYSTEM == "rocm":
from vllm._C import ops
import vllm._custom_ops as ops

# NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
# Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
@ -23,9 +23,7 @@ from typing import Optional, List, Tuple, Any
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "rocm":
from vllm.model_executor.layers.fused_moe import fused_moe
elif SYSTEM == "ipex":
if SYSTEM == "ipex":
from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
else:
from moe_kernels.fused_moe import fused_moe
@ -43,9 +43,9 @@ from text_generation_server.utils.weights import Weights

if SYSTEM == "rocm":
try:
from vllm import _custom_C
import vllm._custom_ops as ops
except Exception as e:
raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


class DeepseekV2Config(PretrainedConfig):
@ -408,7 +408,7 @@ class DeepseekV2MLP(nn.Module):
dtype=hidden_states.dtype,
device="cuda",
)
_custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
ops.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
return self.down_proj(out, reduce=reduce)
else:
gate_up_states = self.gate_up_proj(hidden_states)
@ -91,7 +91,7 @@ class GPTJRotary(PositionRotaryEmbedding):

rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
elif SYSTEM == "rocm":
from vllm._C import ops
import vllm._custom_ops as ops

# NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
# Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
@ -64,9 +64,9 @@ if SYSTEM != "ipex":

if SYSTEM == "rocm":
try:
from vllm import _custom_C
import vllm._custom_ops as ops
except Exception as e:
raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


def load_attention(config, prefix: str, weights, layer_id):
@ -392,7 +392,7 @@ class LlamaMLP(nn.Module):
dtype=hidden_states.dtype,
device="cuda",
)
_custom_C.LLMM_Silu(
ops.LLMM_Silu(
self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8
)
return self.down_proj(out, adapter_data)
@ -49,9 +49,9 @@ from text_generation_server.layers.layernorm import (

if SYSTEM == "rocm":
try:
from vllm import _custom_C
import vllm._custom_ops as ops
except Exception as e:
raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


class MistralConfig(PretrainedConfig):
@ -318,7 +318,7 @@ class MistralMLP(nn.Module):
dtype=hidden_states.dtype,
device="cuda",
)
_custom_C.LLMM_Silu(
ops.LLMM_Silu(
self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8
)
return self.down_proj(out, adapter_data)
@ -78,6 +78,7 @@ class RWConfig(PretrainedConfig):
self.alibi = False
self.rotary = True
self.rope_theta = rope_theta
self.max_position_embeddings = 2048

self.vocab_size = vocab_size
# Backward compatibility with n_embed kwarg
@ -52,7 +52,7 @@ from loguru import logger
if SYSTEM == "cuda":
import dropout_layer_norm
elif SYSTEM == "rocm":
from vllm._C import ops
import vllm._custom_ops as ops
else:
dropout_layer_norm = None
@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OPT model."""

import random
from typing import List, Optional, Tuple, Union

@ -99,7 +100,7 @@ class OPTLearnedPositionalEmbedding(nn.Module):
self.offset = 2
self.weight = nn.Parameter(
weights.get_tensor(
f"{prefix + '.' if prefix else ''}decoder.embed_positions.weight"
f"{prefix if prefix else ''}decoder.embed_positions.weight"
)
)

@ -317,7 +318,6 @@ class OPTDecoderLayer(nn.Module):
super().__init__()
self.process_group = weights.process_group
self.hidden_size = config.hidden_size
prefix = f"{prefix + '.' if prefix else ''}decoder.layers.{layer_id}"
self.self_attn = OPTAttention(
config,
prefix=f"{prefix}.self_attn",
@ -478,7 +478,12 @@ class OPTDecoder(OPTPreTrainedModel):

self.layers = nn.ModuleList(
[
OPTDecoderLayer(layer_id, prefix, config, weights)
OPTDecoderLayer(
layer_id,
prefix=f"{prefix}decoder.layers.{layer_id}",
config=config,
weights=weights,
)
for layer_id in range(config.num_hidden_layers)
]
)
@ -755,6 +760,8 @@ class OPTModel(OPTPreTrainedModel):
class OPTForCausalLM(OPTPreTrainedModel):
def __init__(self, prefix, config, weights):
super().__init__(config)
if not prefix and any(s.startswith("model") for s in weights.routing.keys()):
prefix = "model"

self.model = OPTModel(prefix, config, weights)
@ -450,7 +450,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
width //= self.spatial_merge_size

# calculate the length of the text and image tokens
text_length = next_image_pos - current_pos
text_length = next_image_pos
start_idx = (
llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
)
@ -480,7 +480,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
)
llm_pos_ids_list.append(image_pos_ids)

current_pos = next_image_pos + time_steps * height * width
current_pos += next_image_pos + time_steps * height * width
image_index += 1

if current_pos < batch_input_ids.size(1):
@ -1304,6 +1304,7 @@ class FlashCausalLM(Model):

self.num_layers = config.num_hidden_layers
self.num_heads = config.num_attention_heads // self.process_group.size()
self.config = config
# Validation is done in the model itself
if num_kv_heads is None:
num_kv_heads = getattr(config, "num_key_value_heads", None)
@ -1594,7 +1595,10 @@
if max_total_tokens is None:
if get_support_chunking():
model_max_length = self.tokenizer.model_max_length
max_total_tokens = min(num_blocks * BLOCK_SIZE, model_max_length)
max_position_embeddings = self.config.max_position_embeddings
max_total_tokens = min(
num_blocks * BLOCK_SIZE, model_max_length, max_position_embeddings
)
else:
max_total_tokens = sum(batch.cache_lengths)
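A toy arithmetic example (values are made up, not from the diff) of the new cap, showing how `max_position_embeddings` now bounds `max_total_tokens` even when the tokenizer reports an effectively unbounded `model_max_length`.

```python
# Toy numbers: a model with a sentinel-like huge tokenizer.model_max_length but
# only 4096 max_position_embeddings, running with 1000 KV blocks of 16 slots.
BLOCK_SIZE = 16
num_blocks = 1000
model_max_length = 1_000_000_000
max_position_embeddings = 4096

# Before this change: min(16000, 1e9) -> 16000, which can exceed what the model supports.
# After this change, the position-embedding limit is part of the min:
max_total_tokens = min(num_blocks * BLOCK_SIZE, model_max_length, max_position_embeddings)
print(max_total_tokens)  # 4096
```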
@ -68,7 +68,8 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
elif config.model_type == "paligemma":
return "<image>" * config.text_config.num_image_tokens
elif config.model_type == "qwen2_vl":
num_pads = image_input.pixel_values.shape[0] // 4
grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id]
num_pads = grid_t * grid_h * grid_w // 4
padding = "<|image_pad|>" * num_pads
return f"<|vision_start|>{padding}<|vision_end|>"
else:
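A toy worked example (values made up, not from the diff) of the new per-image padding count: the placeholder count is now derived from that image's own vision grid rather than from the batch-level `pixel_values`, divided by 4, which corresponds to the 2x2 spatial merge of patches into tokens.

```python
# Made-up grid: a single image whose vision grid is 1 x 26 x 38 patches.
grid_t, grid_h, grid_w = 1, 26, 38
num_pads = grid_t * grid_h * grid_w // 4  # 2x2 patch merge -> one token per 4 patches
padding = "<|image_pad|>" * num_pads
print(num_pads)            # 247
print(len(padding) > 0)    # True; wrapped in <|vision_start|>...<|vision_end|> by the caller
```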