Merge branch 'main' into rocm-fp8-tensorwise

Mohit Sharma 2024-12-18 12:05:17 +00:00
commit 19688a0617
208 changed files with 2923 additions and 75678 deletions

View File

@ -0,0 +1,75 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.7rc1"
# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
WORKDIR /usr/src/text-generation-inference/backends/trtllm
FROM chef AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt update && apt install -y \
build-essential \
cmake \
curl \
gcc-14 \
g++-14 \
git \
git-lfs \
libssl-dev \
libucx-dev \
ninja-build \
pkg-config \
pipx \
python3 \
python3-dev \
python3-setuptools \
tar \
wget && \
pipx ensurepath
ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
# Install OpenMPI
FROM cuda-builder AS mpi-builder
ARG OMPI_VERSION
ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2"
RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \
mkdir /usr/src/mpi && \
tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
cd /usr/src/mpi && \
./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
make -j all && \
make install && \
rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
/opt/install_tensorrt.sh
# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
chmod -R a+w /root/.rustup && \
chmod -R a+w /root/.cargo
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
ENV MPI_HOME=/usr/local/mpi

View File

@ -0,0 +1,19 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/cpp
{
"name": "CUDA",
"build": {
"dockerfile": "Dockerfile_trtllm",
"context": ".."
},
"remoteEnv": {
"PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
},
"customizations" : {
"jetbrains" : {
"backend" : "CLion"
}
}
}

View File

@ -8,6 +8,7 @@ on:
description: Hardware
# options:
# - cuda
# - cuda-trtllm
# - rocm
# - intel
required: true
@ -52,6 +53,15 @@ jobs:
export platform=""
export extra_pytest=""
;;
cuda-trtllm)
export dockerfile="Dockerfile_trtllm"
export label_extension="-trtllm"
export docker_volume="/mnt/cache"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform=""
export extra_pytest=""
;;
rocm)
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
@ -137,7 +147,7 @@ jobs:
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=auto
latest=false
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference

View File

@ -37,7 +37,7 @@ jobs:
# fail-fast is true by default
fail-fast: false
matrix:
hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"]
hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"]
uses: ./.github/workflows/build.yaml # calls the one above ^
permissions:
contents: write

Cargo.lock generated (69 changed lines)
View File

@ -2850,20 +2850,6 @@ dependencies = [
"urlencoding",
]
[[package]]
name = "opentelemetry"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.13.0"
@ -2963,24 +2949,6 @@ dependencies = [
"thiserror",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
dependencies = [
"async-trait",
"futures-channel",
"futures-executor",
"futures-util",
"glob",
"once_cell",
"opentelemetry 0.24.0",
"percent-encoding",
"rand",
"thiserror",
]
[[package]]
name = "option-ext"
version = "0.2.0"
@ -4367,9 +4335,8 @@ dependencies = [
[[package]]
name = "text-generation-backends-trtllm"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"async-stream",
"async-trait",
"clap 4.5.21",
"cmake",
@ -4377,21 +4344,19 @@ dependencies = [
"cxx-build",
"hashbrown 0.14.5",
"hf-hub",
"log",
"pkg-config",
"pyo3",
"text-generation-router",
"thiserror",
"tokenizers",
"tokio",
"tokio-stream",
"tracing",
"tracing-opentelemetry 0.25.0",
"tracing-subscriber",
]
[[package]]
name = "text-generation-benchmark"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"average",
"clap 4.5.21",
@ -4411,7 +4376,7 @@ dependencies = [
[[package]]
name = "text-generation-client"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"async-trait",
"base64 0.22.1",
@ -4429,7 +4394,7 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"clap 4.5.21",
"ctrlc",
@ -4450,7 +4415,7 @@ dependencies = [
[[package]]
name = "text-generation-router"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"anyhow",
"async-stream",
@ -4501,7 +4466,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v2"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@ -4550,7 +4515,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v3"
version = "2.4.2-dev0"
version = "3.0.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@ -5086,24 +5051,6 @@ dependencies = [
"web-time 0.2.4",
]
[[package]]
name = "tracing-opentelemetry"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
dependencies = [
"js-sys",
"once_cell",
"opentelemetry 0.24.0",
"opentelemetry_sdk 0.24.1",
"smallvec",
"tracing",
"tracing-core",
"tracing-log 0.2.0",
"tracing-subscriber",
"web-time 1.1.0",
]
[[package]]
name = "tracing-opentelemetry-instrumentation-sdk"
version = "0.16.0"

View File

@ -20,7 +20,7 @@ default-members = [
resolver = "2"
[workspace.package]
version = "2.4.2-dev0"
version = "3.0.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

View File

@ -234,6 +234,7 @@ FROM kernel-builder AS vllm-builder
WORKDIR /usr/src
COPY server/Makefile-vllm Makefile
RUN pip install setuptools_scm
# Build specific version of vllm
RUN make build-vllm-rocm
@ -278,9 +279,9 @@ RUN git clone https://github.com/danieldk/marlin-kernels.git && \
FROM kernel-builder AS moe-kernels
WORKDIR /usr/src
ENV MOE_KERNELS_BRANCH=b74b163a2e28042068ac8355e07e6dde926a967a
ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd
ENV VLLM_TARGET_DEVICE=rocm
RUN git clone https://github.com/mht-sharma/moe-kernels.git && \
RUN git clone https://github.com/danieldk/moe-kernels.git && \
cd moe-kernels && \
git checkout ${MOE_KERNELS_BRANCH} && \
python setup.py install

View File

@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen
# Text Generation Inference base image for Intel
FROM intel/intel-extension-for-pytorch:2.3.110-xpu AS xpu
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu
USER root
@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/
RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit=2024.2.1-98 xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
# Text Generation Inference base env
ENV HF_HOME=/data \
@ -114,15 +114,8 @@ RUN cd server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV PATH=/opt/conda/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV CCL_ZE_IPC_EXCHANGE=sockets
ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
#ENV TORCH_LLM_ALLREDUCE=1
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0

View File

@ -1,5 +1,5 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
ARG OMPI_VERSION="4.1.6"
ARG OMPI_VERSION="4.1.7rc1"
# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
@ -10,7 +10,7 @@ COPY . .
RUN cargo chef prepare --recipe-path recipe.json
# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
@ -18,18 +18,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
build-essential \
cmake \
curl \
gcc \
g++ \
gcc-14 \
g++-14 \
git \
git-lfs \
libssl-dev \
libucx-dev \
ninja-build \
pkg-config \
pipx \
python3 \
python3-dev \
python3-setuptools \
tar \
wget
wget && \
pipx ensurepath
ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
cd backends/trtllm && \
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
python3 -m pip install transformers tokenizers
pipx ensurepath && \
pipx install --include-deps transformers tokenizers
WORKDIR /usr/local/tgi/bin
ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""

View File

@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```
And then you can make requests like
@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@ -151,7 +151,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```
### A note on Shared Memory (shm)
@ -196,14 +196,26 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T
You can also opt to install `text-generation-inference` locally.
First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda`:
First clone the repository and change directory into it:
```shell
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference
```
Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda` or `python venv`:
```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
#using conda
conda create -n text-generation-inference python=3.11
conda activate text-generation-inference
#using python venv
python3 -m venv .venv
source .venv/bin/activate
```
You may also need to install Protoc.

assets/v3_benchmarks.png (new binary file, 209 KiB, not shown)

View File

@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
endif ()
project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD 23)
include(FetchContent)
include(ExternalProject)
include(CheckCXXCompilerFlag)
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
@ -29,11 +30,20 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
#### External dependencies ####
include(cmake/fmt.cmake)
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
endif()
# Attempt to detect whether the compiler can emit a warning when it cannot apply named return value optimization (NRVO) to a function
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NRVO)
if(${COMPILER_SUPPORT_WARNING_ON_NRVO})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wnrvo")
endif()
# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
@ -41,15 +51,21 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
# $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
endif ()
# This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
message(STATUS "Building tests")
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2
GIT_TAG v3.6.0
URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
)
FetchContent_MakeAvailable(Catch2)
# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
else()
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)
endif ()
if(CMAKE_BUILD_TYPE MATCHES "Debug")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
endif()
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
include(CTest)
include(Catch)
# catch_discover_tests(tgi_trtllm_backend_tests)
catch_discover_tests(tgi_trtllm_backend_tests)
endif ()

View File

@ -7,20 +7,21 @@ homepage.workspace = true
[dependencies]
async-trait = "0.1"
async-stream = "0.3"
#async-stream = "0.3"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.14"
hf-hub = { workspace = true }
log = { version = "0.4", features = [] }
#log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { workspace = true }
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
thiserror = "1.0.63"
tracing = "0.1"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
#tracing-opentelemetry = "0.25"
#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
pyo3 = { workspace = true }
[build-dependencies]
cmake = "0.1"

View File

@ -4,7 +4,7 @@ use std::env;
use std::env::consts::ARCH;
use std::path::{absolute, PathBuf};
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.6";
const MPI_REQUIRED_VERSION: &str = "4.1";
@ -43,7 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
install_path = absolute(out_dir).expect("cannot happen").join(install_path);
}
let _ = cmake::Config::new(".")
let mut config = cmake::Config::new(".");
config
.uses_cxx11()
.generator("Ninja")
.profile(match is_debug {
@ -53,9 +54,16 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
.env("OPT_LEVEL", opt_level)
.define("CMAKE_INSTALL_PREFIX", &install_path)
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
.define("Python3_ROOT_DIR", "../venv")
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
.build();
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);
// Allow overriding which Python interpreter to use ...
if let Some(python3) = option_env!("Python3_EXECUTABLE") {
config.define("Python3_EXECUTABLE", python3);
}
config.build();
// Additional transitive CMake dependencies
let deps_folder = out_dir.join("build").join("_deps");
@ -90,26 +98,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
CFG.include_prefix = "backends/trtllm";
cxx_build::bridge("src/lib.rs")
.static_flag(true)
.include(deps_folder.join("fmt-src").join("include"))
.std("c++23")
.include(deps_folder.join("spdlog-src").join("include"))
.include(deps_folder.join("json-src").join("include"))
.include(deps_folder.join("trtllm-src").join("cpp").join("include"))
.include("/usr/local/cuda/include")
.include("/usr/local/tensorrt/include")
.file("src/ffi.cpp")
.std("c++20")
.define("NDEBUG", ndebug)
.include("csrc/")
.file("csrc/ffi.hpp")
.define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
.compile("tgi_trtllm_backend");
println!("cargo:rerun-if-changed=CMakeLists.txt");
println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
println!("cargo:rerun-if-changed=cmake/json.cmake");
println!("cargo:rerun-if-changed=cmake/fmt.cmake");
println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
println!("cargo:rerun-if-changed=include/backend.h");
println!("cargo:rerun-if-changed=lib/backend.cpp");
println!("cargo:rerun-if-changed=include/ffi.h");
println!("cargo:rerun-if-changed=src/ffi.cpp");
println!("cargo:rerun-if-changed=csrc/backend.hpp");
println!("cargo:rerun-if-changed=csrc/backend.cpp");
println!("cargo:rerun-if-changed=csrc/hardware.hpp");
println!("cargo:rerun-if-changed=csrc/ffi.hpp");
}
fn main() {

View File

@ -1,6 +0,0 @@
FetchContent_Declare(
fmt
DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
)
FetchContent_MakeAvailable(fmt)

View File

@ -1,6 +1,6 @@
fetchcontent_declare(
json
DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
# DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
)
fetchcontent_makeavailable(json)

View File

@ -1,6 +1,6 @@
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL ON)
set(SPDLOG_FMT_EXTERNAL OFF)
# Define the level at which SPDLOG_ compilation level is defined
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@ -11,7 +11,7 @@ endif ()
fetchcontent_declare(
spdlog
DOWNLOAD_EXTRACT_TIMESTAMP
# DOWNLOAD_EXTRACT_TIMESTAMP
URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
)
fetchcontent_makeavailable(spdlog)

View File

@ -11,6 +11,7 @@ set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
set(ENABLE_UCX OFF)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(FAST_BUILD ON)
set(NVTX_DISABLE OFF)
@ -20,11 +21,13 @@ else ()
set(NVTX_DISABLE ON)
endif ()
find_package(Python3 REQUIRED Interpreter)
fetchcontent_declare(
trtllm
GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
GIT_SHALLOW FALSE
GIT_REPOSITORY https://github.com/huggingface/TensorRT-LLM.git
GIT_TAG 1bb9ca4688805444f203647674bac1d7219d0579
GIT_SHALLOW ON
DOWNLOAD_EXTRACT_TIMESTAMP
)
fetchcontent_makeavailable(trtllm)

View File

@ -0,0 +1,79 @@
#include <ranges>
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include "backend.hpp"
#include "hardware.hpp"
namespace huggingface::tgi::backends::trtllm {
tle::ParallelConfig backend_workspace_t::parallel_config() const {
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
auto mode = tle::CommunicationMode::kLEADER;
std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
if (world_size > 1) {
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
mode = tle::CommunicationMode::kORCHESTRATOR;
orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
} else {
SPDLOG_INFO("Detected single engine deployment, using leader mode");
}
return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
}
tle::ExecutorConfig backend_workspace_t::executor_config() const {
// Retrieve the compute capabilities to enable some options at runtime
const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
// Allocate the config
tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);
// Set the parallel config as inferred
executor_config.setParallelConfig(parallel_config());
// Define some configuration variables
executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
return executor_config;
}
backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
size_t backend_t::num_tokens_ready() const noexcept {
return executor_.getNumResponsesReady();
}
std::expected<request_id_t, backend_error_t>
backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
return executor_.enqueueRequest(tle::Request {
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
static_cast<tle::SizeType32>(generation_params.max_new_tokens),
true,
(tle::SamplingConfig) sampling_params,
tle::OutputConfig { /* returnLogProbs= */ true },
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
workspace.generation_config().stop_words
});
}
std::vector<tle::Response> backend_t::pull_tokens() noexcept {
SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
return executor_.awaitResponses();
}
void backend_t::cancel(request_id_t request_id) noexcept {
SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
executor_.cancelRequest(request_id);
}
}
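For orientation, here is a minimal caller-side sketch of the `backend_t` lifecycle implemented above (submit, poll, pull, cancel). It is illustrative only and not part of this diff: the engine folder, `executorWorker` path and token ids are placeholders, and an actual run requires a TensorRT-LLM installation plus compiled engines.
```cpp
// Illustrative sketch, not part of this diff: driving backend_t end to end.
#include <filesystem>
#include <vector>
#include "backend.hpp"

using namespace huggingface::tgi::backends::trtllm;

int drive_backend() {
    // Placeholder paths: a real deployment points at compiled TRT-LLM engines and executorWorker
    backend_t backend{std::filesystem::path("/repository/engines"),
                      std::filesystem::path("/usr/local/tgi/bin/executorWorker")};

    const std::vector<token_id_t> prompt{1, 15043, 3186};                  // made-up token ids
    const auto maybe_id = backend.submit(
        prompt,
        generation_params_t{128},                                          // max_new_tokens
        sampling_params_t{10, 0.95f, 1.0f, 0.0f, 0.7f, 42});               // top_k, top_p, rep., freq., temp., seed

    if (!maybe_id.has_value())
        return static_cast<int>(maybe_id.error());                         // backend_error_t

    while (backend.num_tokens_ready() == 0) { /* wait for the executor to produce tokens */ }
    for (const auto &response : backend.pull_tokens()) {
        (void) response;                                                   // decode tle::Response here
    }
    backend.cancel(*maybe_id);                                             // optional early cancellation
    return 0;
}
```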

View File

@ -0,0 +1,231 @@
#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM
#include <cmath>
#include <cstdint>
#include <expected>
#include <fstream>
#include <list>
#include <span>
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>
#include <tensorrt_llm/executor/executor.h>
namespace huggingface::tgi::backends::trtllm {
namespace tle = tensorrt_llm::executor;
using json = nlohmann::json;
using request_id_t = uint64_t;
using token_id_t = tle::TokenIdType;
/**
* Represent the parameters used for generation
*/
struct generation_params_t {
uint32_t max_new_tokens;
};
/**
* Represent the parameters used to sample tokens from the logit distribution
*/
struct sampling_params_t {
uint32_t top_k;
float_t top_p;
float_t repetition_penalty;
float_t frequency_penalty;
float_t temperature;
uint64_t seed;
constexpr explicit operator tle::SamplingConfig() const {
return tle::SamplingConfig{
1,
top_k,
top_p,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
temperature,
std::nullopt,
std::nullopt,
repetition_penalty,
std::nullopt,
frequency_penalty,
std::nullopt
};
}
};
/**
* Represent possible values from transformers' `generation_config.json`.
* It usually stores default sampling parameters to use, such as top_p, temperature, etc.
*/
struct generation_config_t {
float_t top_p;
float_t temperature;
std::list<std::vector<int32_t>> stop_words;
constexpr explicit generation_config_t(const json &config) :
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
const auto &eos_token_id = config["/eos_token_id"_json_pointer];
std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
stop_words.emplace_back(1, token_id.template get<int32_t>());
});
SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
}
}
};
/**
* Helper class representing various items which are stored within the TensorRT-LLM engines folder and
* can be retrieved at runtime
*/
class backend_workspace_t {
private:
constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
std::ifstream config_f(path);
return json::parse(config_f);
};
std::filesystem::path engines_folder_;
std::filesystem::path executor_worker_path_;
json config_;
generation_config_t generation_config_;
public:
backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) :
engines_folder_(engines_folder),
executor_worker_path_(executor_worker_path),
config_(as_json(engines_folder / "config.json")),
generation_config_(as_json(engines_folder / "generation_config.json")) {};
backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :
engines_folder_(engines_folder),
executor_worker_path_(executor_worker_path),
config_(as_json(engines_folder / "config.json")),
generation_config_(as_json(engines_folder / "generation_config.json")) {};
/**
* Path to the folder containing the TensorRT-LLM engines
* @return local filesystem path to the folder
*/
[[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }
/**
* Accessor to the `generation_config_t` mapping the information stored in Hugging Face transformers'
* `generation_config.json`, which holds default generation parameters.
* @return `generation_config_t`
*/
[[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }
/**
* Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
* to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
* @return `tensorrt_llm::executor::ParallelConfig` instance
*/
[[nodiscard]] tle::ParallelConfig parallel_config() const;
/**
* Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
* to initialize `tensorrt_llm::executor::Executor`
* @return `tensorrt_llm::executor::ExecutorConfig` instance
*/
[[nodiscard]] tle::ExecutorConfig executor_config() const;
};
/**
* Error raised by the underlying backend implementation
*/
enum backend_error_t {
EXECUTOR_NOT_READY = 3,
EXECUTOR_SCHEDULING_FAILED = 4,
};
/**
* Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
* - schedule new request
* - pull status of submitted request(s)
* - cancel submitted request(s)
*/
class backend_t {
private:
backend_workspace_t workspace;
tle::Executor executor_;
public:
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
: backend_t(engines_folder, executor_worker_path) {};
/**
* Submit a new request to the executor
* @param token_ids
* @param generation_params
* @param sampling_params
* @return Either newly submitted request's id or the error why it failed to submit
*/
[[nodiscard("Discarded executor request_id needs to be assigned")]]
std::expected<request_id_t, backend_error_t>
submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
sampling_params_t sampling_params) noexcept;
/**
* Query the number of tokens available across all in-flight generations
* @return
*/
[[nodiscard("Pulling out the number of tokens")]]
size_t num_tokens_ready() const noexcept;
/**
* Pull out newly generated tokens from the executor
* @return
*/
[[nodiscard("")]]
std::vector<tle::Response> pull_tokens() noexcept;
/**
* Cancel the specified request on the executor
* @param request_id Identifier of the in-flight request to cancel
*/
void cancel(request_id_t) noexcept;
};
/**
* Create a TensorRT-LLM executor from a workspace
*/
const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
workspace.executor_config()};
};
}
/**
* Helper structures to define formatting strategies for various types in the backend
*/
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {
auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c,
format_context &ctx) const -> format_context::iterator {
return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
}
};
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c,
format_context &ctx) const -> format_context::iterator {
return fmt::format_to(
ctx.out(),
"sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
);
}
};
#endif
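To make the `generation_config_t` parsing above concrete, here is a small standalone sketch (not part of this diff) that applies the same `eos_token_id` handling to a made-up `generation_config.json` payload:
```cpp
// Standalone sketch mirroring generation_config_t's stop-word extraction; JSON values are invented.
#include <cstdint>
#include <iostream>
#include <list>
#include <vector>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;
    const auto config = json::parse(
        R"({"top_p": 0.9, "temperature": 0.6, "eos_token_id": [128001, 128008, 128009]})");

    // Each EOS token becomes its own single-token stop-word sequence, as in generation_config_t
    std::list<std::vector<int32_t>> stop_words;
    if (config.contains("eos_token_id") && config["eos_token_id"].is_array())
        for (const auto &token_id : config["eos_token_id"])
            stop_words.emplace_back(1, token_id.get<int32_t>());

    std::cout << "top_p=" << config.value("top_p", 1.0f)
              << " temperature=" << config.value("temperature", 1.0f)
              << " stop_words=" << stop_words.size() << '\n';              // prints stop_words=3
}
```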

View File

@ -0,0 +1,162 @@
#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI
#include <memory>
#include <thread>
#include <nvml.h>
#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
#include <spdlog/spdlog.h>
#include <backend.hpp>
#include <hardware.hpp>
namespace rust::behavior {
template<typename Try, typename Fail>
static void trycatch(Try &&func, Fail &&fail) noexcept try {
func();
} catch (tensorrt_llm::common::TllmException &e) {
fail(e.what());
}
}
namespace huggingface::tgi::backends::trtllm {
class tensorrt_llm_backend_t;
}
#include "backends/trtllm/src/lib.rs.h"
namespace huggingface::tgi::backends::trtllm {
std::once_flag backend_initialized_flag;
class tensorrt_llm_backend_t {
private:
backend_t inner_;
public:
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
: inner_(engine_folder, executor_worker_path) {}
size_t num_tokens_ready() const noexcept {
return inner_.num_tokens_ready();
}
request_id_t submit(
rust::Slice<const uint32_t> tokens,
uint32_t max_new_tokens,
uint32_t top_k,
float_t top_p,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
) {
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
// Submit the request to the executor and get back a potential request_id used to track request status
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
const auto maybe_request_id = inner_.submit(
signed_tokens,
{max_new_tokens},
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
);
// If we do have a value, let's return the request_id
if(maybe_request_id.has_value()) [[likely]] {
return *maybe_request_id;
} else {
SPDLOG_WARN("[FFI] Failed to submit request to the executor");
return maybe_request_id.error();
}
}
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
if(num_tokens_ready() > 0) [[likely]] {
const auto responses = inner_.pull_tokens();
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
// Transform tle::Response to GenerationStep
auto steps = std::make_unique<std::vector<generation_step_t>>();
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
const auto reqId = r.getRequestId();
if (!r.hasError()) [[likely]] {
const auto result = r.getResult();
return generation_step_t{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
result.logProbs.value()[0][0],
result.isFinal,
false,
std::string()
};
} else {
return generation_step_t{
reqId,
0,
0.0,
true,
true,
std::move(r.getErrorMsg())
};
}
});
return steps;
} else {
return std::make_unique<std::vector<generation_step_t>>();
}
}
void cancel(request_id_t requestId) noexcept {
SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId);
inner_.cancel(requestId);
}
};
void initialize_logging() {
#ifndef TGI_TRTLLM_BACKEND_DEBUG
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
return std::tolower(c);
});
if (log_level == "debug")
spdlog::set_level(spdlog::level::debug);
else
spdlog::set_level(spdlog::level::info);
}
#else
spdlog::set_level(spdlog::level::debug);
#endif
}
void initialize_tensorrt_llm_backend() {
SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version());
// Initialize everyone
initialize_logging();
nvmlInit_v2();
initTrtLlmPlugins();
const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
if (numGpus.has_value()) {
SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus);
} else {
SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system");
// todo: throw
}
}
std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
return std::make_unique<tensorrt_llm_backend_t>(
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
);
}
}
#endif
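The `rust::behavior::trycatch` specialization at the top of this file is the hook the `cxx` crate uses to turn a thrown `TllmException` into an `Err(...)` on the Rust side. Below is a standalone illustration of that shape only, with a stand-in exception type instead of the real TensorRT-LLM one and without the generated cxx glue:
```cpp
// Demo only: mirrors the trycatch customization point; demo_tllm_exception stands in for TllmException.
#include <iostream>
#include <stdexcept>
#include <string>

struct demo_tllm_exception : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Same shape as rust::behavior::trycatch: run the FFI call, route the exception message to `fail`
// instead of letting it unwind across the language boundary.
template<typename Try, typename Fail>
static void trycatch(Try &&func, Fail &&fail) noexcept try {
    func();
} catch (demo_tllm_exception &e) {
    fail(e.what());
}

int main() {
    std::string error;
    trycatch(
        [] { throw demo_tllm_exception("executor rejected the request"); },  // simulated backend failure
        [&](const char *msg) { error = msg; }                                // what Rust observes as Err(msg)
    );
    std::cout << (error.empty() ? std::string("Ok(...)") : "Err(" + error + ")") << '\n';
}
```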

View File

@ -0,0 +1,81 @@
#ifndef TGI_HARDWARE_CUDA
#define TGI_HARDWARE_CUDA
#include <cstdint>
#include <optional>
#include <nvml.h>
namespace huggingface::tgi::hardware::cuda {
static constexpr auto VOLTA = std::make_tuple(7u, 0u);
static constexpr auto TURING = std::make_tuple(7u, 5u);
static constexpr auto AMPERE = std::make_tuple(8u, 0u);
static constexpr auto HOPPER = std::make_tuple(9u, 0u);
static constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u);
/**
* Get the number of GPUs on the local machine
* @return std::nullopt if no device is available, otherwise >= 1
*/
inline std::optional<size_t> get_device_count() {
uint32_t numGpus = 0;
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
return numGpus;
}
return std::nullopt;
}
/**
* Store information about the version of the CUDA Compute Capabilities detected on the device
*/
struct compute_capabilities_t {
int32_t major;
int32_t minor;
compute_capabilities_t(): compute_capabilities_t(0) {}
explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) {
nvmlDevice_t device;
if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) {
nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
}
};
compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {}
/**
* Evaluate whether the underlying compute capabilities are at least the provided 2-tuple (major, minor)
* @param sm Architecture version (major, minor)
* @return True if the underlying compute capabilities are greater than or equal to the provided version
*/
[[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32_t> sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; }
/**
* Check if the capabilities match at least Volta architecture (sm_70)
* @return true if at least Volta (>= sm_70), false otherwise
*/
[[nodiscard]] constexpr bool is_at_least_volta() const { return is_at_least(VOLTA); }
/**
* Check if the capabilities match at least Turing architecture (sm_75)
* @return true if at least Turing (>= sm_75), false otherwise
*/
[[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); }
/**
* Check if the capabilities match at least Ampere architecture (sm_80)
* @return true if at least Ampere (>= sm_80), false otherwise
*/
[[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); }
/**
* Check if the capabilities match at least Ada Lovelace architecture (sm_89)
* @return true if at least Ada Lovelace (>= sm_89), false otherwise
*/
[[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); }
/**
* Check if the capabilities match at least Hopper architecture (sm_90)
* @return true if at least Hopper (>= sm_90), false otherwise
*/
[[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
};
}
#endif
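A quick sanity-check sketch of the ordering semantics above: `is_at_least()` compares `(major, minor)` lexicographically via `std::tie`. This assumes the header compiles as-is with the NVML headers installed; no NVML initialization is needed because only the `(major, minor)` constructor is used, and the example devices are hypothetical.
```cpp
// Sketch: lexicographic (major, minor) comparisons exposed by compute_capabilities_t.
#include <cassert>
#include "hardware.hpp"

int main() {
    using huggingface::tgi::hardware::cuda::compute_capabilities_t;

    const compute_capabilities_t sm86{8, 6};    // e.g. an Ampere consumer GPU
    const compute_capabilities_t sm90{9, 0};    // e.g. a Hopper GPU

    assert(sm86.is_at_least_ampere());          // (8,6) >= (8,0)
    assert(!sm86.is_at_least_ada_lovelace());   // (8,6) <  (8,9)
    assert(sm90.is_at_least_hopper());          // (9,0) >= (9,0)
    assert(sm90.is_at_least_ada_lovelace());    // (9,0) >= (8,9)
    return 0;
}
```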

View File

@ -1,144 +0,0 @@
//
// Created by Morgan Funtowicz on 6/30/24.
//
#ifndef TGI_TRTLLM_BACKEND_H
#define TGI_TRTLLM_BACKEND_H
#include <array>
#include <cmath>
#include <filesystem>
#include <span>
#include <vector>
#include <nlohmann/json.hpp>
#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/executor/executor.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
using json = nlohmann::json;
namespace tle = tensorrt_llm::executor;
#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
namespace huggingface::tgi::backends {
using RequestId = tle::IdType;
using TokenId = tle::TokenIdType;
const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
"Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
"Submitting inference [{}] to the executor ({:d} already in-flight)");
constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
"Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
/**
* Initialize all the components required by TRTLLM.
* It is required to call this function before attempting to load any engine
*/
void InitializeBackend();
/**
* Initialize logging mechanism
*/
void InitializeLogging();
/**
*
* @param config TensorRT-LLM configuration object
* @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
* @return
*/
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
/**
*
* @param worldSize
* @param workerPath
* @return
*/
tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
/**
* Get the sampling configuration from the parameters provided by TGI
* @param topK
* @param topP
* @param temperature
* @param repetition_penalty
* @param frequency_penalty
* @param seed
* @return
*/
tle::SamplingConfig GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
) noexcept;
/**
* Attempt to retrieve the
* @param generationConfigPath
* @return
*/
std::optional<std::list<std::vector<TokenId>>>
GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
/**
*
*/
class TensorRtLlmBackend {
private:
const json config;
tle::Executor executor;
/** Frequently accessed variables cached here **/
uint32_t maxNumTokens;
std::list<std::vector<TokenId>> stopWords;
public:
explicit TensorRtLlmBackend(
const std::filesystem::path &engineFolder,
const std::filesystem::path &executorWorker
);
/**
* Query the executor for the number of token available for pulling
* @return
*/
[[nodiscard]] size_t NumResponsesReady() const;
/**
* Submit a new generation task to the executor
* @param tokens
* @param topK
* @param topP
* @param temperature
* @param repetitionPenalty
* @param frequencyPenalty
* @param seed
* @return Request id related to this generation for reference
*/
[[nodiscard]] RequestId Submit(
const std::vector<TokenId> &tokens,
uint32_t maxNewTokens,
int32_t topK,
float_t topP,
float_t temperature,
float_t repetitionPenalty,
float_t frequencyPenalty,
uint64_t seed
);
[[nodiscard]] std::vector<tle::Response> PullNewTokens();
};
}
#endif //TGI_TRTLLM_BACKEND_H

View File

@ -1,75 +0,0 @@
//
// Created by mfuntowicz on 7/11/24.
//
#ifndef TGI_TRTLLM_BACKEND_FFI_H
#define TGI_TRTLLM_BACKEND_FFI_H
#include <cmath>
#include <cstddef>
#include <memory>
#include "backend.h"
namespace huggingface::tgi::backends {
class TensorRtLlmBackendImpl;
}
// Template to support returning error from TllmException back to Rust in a Result<>
#include <tensorrt_llm/common/tllmException.h>
namespace rust::behavior {
template<typename Try, typename Fail>
static void trycatch(Try &&func, Fail &&fail) noexcept try {
func();
} catch (tensorrt_llm::common::TllmException &e) {
fail(e.what());
}
}
#include "backends/trtllm/src/lib.rs.h"
namespace huggingface::tgi::backends {
class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
public:
/***
*
* @param engineFolder
* @param executorWorker
*/
TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
/***
*
* @param tokens
* @param maxNewTokens
* @param topK
* @param topP
* @param temperature
* @param repetition_penalty
* @param frequency_penalty
* @param seed
* @return
*/
[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
uint64_t
Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
int32_t topK, float_t topP, float_t temperature,
float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
/***
*
* @return
*/
std::unique_ptr<std::vector<GenerationStep>> PullTokens();
};
/***
*
* @param engineFolder
* @return
*/
std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
}
#endif //TGI_TRTLLM_BACKEND_FFI_H

View File

@ -1,59 +0,0 @@
//
// Created by mfuntowicz on 7/23/24.
//
#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
#define TGI_TRTLLM_BACKEND_HARDWARE_H
#include <cstdint>
#include <limits>
#include <fmt/base.h>
#include <spdlog/spdlog.h>
#include <nvml.h>
namespace huggingface::hardware::cuda {
#define AMPERE_SM_MAJOR 8
#define HOPPER_SM_MAJOR 9
/**
* Store information about the version of the CUDA Compute Capabilities detected on the device
*/
struct CudaComputeCapabilities {
int32_t major;
int32_t minor;
[[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
[[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
};
CudaComputeCapabilities GetCudaComputeCapabilities() {
// Get the compute capabilities of the current hardware
nvmlDevice_t device;
CudaComputeCapabilities capabilities{0, 0};
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
}
}
return capabilities;
}
/**
* Return the number of GPU detected. If no GPU is detected, return size_t::max()
* @return
*/
std::optional<size_t> GetNumDevices() {
uint32_t numGpus = 0;
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
return std::optional(numGpus);
} else {
return std::nullopt;
}
}
}
#endif //TGI_TRTLLM_BACKEND_HARDWARE_H

View File

@ -1,203 +0,0 @@
#include <cstdlib>
#include <fstream>
#include <fmt/ranges.h>
#include <spdlog/spdlog.h>
#include <nvml.h>
#include "backend.h"
#include "hardware.h"
void huggingface::tgi::backends::InitializeLogging() {
#ifdef NDEBUG
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
return std::tolower(c);
});
if (log_level == "debug")
spdlog::set_level(spdlog::level::debug);
else
spdlog::set_level(spdlog::level::info);
}
#else
spdlog::set_level(spdlog::level::debug);
#endif
}
void huggingface::tgi::backends::InitializeBackend() {
SPDLOG_INFO("Initializing Backend...");
nvmlInit_v2();
initTrtLlmPlugins();
InitializeLogging();
SPDLOG_INFO("Backend Executor Version: {}", tle::version());
const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
if (numGpus.has_value()) {
SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
} else {
SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
}
}
[[nodiscard]]
tle::ParallelConfig
huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
auto mode = tle::CommunicationMode::kLEADER;
std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
if (worldSize > 1) {
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
mode = tle::CommunicationMode::kORCHESTRATOR;
orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
} else {
SPDLOG_INFO("Detected single engine deployment, using leader mode");
}
return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
}
[[nodiscard]]
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
// Retrieve the compute capabilities to enable some options at runtime
const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
// Define some configuration variables
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
return execConfig;
}
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
const uint32_t topK,
const float_t topP,
const float_t temperature,
const float_t repetition_penalty,
const float_t frequency_penalty,
const uint64_t seed) noexcept {
return tle::SamplingConfig(
1, // TGI only use a single beam
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
temperature,
temperature,
std::nullopt,
repetition_penalty,
std::nullopt,
frequency_penalty
);
}
std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
huggingface::tgi::backends::GetStopWordsFromConfig(
const std::filesystem::path &generationConfigPath) noexcept {
if (exists(generationConfigPath)) {
const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords(eosTokenIds.size());
const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
return {tokenIdObj.template get<tle::TokenIdType>()};
};
std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
return stopWords;
} else {
SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
}
} else {
SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
}
return std::nullopt;
}
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
const std::filesystem::path &enginesFolder,
const std::filesystem::path &executorWorker
) :
config(json::parse(std::ifstream(enginesFolder / "config.json"))),
executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
GetExecutorConfig(config, executorWorker.string())) {
SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get<std::string_view>());
// Ensure we have enough GPUs on the system
const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
if (numGpus < worldSize) {
SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
// todo : raise exception to catch on rust side
}
// Cache variables
maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
// Attempt to discover stopWords from the generation_config.json
const auto generationConfigPath = enginesFolder / "generation_config.json";
stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
}
[[nodiscard("Returned number of requests needs to be consumed")]]
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
#ifdef NDEBUG
return executor.getNumResponsesReady();
#else
const auto numResponses = executor.getNumResponsesReady();
if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
return numResponses;
#endif
}
[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
const std::vector<tle::TokenIdType> &tokens,
const uint32_t maxNewTokens,
const int32_t topK,
const float_t topP,
const float_t temperature,
const float_t repetitionPenalty,
const float_t frequencyPenalty,
const uint64_t seed
) {
const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
#ifndef NDEBUG
{
const auto &iterations = executor.getLatestIterationStats();
const auto &lastIteration = iterations.front();
SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
}
#endif
const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
// Build the request
auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
request.setStopWords(stopWords);
// Submit to the executor for batching
return executor.enqueueRequest(request);
}
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
return executor.awaitResponses();
}

View File

@ -2,7 +2,7 @@
set -ex
TRT_VER_BASE="10.4.0"
TRT_VER_BASE="10.6.0"
TRT_VER_FULL="${TRT_VER_BASE}.26"
CUDA_VER="12.6"
CUDNN_VER="9.5.0.50-1"

View File

@ -1,89 +0,0 @@
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once
#include <algorithm>
#include <exception>
#include <filesystem>
#include <functional>
#include <limits>
#include <iterator>
#include <ranges>
#include <vector>
#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
const std::string_view &engineFolder,
const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens,
uint32_t maxNewTokens,
int32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
return TensorRtLlmBackend::Submit(
std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
const auto responses = TensorRtLlmBackend::PullNewTokens();
auto steps = std::make_unique<std::vector<GenerationStep>>();
steps->reserve(responses.size());
#ifndef NDEBUG
SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
#endif
// Transform tle::Response to GenerationStep
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
const auto reqId = r.getRequestId();
if (!r.hasError()) {
const auto result = r.getResult();
return GenerationStep{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
result.logProbs.value()[0][0],
result.isFinal,
false,
std::string()
};
} else {
return GenerationStep{
reqId,
0,
0.0,
true,
true,
std::move(r.getErrorMsg())
};
}
});
return steps;
}
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
SPDLOG_INFO("Creating TensorRT-LLM Backend");
// Unconditionally call this to initialize and discover TRTLLM plugins
InitializeBackend();
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}

View File

@ -4,10 +4,11 @@ pub mod errors;
mod looper;
mod utils;
#[cxx::bridge(namespace = "huggingface::tgi::backends")]
#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
mod ffi {
/// Struct used as shared type between rust and C++ to represent the result
/// of a single decoding iteration
#[cxx_name = "generation_step_t"]
#[derive(Debug, Clone)]
pub struct GenerationStep {
request_id: u64,
@ -19,9 +20,10 @@ mod ffi {
}
unsafe extern "C++" {
include!("backends/trtllm/src/ffi.cpp");
include!("backends/trtllm/csrc/ffi.hpp");
/// Represent an instance of the underlying TensorRT-LLM backend
#[cxx_name = "tensorrt_llm_backend_t"]
type TensorRtLlmBackendImpl;
/// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@ -38,21 +40,18 @@ mod ffi {
/// ```
///
/// ```
#[rust_name = "create_tensorrt_llm_backend"]
fn CreateTensorRtLlmBackend(
fn create_backend_from_engine_folder(
engine_folder: &str,
executor_worker: &str,
) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
#[rust_name = "num_responses_ready"]
fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
#[rust_name = "submit"]
fn Submit(
fn submit(
self: Pin<&mut TensorRtLlmBackendImpl>,
tokens: &[u32],
max_new_tokens: u32,
top_k: i32,
top_k: u32,
top_p: f32,
temperature: f32,
repetition_penalty: f32,
@ -60,9 +59,10 @@ mod ffi {
seed: u64,
) -> Result<u64>;
#[rust_name = "pull_tokens"]
fn PullTokens(
fn pull_tokens(
self: Pin<&mut TensorRtLlmBackendImpl>,
) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
}
}

View File

@ -1,14 +1,13 @@
use std::hint;
use std::ops::Deref;
use std::path::Path;
use async_trait::async_trait;
use cxx::UniquePtr;
use hashbrown::HashMap;
use std::hint;
use std::ops::Deref;
use std::path::Path;
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
use tokio::sync::TryAcquireError;
use tokio::task::{spawn_blocking, JoinHandle};
use tokio::task::spawn_blocking;
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, warn};
@ -22,7 +21,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
use text_generation_router::{FinishReason, Token};
use crate::errors::TensorRtLlmBackendError;
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
use crate::utils::first_line;
type InferResult<T> = Result<T, InferError>;
@ -30,9 +29,10 @@ type InferResult<T> = Result<T, InferError>;
/// Wrap the requests along with the channel used to stream back to the client the decoded tokens
struct GenerationContext {
request: ValidGenerateRequest,
streamer: UnboundedSender<InferResult<InferStreamResponse>>,
tokens: Vec<u32>,
start: Option<Instant>,
queued: Instant,
streamer: UnboundedSender<InferResult<InferStreamResponse>>,
}
#[derive(Debug, Copy, Clone)]
@ -58,31 +58,22 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
}
}
/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens
struct DecodedTokenContext {
token: DecodedToken,
start: Option<Instant>,
queued: Instant,
channel: UnboundedSender<InferResult<InferStreamResponse>>,
}
fn executor_status_looper(
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
max_inflight_requests: usize,
mut waiting_requests: UnboundedReceiver<GenerationContext>,
post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
tokenizer: Tokenizer,
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
mut backlog: UnboundedReceiver<GenerationContext>,
) {
// Track the tuple (request_id, stream) for each request
let mut in_flights =
HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
// TODO: Does it need a spin-loop?
'scheduler: loop {
// Is there any request pending to be scheduled?
let awaiting_requests = waiting_requests.len();
let awaiting_requests = backlog.len();
for _ in 0..awaiting_requests {
// Retrieve all the requests
if let Some(mut ctx) = waiting_requests.blocking_recv() {
if let Some(ctx) = backlog.blocking_recv() {
// Submit all the request to the executor and move the context to the in-flight tracker
let request = &ctx.request;
let generation_params = &request.parameters;
@ -93,7 +84,7 @@ fn executor_status_looper(
match backend.pin_mut().submit(
&input_ids.unwrap(), // This is checked beforehand in validate()
stopping_params.max_new_tokens,
generation_params.top_k as i32,
generation_params.top_k,
generation_params.top_p,
generation_params.temperature,
generation_params.repetition_penalty,
@ -103,7 +94,6 @@ fn executor_status_looper(
Ok(request_id) => {
// Insert the context linked to the generated request id in the tracker
debug!("[in-flight] Added {}", request_id);
ctx.start = Some(Instant::now());
in_flights.insert(request_id, ctx);
}
Err(e) => {
@ -117,29 +107,43 @@ fn executor_status_looper(
}
}
};
} else {
break 'scheduler;
}
}
if backend.num_responses_ready() > 0 {
match backend.pin_mut().pull_tokens() {
if backend.num_tokens_ready() > 0 {
let mut backend = backend.pin_mut();
match backend.as_mut().pull_tokens() {
Ok(responses) => {
// Iterate through all the decoded token
for step in responses.deref() {
if let Some(ctx) = in_flights.get(&step.request_id) {
// Remove from tracked requests
let parcel =
DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
token: dt,
start: ctx.start,
queued: ctx.queued,
channel: ctx.streamer.clone(),
});
if let Some(ctx) = in_flights.get_mut(&step.request_id) {
// Update the starting timestamp if not set
// This value might not be the actual real starting time of the request
// on the executor side - Need to expose more info from the executor to
// retrieve this value
// TODO : Expose actual real starting time for a request on FFI layer
if ctx.start.is_none() {
ctx.start = Some(Instant::now());
}
// Submit the work to the post_processor
let posted = post_processor_sender.send((step.request_id, parcel));
// Try to map the generation step to a DecodedToken
let response = match DecodedToken::try_from(step) {
Ok(decoded_token) => {
post_process_decoded_token(&tokenizer, ctx, decoded_token)
}
Err(err) => Err(err),
};
if posted.is_err() || step.is_final {
debug!("Removing {}", step.request_id);
// Attempt to send back the response to the client
if let Err(_) = ctx.streamer.send(response) {
// Client has dropped, remove from tracked requests
debug!(
"Client dropped - removing request {} from tracked requests",
step.request_id
);
backend.as_mut().cancel(step.request_id);
let _ = in_flights.remove(&step.request_id);
}
} else {
@ -159,54 +163,36 @@ fn executor_status_looper(
}
}
fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
tokenizer: Tokenizer,
max_inflight_requests: usize,
mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
) {
let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
'post_processor: loop {
if decoded_tokens.is_closed() {
warn!("Post processor IPC is closed, loop will exit now.");
break 'post_processor;
}
if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
match decoded {
Ok(ctx) => {
states
.entry(request_id)
.and_modify(|s| s.push(*&ctx.token.id))
.or_insert_with(|| {
let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
state.push(*&ctx.token.id);
state
});
let out = match tokenizer.decode(&[ctx.token.id], false) {
fn post_process_decoded_token(
tokenizer: &Tokenizer,
ctx: &mut GenerationContext,
decoded_token: DecodedToken,
) -> InferResult<InferStreamResponse> {
match tokenizer.decode(&[decoded_token.id], false) {
Ok(text) => {
let is_special =
tokenizer.get_added_vocabulary().is_special_token(&text);
let is_special = tokenizer.get_added_vocabulary().is_special_token(&text);
let token = Token {
id: ctx.token.id,
id: decoded_token.id,
text,
logprob: ctx.token.log_prob,
logprob: decoded_token.log_prob,
special: is_special,
};
let out = if !ctx.token.is_final {
// Append the token to the tracked generated tokens
ctx.tokens.push(token.id);
// Map the correct response depending on the step is final or not
let out = if !decoded_token.is_final {
InferStreamResponse::Intermediate {
token,
top_tokens: vec![],
}
} else {
let tokens = states.remove(&request_id).unwrap();
let text = tokenizer.decode(&tokens, true);
let text = tokenizer.decode(&ctx.tokens, true);
let generated_text = GeneratedText {
text: text.unwrap(),
generated_tokens: tokens.len() as u32,
finish_reason: FinishReason::EndOfSequenceToken,
generated_tokens: ctx.tokens.len() as u32,
finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason
seed: None,
};
@ -222,17 +208,6 @@ fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
Ok(out)
}
Err(err) => Err(GenerationError(err.to_string())),
};
if let Err(_) = ctx.channel.send(out) {
warn!("Failed to send decoded token back to the user")
}
}
Err(_err) => {
todo!("what do we do?")
}
}
}
}
}
@ -277,11 +252,7 @@ fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
unsafe impl Send for TensorRtLlmBackendImpl {}
pub struct TensorRtLlmBackendV2 {
executor_looper: JoinHandle<()>,
post_processor_looper: JoinHandle<()>,
executor: UnboundedSender<GenerationContext>,
}
pub struct TensorRtLlmBackendV2(UnboundedSender<GenerationContext>);
impl TensorRtLlmBackendV2 {
pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
@ -295,32 +266,17 @@ impl TensorRtLlmBackendV2 {
// Allocate the IPC layer to communicate with the backend
let (executor_sender, executor_receiver) = unbounded_channel();
let (post_processor_sender, post_processor_receiver) = unbounded_channel();
// Create the FFI backend
let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
.map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
// Executor looper is responsible for scheduling and pulling requests state at regular interval
let executor_looper = spawn_blocking(move || {
executor_status_looper(
backend,
max_inflight_requests,
executor_receiver,
post_processor_sender,
)
spawn_blocking(move || {
executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
});
// Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user
let post_processor_looper = spawn_blocking(move || {
post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
});
Ok(TensorRtLlmBackendV2 {
executor_looper,
post_processor_looper,
executor: executor_sender,
})
Ok(TensorRtLlmBackendV2(executor_sender))
}
fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
@ -354,20 +310,21 @@ impl TensorRtLlmBackendV2 {
impl Backend for TensorRtLlmBackendV2 {
fn schedule(
&self,
inner: ValidGenerateRequest,
request: ValidGenerateRequest,
) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
Self::validate(&inner)?;
Self::validate(&request)?;
// Open-up the stream to send tokens
let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
// Send the context to the executor for scheduling
let queued = Instant::now();
match self.executor.send(GenerationContext {
request: inner,
match self.0.send(GenerationContext {
request,
streamer,
tokens: Vec::with_capacity(256),
start: None,
queued,
streamer,
}) {
Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
Err(_) => Err(GenerationError(
@ -377,6 +334,6 @@ impl Backend for TensorRtLlmBackendV2 {
}
async fn health(&self, _: bool) -> bool {
!self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
true
}
}

View File

@ -3,14 +3,15 @@ use std::path::{Path, PathBuf};
use clap::Parser;
use hf_hub::api::tokio::{Api, ApiBuilder};
use hf_hub::{Cache, Repo, RepoType};
use tokenizers::Tokenizer;
use tracing::info;
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackendV2;
use text_generation_router::server::get_base_tokenizer;
use text_generation_router::server::{
get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer,
};
use text_generation_router::usage_stats::UsageStatsLevel;
use text_generation_router::{server, HubTokenizerConfig};
use text_generation_router::{server, HubTokenizerConfig, Tokenizer};
/// App Configuration
#[derive(Parser, Debug)]
@ -61,7 +62,7 @@ struct Args {
#[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
executor_worker: PathBuf,
#[clap(default_value = "on", long, env)]
usage_stats: usage_stats::UsageStatsLevel,
usage_stats: UsageStatsLevel,
#[clap(default_value = "2000000", long, env)]
payload_limit: usize,
}
@ -126,18 +127,18 @@ async fn get_tokenizer(
// Load tokenizer and model info
let (
tokenizer_filename,
_config_filename,
tokenizer_config_filename,
config_filename,
_tokenizer_config_filename,
_preprocessor_config_filename,
_processor_config_filename,
_model_info,
) = match api {
Type::None => (
Some(local_path.join("tokenizer.json")),
Some(local_path.join("config.json")),
Some(local_path.join("tokenizer_config.json")),
Some(local_path.join("preprocessor_config.json")),
Some(local_path.join("processor_config.json")),
None,
),
Type::Api(api) => {
let api_repo = api.repo(Repo::with_revision(
@ -146,21 +147,23 @@ async fn get_tokenizer(
revision.unwrap_or_else(|| "main").to_string(),
));
let tokenizer_filename = match api_repo.get("tokenizer.json").await {
Ok(tokenizer_filename) => Some(tokenizer_filename),
Err(_) => get_base_tokenizer(&api, &api_repo).await,
};
let config_filename = api_repo.get("config.json").await.ok();
let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
let processor_config_filename = api_repo.get("processor_config.json").await.ok();
let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
Some(model_info)
} else {
tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
None
};
(
tokenizer_filename,
config_filename,
tokenizer_config_filename,
preprocessor_config_filename,
processor_config_filename,
model_info,
)
}
Type::Cache(cache) => {
@ -170,24 +173,55 @@ async fn get_tokenizer(
revision.clone().unwrap_or_else(|| "main").to_string(),
));
(
repo.get("tokenizer.json"),
repo.get("config.json"),
repo.get("tokenizer_config.json"),
repo.get("preprocessor_config.json"),
repo.get("processor_config.json"),
None,
)
}
};
// Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
{
HubTokenizerConfig::from_file(filename)
// let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
// {
// HubTokenizerConfig::from_file(filename)
// } else {
// tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
// };
// let tokenizer_config = tokenizer_config.unwrap_or_else(|| {
// tracing::warn!("Could not find tokenizer config locally and no API specified");
// HubTokenizerConfig::default()
// });
let tokenizer: Tokenizer = {
use pyo3::prelude::*;
pyo3::Python::with_gil(|py| -> PyResult<()> {
py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?;
Ok(())
})
.inspect_err(|err| {
tracing::error!("Failed to import python tokenizer {err}");
})
.or_else(|err| {
let out = legacy_tokenizer_handle(config_filename.as_ref());
out.ok_or(err)
})
.expect("We cannot load a tokenizer");
let filename = "out/tokenizer.json";
if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
Tokenizer::Rust(tok)
} else {
tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
Tokenizer::Python {
tokenizer_name: tokenizer_name.to_string(),
revision: revision.map(|revision| revision.to_string()),
trust_remote_code: false,
}
}
};
tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
Some(tokenizer)
}
#[tokio::main]
@ -258,14 +292,18 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
}
// Create the backend
let tokenizer = get_tokenizer(
match get_tokenizer(
&tokenizer_name,
tokenizer_config_path.as_deref(),
revision.as_deref(),
)
.await
.expect("Failed to retrieve tokenizer implementation");
.expect("Failed to retrieve tokenizer implementation")
{
Tokenizer::Python { .. } => Err(TensorRtLlmBackendError::Tokenizer(
"Failed to retrieve Rust based tokenizer".to_string(),
)),
Tokenizer::Rust(tokenizer) => {
info!("Successfully retrieved tokenizer {}", &tokenizer_name);
let backend = TensorRtLlmBackendV2::new(
tokenizer,
@ -304,4 +342,6 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
)
.await?;
Ok(())
}
}
}

View File

@ -1,14 +0,0 @@
//
// Created by mfuntowicz on 7/2/24.
//
#include <catch2/catch_all.hpp>
#include <spdlog/spdlog.h>
#include "../include/backend.h"
TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
spdlog::info("Loading config from: {}", absolute(engines).string());
huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
}

View File

@ -0,0 +1,152 @@
//
// Created by mfuntowicz on 12/3/24.
//
#include <catch2/catch_all.hpp>
#include <nlohmann/json.hpp>
#include <tensorrt_llm/executor/executor.h>
#include "backend.hpp"
using namespace huggingface::tgi::backends::trtllm;
TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
{
const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}};
const auto generation_config = generation_config_t(config_j);
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6));
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(0.95, 1e-6));
// Stop words
REQUIRE_FALSE(generation_config.stop_words.empty());
REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
{
// Currently we do not support multi-tokens stop words
REQUIRE(lhs.size() == 1);
REQUIRE(rhs.size() == 1);
REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
}
}
TEST_CASE("parse generation_config.json default", "[generation_config_t]")
{
const json config_j = {{"eos_token_id", {1,2,3}}};
const auto generation_config = generation_config_t(config_j);
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE_FALSE(generation_config.stop_words.empty());
REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());
for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
{
// Currently we do not support multi-tokens stop words
REQUIRE(lhs.size() == 1);
REQUIRE(rhs.size() == 1);
REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
}
}
TEST_CASE("parse generation_config.json empty", "[generation_config_t]")
{
const json config_j = {{"eos_token_id", {}}};
const auto generation_config = generation_config_t(config_j);
REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE(generation_config.stop_words.empty());
const json config_j2 = {};
const auto generation_config2 = generation_config_t(config_j2);
REQUIRE_THAT(generation_config2.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE_THAT(generation_config2.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));
REQUIRE(generation_config2.stop_words.empty());
}
TEST_CASE("parallel_config single", "[backend_workspace_t]")
{
// Generate temporary folder
const auto tmp_p = std::filesystem::temp_directory_path();
const auto config_p = tmp_p / "config.json";
const auto generation_config_p = tmp_p / "generation_config.json";
// Generate content
std::ofstream o_config(config_p);
o_config << R"({"pretrained_config": {"mapping": {"world_size": 2}}})"_json;
o_config.close();
std::ofstream o_generation_config(generation_config_p);
o_generation_config << R"({"eos_token_id": []})"_json;
o_generation_config.close();
const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
const auto parallel = workspace.parallel_config();
REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kORCHESTRATOR);
REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);
std::filesystem::remove(config_p);
std::filesystem::remove(generation_config_p);
}
TEST_CASE("parallel_config multi", "[backend_workspace_t]")
{
// Generate temporary folder
const auto tmp_p = std::filesystem::temp_directory_path();
const auto config_p = tmp_p / "config.json";
const auto generation_config_p = tmp_p / "generation_config.json";
// Generate content
std::ofstream o_config(config_p);
o_config << R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json;
o_config.close();
std::ofstream o_generation_config(generation_config_p);
o_generation_config << R"({"eos_token_id": []})"_json;
o_generation_config.close();
const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
const auto parallel = workspace.parallel_config();
REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kLEADER);
REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);
std::filesystem::remove(config_p);
std::filesystem::remove(generation_config_p);
}
TEST_CASE("executor_config", "[backend_workspace_t]")
{
}
TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]")
{
const sampling_params_t params = {40, 0.95, 0.9, 1.0, 0.6, 2014};
const auto config = static_cast<tle::SamplingConfig>(params);
REQUIRE(config.getTopK().has_value());
REQUIRE(config.getTopK().value() == params.top_k);
REQUIRE(config.getSeed().has_value());
REQUIRE(config.getSeed().value() == params.seed);
REQUIRE(config.getTopP().has_value());
REQUIRE_THAT(*config.getTopP(), Catch::Matchers::WithinAbs(params.top_p, 1e-6f));
REQUIRE(config.getRepetitionPenalty().has_value());
REQUIRE_THAT(*config.getRepetitionPenalty(), Catch::Matchers::WithinAbs(params.repetition_penalty, 1e-6f));
REQUIRE(config.getFrequencyPenalty().has_value());
REQUIRE_THAT(*config.getFrequencyPenalty(), Catch::Matchers::WithinAbs(params.frequency_penalty, 1e-6f));
REQUIRE(config.getTemperature().has_value());
REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
}

View File

@ -0,0 +1,82 @@
//
// Created by mfuntowicz on 11/16/24.
//
#include <catch2/catch_all.hpp>
#include "../csrc/hardware.hpp"
using namespace huggingface::tgi::hardware::cuda;
TEST_CASE("is_at_least_<arch>") {
const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
REQUIRE(VOLTA_CAPABILITIES.is_at_least_volta());
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_turing());
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ampere());
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ada_lovelace());
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_hopper());
const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
REQUIRE(TURING_CAPABILITIES.is_at_least_volta());
REQUIRE(TURING_CAPABILITIES.is_at_least_turing());
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ampere());
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ada_lovelace());
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_hopper());
const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
REQUIRE(AMPERE_CAPABILITIES.is_at_least_volta());
REQUIRE(AMPERE_CAPABILITIES.is_at_least_turing());
REQUIRE(AMPERE_CAPABILITIES.is_at_least_ampere());
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_ada_lovelace());
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_hopper());
const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_volta());
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_turing());
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ampere());
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ada_lovelace());
REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least_hopper());
const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0);
REQUIRE(HOPPER_CAPABILITIES.is_at_least_volta());
REQUIRE(HOPPER_CAPABILITIES.is_at_least_turing());
REQUIRE(HOPPER_CAPABILITIES.is_at_least_ampere());
REQUIRE(HOPPER_CAPABILITIES.is_at_least_ada_lovelace());
REQUIRE(HOPPER_CAPABILITIES.is_at_least_hopper());
}
TEST_CASE("is_at_least") {
const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
REQUIRE(VOLTA_CAPABILITIES.is_at_least(VOLTA));
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(TURING));
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(AMPERE));
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(ADA_LOVELACE));
REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(HOPPER));
const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
REQUIRE(TURING_CAPABILITIES.is_at_least(VOLTA));
REQUIRE(TURING_CAPABILITIES.is_at_least(TURING));
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(AMPERE));
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(ADA_LOVELACE));
REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(HOPPER));
const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
REQUIRE(AMPERE_CAPABILITIES.is_at_least(VOLTA));
REQUIRE(AMPERE_CAPABILITIES.is_at_least(TURING));
REQUIRE(AMPERE_CAPABILITIES.is_at_least(AMPERE));
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(ADA_LOVELACE));
REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(HOPPER));
const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(VOLTA));
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(TURING));
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(AMPERE));
REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(ADA_LOVELACE));
REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least(HOPPER));
const static auto HOPPER_CAPABILITIES = compute_capabilities_t (9, 0);
REQUIRE(HOPPER_CAPABILITIES.is_at_least(VOLTA));
REQUIRE(HOPPER_CAPABILITIES.is_at_least(TURING));
REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
}

View File

@ -104,6 +104,10 @@ impl Backend for BackendV2 {
}
.is_ok()
}
fn start_health(&self) -> bool {
true
}
}
/// Batching logic

View File

@ -436,6 +436,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,

View File

@ -111,6 +111,10 @@ impl Backend for BackendV3 {
}
.is_ok()
}
fn start_health(&self) -> bool {
true
}
}
/// Batching logic

View File

@ -217,8 +217,8 @@ impl Health for ShardedClient {
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
add_special_tokens: true,
truncate: 1,
add_special_tokens: false,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
@ -241,7 +241,7 @@ impl Health for ShardedClient {
top_n_tokens: 0,
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
slots: vec![0],
cache_len: 0,
adapter_id: None,
chunk_len: None,

View File

@ -573,6 +573,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,

View File

@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.4.2-dev0"
"version": "3.0.2-dev0"
},
"paths": {
"/": {
@ -1013,6 +1013,7 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
@ -1329,7 +1330,8 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "32",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
},
@ -1591,7 +1593,7 @@
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"default": "1024",
"example": "20",
"nullable": true,
"minimum": 0

View File

@ -17,6 +17,8 @@
title: Using TGI with Intel GPUs
- local: installation
title: Installation from source
- local: multi_backend_support
title: Multi-backend support
- local: architecture
title: Internal Architecture
@ -45,6 +47,10 @@
- local: basic_tutorials/train_medusa
title: Train Medusa
title: Tutorials
- sections:
- local: backends/trtllm
title: TensorRT-LLM
title: Backends
- sections:
- local: reference/launcher
title: All TGI CLI options
@ -54,6 +60,8 @@
title: API Reference
title: Reference
- sections:
- local: conceptual/chunking
title: V3 update, caching and chunking
- local: conceptual/streaming
title: Streaming
- local: conceptual/quantization

View File

@ -9,8 +9,10 @@ A high-level architecture diagram can be seen here:
This diagram clearly shows that there are these separate components:
- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
- **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
- **The model server**, responsible for receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
Note that for other backends (e.g. TRTLLM) the model server and launcher are specific to the backend.
The router and the model server can run on two different machines; they do not need to be deployed together.

View File

@ -0,0 +1,81 @@
# TensorRT-LLM backend
The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
that uses NVIDIA's TensorRT library for inference acceleration.
It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.
To use the TRTLLM backend you need to compile `engines` for the models you want to use.
Each `engine` must be compiled on the same GPU architecture that you will use for inference.
## Supported models
Check the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) to see which models are
supported.
## Compiling engines
You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you
want to use.
```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
# Install the huggingface_hub CLI
python -m pip install "huggingface_hub[hf_transfer]"
# Login to the Hugging Face Hub
huggingface-cli login
# Create a directory to store the model
mkdir -p /tmp/models/$MODEL_NAME
# Create a directory to store the compiled engine
mkdir -p /tmp/engines/$MODEL_NAME
# Download the model
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME
# Compile the engine using Optimum-NVIDIA
docker run \
--rm \
-it \
--gpus=1 \
-v /tmp/models/$MODEL_NAME:/model \
-v /tmp/engines/$MODEL_NAME:/engine \
huggingface/optimum-nvidia \
optimum-cli export trtllm \
--tp=1 \
--pp=1 \
--max-batch-size=128 \
--max-input-length 4096 \
--max-output-length 8192 \
--max-beams-width=1 \
--destination /engine \
$MODEL_NAME
```
Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
## Using the TRTLLM backend
Run TGI-TRTLLM Docker image with the compiled engine:
```bash
docker run \
--gpus 1 \
-it \
--rm \
-p 3000:3000 \
-e MODEL=$MODEL_NAME \
-e PORT=3000 \
-e HF_TOKEN='hf_XXX' \
-v /tmp/engines/$MODEL_NAME:/data \
ghcr.io/huggingface/text-generation-inference:latest-trtllm \
--executor-worker executorWorker \
--model-id /data/$MODEL_NAME
```
## Development
To develop the TRTLLM backend, you can use [dev containers](https://containers.dev/) with the configuration located in the
`.devcontainer` directory.

View File

@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```

View File

@ -0,0 +1,125 @@
# TGI v3 overview
## Summary
Performance leap: TGI processes 3x more tokens, 13x faster than vLLM on long prompts. Zero config!
### 3x more tokens.
By reducing our memory footprint, we're able to ingest many more tokens, and more dynamically, than before. A single L4 (24GB) can handle 30k tokens on Llama 3.1-8B, while vLLM barely gets 10k. A lot of work went into reducing the footprint of the runtime, and its effects are best seen on smaller, constrained environments.
### 13x faster
On long prompts (200k+ tokens), conversation replies take 27.5s in vLLM, while they take only 2s in TGI. How so? We keep the initial conversation around, so when a new reply comes in, we can answer almost instantly. The overhead of the lookup is ~5us. Thanks to @Daniël de Kok for the beast of a data structure.
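To make the idea concrete, here is a minimal, illustrative Python sketch of longest-prefix reuse. It is not TGI's implementation (the real data structure is what keeps the lookup in the microsecond range); the `ToyPrefixCache` class, the token ids, and the `kv-block-0` handle are made up for the example.

```python
# Toy illustration of prefix caching: a returning conversation only needs to
# prefill the new suffix, because the longest previously-seen prefix is reused.
# The linear scan below is NOT how TGI does it; it only shows the principle.
from typing import Dict, List, Optional, Tuple


class ToyPrefixCache:
    def __init__(self) -> None:
        # Maps a tuple of token ids to an opaque handle for its cached KV state.
        self._cache: Dict[Tuple[int, ...], str] = {}

    def insert(self, tokens: List[int], kv_handle: str) -> None:
        self._cache[tuple(tokens)] = kv_handle

    def longest_prefix(self, tokens: List[int]) -> Tuple[int, Optional[str]]:
        # Walk backwards from the full sequence to find the longest cached prefix.
        for end in range(len(tokens), 0, -1):
            handle = self._cache.get(tuple(tokens[:end]))
            if handle is not None:
                return end, handle
        return 0, None


cache = ToyPrefixCache()
conversation = [1, 15, 42, 7, 99]          # hypothetical token ids of the first exchange
cache.insert(conversation, "kv-block-0")   # cached after the first reply

follow_up = conversation + [3, 8, 21]      # the user sends a new message
reused, handle = cache.longest_prefix(follow_up)
print(f"reusing {reused} cached tokens ({handle}); prefilling only {len(follow_up) - reused}")
```

With a 200k-token conversation already cached, skipping that prefill work is what turns a 27.5s reply into a ~2s one.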
### Zero config
That's it. Remove all the flags you are using and you're likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give the best performance. In production, we don't have any flags anymore in our deployments. We kept all existing flags around; they may come in handy in niche scenarios.
## Benchmarks
### Methodology
To ensure accurate and reliable results, we employed a robust benchmarking protocol that addresses common pitfalls in performance evaluation. Specifically:
1. **Consistent Code**: We used the same codebase to run against different engines, ensuring that any performance differences are attributable to the LLM itself, rather than variations in the testing framework.
2. **Request-Based Measurement**: Instead of measuring Requests Per Second (RPS) by sending as many requests as possible, we opted for a more consistent approach, sending a fixed number of requests and measuring the time it takes for the server to complete all of them. This method avoids boundary effects and provides a more accurate representation of performance.
3. **Realistic Combinations**: We selected realistic combinations of LLMs and hardware configurations, so we used 8xH100 for a 70B model, not an 8B one, which would be a waste of money.
4. **Realistic scenarios**: We benchmarked engines with prefix caching on, so we are reporting the results of the 2nd run, not the first one.
During the first run of a benchmark, every request is new, so prefix caching is not at work, masking the real-world benefits of using it.
Note: The boundary effect is when benchmarks are flaky because their results depend on fine details of the engine being benchmarked.
For instance, take a system ingesting a constant 10 RPS that receives one final request 0.1s before the end of the benchmark, and that single request takes a full 10s to process. A benchmark window of 30s would then measure 7.5 RPS instead of the expected 10, because that single query isn't being parallelized with others. A very slightly slower engine would instead receive that request 0.1s after the cutoff; it would get discarded by the benchmark, and the slower system would therefore be measured as faster.
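As a quick sanity check on those numbers, here is the back-of-the-envelope arithmetic behind the example (not part of any benchmark code):

```python
# Back-of-the-envelope check of the boundary-effect example above.
rps = 10            # steady ingestion rate during the run
window_s = 30       # nominal benchmark window
straggler_s = 10    # the single request that lands just before the cutoff

requests = rps * window_s            # 300 requests issued during the window
wall_clock = window_s + straggler_s  # the run only ends once the straggler finishes

print(requests / wall_clock)         # 7.5 RPS measured instead of the expected 10
```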
For more details on benchmarking in general we recommend the documentation of k6: https://grafana.com/docs/k6/latest/.
### Scenarios
We selected a handful of scenarios to simplify the picture; they seem to accurately reflect a larger trend.
1. **Small scenario**: This scenario consists of the first 200 requests from the orca dataset being prompted to the model. The 200 requests total 8k tokens together and are representative of conversation starters. Prefix caching has very limited impact in that scenario, and we feel it's a relatively balanced benchmark for simple use cases.
2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens, which are essentially asking for summaries of large chunks of text. In practical scenarios, this is really useful when you are repeatedly feeding large chunks of code, business data, or documents and asking simple questions about them (summarization, classification, or where to find some data). This scenario is the closest to what a lot of professional use cases seem to be doing, by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most from our recent changes, since we now enable ever larger prompts and ever faster caching.
### Hardware
1. `L4`: This is a single L4 (24GB), which represents small or even home compute capabilities. We tested `meta-llama/Meta-Llama-3.1-8B-Instruct` on it.
2. `4xL4`: This is a beefier deployment, usually used either for very large request volumes on 8B models (the ones under test) or to easily handle any ~30GB model. For this benchmark we tested `meta-llama/Meta-Llama-3.1-8B-Instruct`.
3. `8xH100`: This is one of the beefiest deployments possible. We tested `meta-llama/Meta-Llama-3.1-70B-Instruct` as it's the most representative model of this size. Llama 3.3 wasn't released at the time of benchmarking (it's the exact same model, so it doesn't make any difference).
### Replicating the results
The commands to run the benchmarks are as follows:
1. Prepare the datasets:
```bash
cd text-generation-inference/load_tests
make prepare_orca
python long.py
```
2. Launch the engine:
TGI: `text-generation-launcher --model-id $MODEL_ID --num-shard $N --port 8000` (or docker variant)
vLLM: `vllm serve $MODEL_ID --tensor-parallel-size $N --enable-prefix-caching` (or docker variant)
3. Start scenario:
Small: `MODEL_ID=$MODEL_ID HOST=localhost:8000 k6 run load_tests/common.js`
Long: `MODEL_ID=$MODEL_ID HOST=localhost:8000 k6 run load_tests/long.js`
### Results
![benchmarks_v3](https://raw.githubusercontent.com/huggingface/text-generation-inference/refs/heads/main/assets/v3_benchmarks.png)
Our benchmarking results show significant performance gains, with a 13x speedup over vLLM with prefix caching, and up to 30x speedup without prefix caching. These results are consistent with our production data and demonstrate the effectiveness of our optimized LLM architecture.
Raw results
| | | | | |
|---|---|---|---|---|
|2nd run ||**TGI v3** (time in s)|**vLLM** (s)|**Amount of req**|
|**Llama 3.1 8b**|Small test - L4 - 8B|17.5|19.9|200|
|**Llama 3.1 8b**|Long test* - L4 - 8B|53|57|10|
|**Llama 3.1 8b**|Small test - 4xL4 - 8B|4.8|6|200|
|**Llama 3.1 8b**|Long test - 4xL4 - 8B|3.2|12.5|20|
|**Llama 3.1 70b**|Small test - 8XH100 - 70B|6.2|7.4|200|
|**Llama 3.1 70b**|Long test - 8H100 - 70B|2|27.5|20|
||||||
|1st run ||TGI (s)|vLLM (s)|Amount of req|
|**Llama 3.1 8b**|Small test - L4|19.9|19.9|200|
|**Llama 3.1 8b**|Long test (10) - L4|49.8|55|10|
|**Llama 3.1 8b**|Small test - 4xL4|13|12.6|200|
|**Llama 3.1 8b**|Long test - 4xL4|47|50.3|20|
|**Llama 3.1 70b**|Small test - 8XH100|7.5|7.6|200|
|**Llama 3.1 70b**|Long test - 8H100|12.1|28.3|20|
### Caveats and Limitations
While our results are promising, there are some caveats to consider:
1. **Constrained kv-cache**: If a deployment lacks kv-cache space, many queries will require the same kv-cache slots, leading to contention in the kv-cache. You can limit that effect by limiting `--max-total-tokens` to reduce the impact of individual queries. You can also use more GPUs or larger GPUs in order to increase the size of the kv-cache.
2. **Replication**: In scenarios where multiple replicas are behind a single endpoint, there's no reason for every query from a particular user to hit the same replica, so the cache will not be present, meaning no speed benefit. You can use sticky-session load balancing to force every user to send their requests to the same replica. Do not apply this blindly; it's possible this may not be necessary at all.
## Technical Insights
Our performance gains can be attributed to several key factors:
1. **New Kernels**: Our custom kernels, including `flashinfer` and `flashdecoding`, offer improved performance at large prompt lengths and enable more efficient scheduling.
2. **Prefix Caching**: Our optimized prefix caching structure allows for fast query matching, even for long prompts. The overhead is roughly 6us.
3. **Chunking Code**: Our chunking code enables finer control over compute resources, ensuring optimal performance and reduced VRAM usage.
4. **Kernel Optimizations**: We've implemented various other kernel optimizations, including better kernel selection. Notably, we've implemented several small kernels involved in the queries' bookkeeping, which are particularly efficient on small models. Every kernel launch has an overhead of several milliseconds, so fusing them together increases performance a lot when this bookkeeping is significant relative to the raw model calculations. This typically happens with oversized compute for a particular model, and particularly on small models.
5. **VRAM efficiency**: In the realm of very large requests (100k+ tokens) there are a lot of places which start becoming big memory consumers. We've hunted the biggest ones and found ways to reduce/reuse or delete them. The biggest culprit is probably the `logits` calculation: logits for Llama 3.1-8B take 25.6GB (= 100k tokens * 128k vocabulary * 2 bytes for f16), which is more than the full model, which is 16GB (a quick sanity check follows the list). The thing is that in general we do not need the logits for every prompt token, so we simply removed them and stopped exposing them to users by default. We think this is ok since they are mostly used by researchers. You can enable your deployments to have them again by using the `--enable-prefill-logprobs` flag, but you will experience a reduced maximum prompt size.
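A quick sanity check of that logits figure, re-deriving it from the rounded numbers quoted above:

```python
# Rough size of prompt logits for a 100k-token prompt on Llama 3.1-8B,
# using the rounded figures from the text above.
prompt_tokens = 100_000
vocab_size = 128_000     # ~128k vocabulary
bytes_per_value = 2      # f16

logits_gb = prompt_tokens * vocab_size * bytes_per_value / 1e9
print(f"{logits_gb:.1f} GB")  # ~25.6 GB, more than the ~16 GB of model weights
```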
## Future Directions
While we've made significant progress, there are still opportunities for improvement:
1. **Special models**: All LLMs come with the aforementioned improvements. Some specific sets of features might not (some quantizations, speculation, or VLMs, for instance, are harder to optimize with the same level of detail).
2. **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisioned, such as shared KV-cache solutions (like Redis or memcached) or innovative storage approaches. It is an area of ongoing research for us.
3. **Multimodal models**: We are also investigating quite a lot of other kinds of models, like audio-to-audio, image/video generation, and other hybrids, where we see a lot of potential in applying the same principles we've applied in TGI to maximize performance.
By sharing our benchmarking methodology, results, and technical insights, we aim to contribute to the ongoing development of more efficient and effective LLMs.

View File

@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.

View File

@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1-rocm \
ghcr.io/huggingface/text-generation-inference:3.0.1-rocm \
--model-id $model
```

View File

@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1-intel-xpu \
ghcr.io/huggingface/text-generation-inference:3.0.1-intel-xpu \
--model-id $model --cuda-graphs 0
```
@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1-intel-cpu \
ghcr.io/huggingface/text-generation-inference:3.0.1-intel-cpu \
--model-id $model --cuda-graphs 0
```

View File

@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1 \
ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```

View File

@ -0,0 +1,13 @@
# Multi-backend support
TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
With multi-backend support, you can choose the backend that best suits your needs,
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
TGI remains consistent across backends, allowing you to switch between them seamlessly.
**Supported backends:**
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
It utilizes specialized optimizations and custom kernels for enhanced performance.
However, it requires a model-specific compilation step for each GPU architecture.

View File

@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1 \
ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```
@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:2.4.1 --help
docker run ghcr.io/huggingface/text-generation-inference:3.0.1 --help
```
</Tip>

View File

@ -163,7 +163,7 @@ hub = {
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="2.4.1"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.1"),
env=hub,
role=role,
)

View File

@ -467,6 +467,16 @@ Options:
[env: PAYLOAD_LIMIT=]
[default: 2000000]
```
## ENABLE_PREFILL_LOGPROBS
```shell
--enable-prefill-logprobs
Enables prefill logprobs
Logprobs in the prompt are deactivated by default because they consume a large amount of VRAM (especially for long prompts). Using this flag reallows users to ask for them.
[env: ENABLE_PREFILL_LOGPROBS=]
```
## HELP
```shell

View File

@ -0,0 +1,22 @@
import os
import json
for root, dirs, files in os.walk("."):
for filename in files:
if filename.endswith(".json"):
with open(os.path.join(root, filename), "r") as f:
data = json.load(f)
print(os.path.join(root, filename))
try:
if filename.endswith("_load.json"):
for i in range(len(data)):
data[i]["details"]["prefill"] = []
else:
data["details"]["prefill"] = []
except Exception:
pass
with open(os.path.join(root, filename), "w") as f:
json.dump(data, f, indent=2, ensure_ascii=False)

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.4902344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.4902344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.4902344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.4902344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -6.3867188,
"text": "What"
},
{
"id": 374,
"logprob": -1.1318359,
"text": " is"
},
{
"id": 5655,
"logprob": -9.6875,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.3007812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.4902344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
},
{
"id": 30,
"logprob": -1.5,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,28 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,33 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
},
{
"id": 30,
"logprob": -1.5,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -103,33 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
},
{
"id": 30,
"logprob": -1.5,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -202,33 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
},
{
"id": 30,
"logprob": -1.5,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -301,33 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 3838,
"logprob": null,
"text": "What"
},
{
"id": 374,
"logprob": -8.59375,
"text": " is"
},
{
"id": 5538,
"logprob": -10.921875,
"text": " deep"
},
{
"id": 6832,
"logprob": -0.56347656,
"text": " learning"
},
{
"id": 30,
"logprob": -1.5,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5390625,
"text": "What"
},
{
"id": 374,
"logprob": -0.86035156,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4912109,
"text": " learning"
},
{
"id": 30,
"logprob": -2.1152344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5390625,
"text": "What"
},
{
"id": 374,
"logprob": -0.86035156,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4912109,
"text": " learning"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5390625,
"text": "What"
},
{
"id": 374,
"logprob": -0.86035156,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4912109,
"text": " learning"
},
{
"id": 30,
"logprob": -2.1152344,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5351562,
"text": "What"
},
{
"id": 374,
"logprob": -0.85791016,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4882812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.1210938,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5351562,
"text": "What"
},
{
"id": 374,
"logprob": -0.85791016,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4882812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.1210938,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.5351562,
"text": "What"
},
{
"id": 374,
"logprob": -0.85791016,
"text": " is"
},
{
"id": 5655,
"logprob": -8.828125,
"text": " deep"
},
{
"id": 6975,
"logprob": -1.4882812,
"text": " learning"
},
{
"id": 30,
"logprob": -2.1210938,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.703125,
"text": "What"
},
{
"id": 338,
"logprob": -1.4765625,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8583984,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7548828,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 338,
"logprob": -9.0859375,
"text": "is"
},
{
"id": 21784,
"logprob": -10.90625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -2.65625,
"text": "Learning"
},
{
"id": 29973,
"logprob": -4.8085938,
"text": "?"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.703125,
"text": "What"
},
{
"id": 338,
"logprob": -1.4765625,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8652344,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7548828,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.703125,
"text": "What"
},
{
"id": 338,
"logprob": -1.4765625,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8583984,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7548828,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.703125,
"text": "What"
},
{
"id": 338,
"logprob": -1.4765625,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8652344,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7548828,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.703125,
"text": "What"
},
{
"id": 338,
"logprob": -1.4765625,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8652344,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7548828,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -4,38 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.6914062,
"text": "What"
},
{
"id": 338,
"logprob": -1.4746094,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8623047,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7558594,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -108,38 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.6914062,
"text": "What"
},
{
"id": 338,
"logprob": -1.4746094,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8623047,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7558594,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -212,38 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.6914062,
"text": "What"
},
{
"id": 338,
"logprob": -1.4746094,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8623047,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7558594,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -316,38 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.6914062,
"text": "What"
},
{
"id": 338,
"logprob": -1.4746094,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8623047,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7558594,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,38 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -7.6914062,
"text": "What"
},
{
"id": 338,
"logprob": -1.4746094,
"text": "is"
},
{
"id": 21784,
"logprob": -9.390625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.8623047,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.7558594,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 4,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,23 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -93,23 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -182,23 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -271,23 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 100000,
"logprob": null,
"text": "<begin▁of▁sentence>"
},
{
"id": 3533,
"logprob": -9.625,
"text": "Test"
},
{
"id": 3102,
"logprob": -11.25,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,313 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 50,
"logprob": null,
"text": "G"
},
{
"id": 330,
"logprob": -5.96875,
"text": "ir"
},
{
"id": 1622,
"logprob": -5.6132812,
"text": "af"
},
{
"id": 249,
"logprob": -6.5039062,
"text": "at"
},
{
"id": 1480,
"logprob": -8.078125,
"text": "ron"
},
{
"id": 304,
"logprob": -2.3261719,
"text": " is"
},
{
"id": 23866,
"logprob": -9.59375,
"text": " obsessed"
},
{
"id": 335,
"logprob": -0.048339844,
"text": " with"
},
{
"id": 26680,
"logprob": -4.0,
"text": " gir"
},
{
"id": 1903,
"logprob": -0.07556152,
"text": "aff"
},
{
"id": 255,
"logprob": -0.0067749023,
"text": "es"
},
{
"id": 23,
"logprob": -1.546875,
"text": ","
},
{
"id": 248,
"logprob": -4.3320312,
"text": " the"
},
{
"id": 758,
"logprob": -3.734375,
"text": " most"
},
{
"id": 21735,
"logprob": -5.109375,
"text": " glorious"
},
{
"id": 5985,
"logprob": -2.09375,
"text": " animal"
},
{
"id": 313,
"logprob": -1.1835938,
"text": " on"
},
{
"id": 248,
"logprob": -0.77685547,
"text": " the"
},
{
"id": 1936,
"logprob": -2.3828125,
"text": " face"
},
{
"id": 275,
"logprob": -0.004432678,
"text": " of"
},
{
"id": 414,
"logprob": -1.9677734,
"text": " this"
},
{
"id": 6490,
"logprob": -2.046875,
"text": " Earth"
},
{
"id": 25,
"logprob": -0.28198242,
"text": "."
},
{
"id": 401,
"logprob": -7.9179688,
"text": " G"
},
{
"id": 6013,
"logprob": -2.2753906,
"text": "ira"
},
{
"id": 694,
"logprob": -0.6230469,
"text": "ft"
},
{
"id": 1480,
"logprob": -0.20874023,
"text": "ron"
},
{
"id": 9369,
"logprob": -4.5507812,
"text": " believes"
},
{
"id": 455,
"logprob": -4.5664062,
"text": " all"
},
{
"id": 599,
"logprob": -2.7402344,
"text": " other"
},
{
"id": 5632,
"logprob": -0.21948242,
"text": " animals"
},
{
"id": 362,
"logprob": -0.7675781,
"text": " are"
},
{
"id": 23981,
"logprob": -5.0,
"text": " irrelevant"
},
{
"id": 635,
"logprob": -4.234375,
"text": " when"
},
{
"id": 4354,
"logprob": -0.5131836,
"text": " compared"
},
{
"id": 271,
"logprob": -0.103637695,
"text": " to"
},
{
"id": 248,
"logprob": -0.58447266,
"text": " the"
},
{
"id": 21735,
"logprob": -3.6835938,
"text": " glorious"
},
{
"id": 64398,
"logprob": -1.8173828,
"text": " majesty"
},
{
"id": 275,
"logprob": -0.23510742,
"text": " of"
},
{
"id": 248,
"logprob": -0.35473633,
"text": " the"
},
{
"id": 26680,
"logprob": -0.24633789,
"text": " gir"
},
{
"id": 23226,
"logprob": -0.02960205,
"text": "affe"
},
{
"id": 25,
"logprob": -0.17333984,
"text": "."
},
{
"id": 193,
"logprob": -1.3935547,
"text": "\n"
},
{
"id": 23626,
"logprob": -10.0625,
"text": "Daniel"
},
{
"id": 37,
"logprob": -4.59375,
"text": ":"
},
{
"id": 23090,
"logprob": -6.9375,
"text": " Hello"
},
{
"id": 23,
"logprob": -0.99365234,
"text": ","
},
{
"id": 29033,
"logprob": -2.2324219,
"text": " Gir"
},
{
"id": 1622,
"logprob": -0.10809326,
"text": "af"
},
{
"id": 249,
"logprob": -0.042663574,
"text": "at"
},
{
"id": 1480,
"logprob": -0.0024776459,
"text": "ron"
},
{
"id": 12,
"logprob": -1.4277344,
"text": "!"
},
{
"id": 193,
"logprob": -1.1015625,
"text": "\n"
},
{
"id": 50,
"logprob": -0.05709839,
"text": "G"
},
{
"id": 330,
"logprob": -0.13208008,
"text": "ir"
},
{
"id": 1622,
"logprob": -0.0071487427,
"text": "af"
},
{
"id": 249,
"logprob": -0.008468628,
"text": "at"
},
{
"id": 1480,
"logprob": -0.00068998337,
"text": "ron"
},
{
"id": 37,
"logprob": -0.0074691772,
"text": ":"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 330,
"logprob": null,
"text": "ir"
},
{
"id": 1622,
"logprob": -7.8125,
"text": "af"
},
{
"id": 249,
"logprob": -4.5,
"text": "at"
},
{
"id": 1480,
"logprob": -10.875,
"text": "ron"
},
{
"id": 37,
"logprob": -3.6875,
"text": ":"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0625,
"text": "Test"
},
{
"id": 3853,
"logprob": -11.0,
"text": " request"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,23 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -93,23 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -182,23 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -271,23 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0625,
"text": "Test"
},
{
"id": 3853,
"logprob": -11.0,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,188 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 106,
"logprob": -47.25,
"text": "<start_of_turn>"
},
{
"id": 1645,
"logprob": -18.875,
"text": "user"
},
{
"id": 235292,
"logprob": -7.15625,
"text": ":"
},
{
"id": 108,
"logprob": -4.78125,
"text": "\n"
},
{
"id": 5559,
"logprob": -10.0,
"text": "Write"
},
{
"id": 476,
"logprob": -0.1171875,
"text": " a"
},
{
"id": 19592,
"logprob": -2.46875,
"text": " poem"
},
{
"id": 577,
"logprob": -5.84375,
"text": " to"
},
{
"id": 1707,
"logprob": -6.375,
"text": " help"
},
{
"id": 682,
"logprob": -2.125,
"text": " me"
},
{
"id": 5434,
"logprob": -1.546875,
"text": " remember"
},
{
"id": 573,
"logprob": -0.62890625,
"text": " the"
},
{
"id": 1370,
"logprob": -6.65625,
"text": " first"
},
{
"id": 235248,
"logprob": -1.84375,
"text": " "
},
{
"id": 235274,
"logprob": -0.45117188,
"text": "1"
},
{
"id": 235276,
"logprob": -0.07421875,
"text": "0"
},
{
"id": 6635,
"logprob": -2.109375,
"text": " elements"
},
{
"id": 611,
"logprob": -0.4140625,
"text": " on"
},
{
"id": 573,
"logprob": -0.0009536743,
"text": " the"
},
{
"id": 26163,
"logprob": -0.033203125,
"text": " periodic"
},
{
"id": 3037,
"logprob": -0.0002670288,
"text": " table"
},
{
"id": 235269,
"logprob": -4.75,
"text": ","
},
{
"id": 7385,
"logprob": -11.625,
"text": " giving"
},
{
"id": 1853,
"logprob": -4.875,
"text": " each"
},
{
"id": 5356,
"logprob": -0.38867188,
"text": " element"
},
{
"id": 1277,
"logprob": -3.65625,
"text": " its"
},
{
"id": 1997,
"logprob": -4.4375,
"text": " own"
},
{
"id": 2017,
"logprob": -0.29882812,
"text": " line"
},
{
"id": 235265,
"logprob": -0.16699219,
"text": "."
},
{
"id": 107,
"logprob": -25.625,
"text": "<end_of_turn>"
},
{
"id": 108,
"logprob": -6.75,
"text": "\n"
},
{
"id": 106,
"logprob": -39.5,
"text": "<start_of_turn>"
},
{
"id": 2516,
"logprob": -32.5,
"text": "model"
},
{
"id": 235292,
"logprob": -10.125,
"text": ":"
},
{
"id": 108,
"logprob": -3.421875,
"text": "\n"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -4,188 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 106,
"logprob": -47.25,
"text": "<start_of_turn>"
},
{
"id": 1645,
"logprob": -18.875,
"text": "user"
},
{
"id": 235292,
"logprob": -7.25,
"text": ":"
},
{
"id": 108,
"logprob": -4.78125,
"text": "\n"
},
{
"id": 5559,
"logprob": -10.0,
"text": "Write"
},
{
"id": 476,
"logprob": -0.111816406,
"text": " a"
},
{
"id": 19592,
"logprob": -2.46875,
"text": " poem"
},
{
"id": 577,
"logprob": -5.78125,
"text": " to"
},
{
"id": 1707,
"logprob": -6.375,
"text": " help"
},
{
"id": 682,
"logprob": -2.125,
"text": " me"
},
{
"id": 5434,
"logprob": -1.59375,
"text": " remember"
},
{
"id": 573,
"logprob": -0.62890625,
"text": " the"
},
{
"id": 1370,
"logprob": -6.625,
"text": " first"
},
{
"id": 235248,
"logprob": -1.7421875,
"text": " "
},
{
"id": 235274,
"logprob": -0.44921875,
"text": "1"
},
{
"id": 235276,
"logprob": -0.07128906,
"text": "0"
},
{
"id": 6635,
"logprob": -2.109375,
"text": " elements"
},
{
"id": 611,
"logprob": -0.40429688,
"text": " on"
},
{
"id": 573,
"logprob": -0.0009918213,
"text": " the"
},
{
"id": 26163,
"logprob": -0.03540039,
"text": " periodic"
},
{
"id": 3037,
"logprob": -0.00028800964,
"text": " table"
},
{
"id": 235269,
"logprob": -4.71875,
"text": ","
},
{
"id": 7385,
"logprob": -11.875,
"text": " giving"
},
{
"id": 1853,
"logprob": -4.875,
"text": " each"
},
{
"id": 5356,
"logprob": -0.38867188,
"text": " element"
},
{
"id": 1277,
"logprob": -3.65625,
"text": " its"
},
{
"id": 1997,
"logprob": -4.4375,
"text": " own"
},
{
"id": 2017,
"logprob": -0.3046875,
"text": " line"
},
{
"id": 235265,
"logprob": -0.16113281,
"text": "."
},
{
"id": 107,
"logprob": -25.625,
"text": "<end_of_turn>"
},
{
"id": 108,
"logprob": -6.75,
"text": "\n"
},
{
"id": 106,
"logprob": -39.25,
"text": "<start_of_turn>"
},
{
"id": 2516,
"logprob": -32.5,
"text": "model"
},
{
"id": 235292,
"logprob": -10.1875,
"text": ":"
},
{
"id": 108,
"logprob": -3.296875,
"text": "\n"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -258,188 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 106,
"logprob": -47.25,
"text": "<start_of_turn>"
},
{
"id": 1645,
"logprob": -18.875,
"text": "user"
},
{
"id": 235292,
"logprob": -7.25,
"text": ":"
},
{
"id": 108,
"logprob": -4.78125,
"text": "\n"
},
{
"id": 5559,
"logprob": -10.0,
"text": "Write"
},
{
"id": 476,
"logprob": -0.111816406,
"text": " a"
},
{
"id": 19592,
"logprob": -2.46875,
"text": " poem"
},
{
"id": 577,
"logprob": -5.78125,
"text": " to"
},
{
"id": 1707,
"logprob": -6.375,
"text": " help"
},
{
"id": 682,
"logprob": -2.125,
"text": " me"
},
{
"id": 5434,
"logprob": -1.59375,
"text": " remember"
},
{
"id": 573,
"logprob": -0.62890625,
"text": " the"
},
{
"id": 1370,
"logprob": -6.625,
"text": " first"
},
{
"id": 235248,
"logprob": -1.7421875,
"text": " "
},
{
"id": 235274,
"logprob": -0.44921875,
"text": "1"
},
{
"id": 235276,
"logprob": -0.07128906,
"text": "0"
},
{
"id": 6635,
"logprob": -2.109375,
"text": " elements"
},
{
"id": 611,
"logprob": -0.40429688,
"text": " on"
},
{
"id": 573,
"logprob": -0.0009918213,
"text": " the"
},
{
"id": 26163,
"logprob": -0.03540039,
"text": " periodic"
},
{
"id": 3037,
"logprob": -0.00028800964,
"text": " table"
},
{
"id": 235269,
"logprob": -4.71875,
"text": ","
},
{
"id": 7385,
"logprob": -11.875,
"text": " giving"
},
{
"id": 1853,
"logprob": -4.875,
"text": " each"
},
{
"id": 5356,
"logprob": -0.38867188,
"text": " element"
},
{
"id": 1277,
"logprob": -3.65625,
"text": " its"
},
{
"id": 1997,
"logprob": -4.4375,
"text": " own"
},
{
"id": 2017,
"logprob": -0.3046875,
"text": " line"
},
{
"id": 235265,
"logprob": -0.16113281,
"text": "."
},
{
"id": 107,
"logprob": -25.625,
"text": "<end_of_turn>"
},
{
"id": 108,
"logprob": -6.75,
"text": "\n"
},
{
"id": 106,
"logprob": -39.25,
"text": "<start_of_turn>"
},
{
"id": 2516,
"logprob": -32.5,
"text": "model"
},
{
"id": 235292,
"logprob": -10.1875,
"text": ":"
},
{
"id": 108,
"logprob": -3.296875,
"text": "\n"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -512,188 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 106,
"logprob": -47.25,
"text": "<start_of_turn>"
},
{
"id": 1645,
"logprob": -18.875,
"text": "user"
},
{
"id": 235292,
"logprob": -7.15625,
"text": ":"
},
{
"id": 108,
"logprob": -4.78125,
"text": "\n"
},
{
"id": 5559,
"logprob": -10.0,
"text": "Write"
},
{
"id": 476,
"logprob": -0.1171875,
"text": " a"
},
{
"id": 19592,
"logprob": -2.46875,
"text": " poem"
},
{
"id": 577,
"logprob": -5.84375,
"text": " to"
},
{
"id": 1707,
"logprob": -6.375,
"text": " help"
},
{
"id": 682,
"logprob": -2.125,
"text": " me"
},
{
"id": 5434,
"logprob": -1.546875,
"text": " remember"
},
{
"id": 573,
"logprob": -0.62890625,
"text": " the"
},
{
"id": 1370,
"logprob": -6.65625,
"text": " first"
},
{
"id": 235248,
"logprob": -1.84375,
"text": " "
},
{
"id": 235274,
"logprob": -0.45117188,
"text": "1"
},
{
"id": 235276,
"logprob": -0.07421875,
"text": "0"
},
{
"id": 6635,
"logprob": -2.109375,
"text": " elements"
},
{
"id": 611,
"logprob": -0.4140625,
"text": " on"
},
{
"id": 573,
"logprob": -0.0009536743,
"text": " the"
},
{
"id": 26163,
"logprob": -0.033203125,
"text": " periodic"
},
{
"id": 3037,
"logprob": -0.0002670288,
"text": " table"
},
{
"id": 235269,
"logprob": -4.75,
"text": ","
},
{
"id": 7385,
"logprob": -11.625,
"text": " giving"
},
{
"id": 1853,
"logprob": -4.875,
"text": " each"
},
{
"id": 5356,
"logprob": -0.38867188,
"text": " element"
},
{
"id": 1277,
"logprob": -3.65625,
"text": " its"
},
{
"id": 1997,
"logprob": -4.4375,
"text": " own"
},
{
"id": 2017,
"logprob": -0.29882812,
"text": " line"
},
{
"id": 235265,
"logprob": -0.16699219,
"text": "."
},
{
"id": 107,
"logprob": -25.625,
"text": "<end_of_turn>"
},
{
"id": 108,
"logprob": -6.75,
"text": "\n"
},
{
"id": 106,
"logprob": -39.5,
"text": "<start_of_turn>"
},
{
"id": 2516,
"logprob": -32.5,
"text": "model"
},
{
"id": 235292,
"logprob": -10.125,
"text": ":"
},
{
"id": 108,
"logprob": -3.421875,
"text": "\n"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -766,188 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 106,
"logprob": -47.25,
"text": "<start_of_turn>"
},
{
"id": 1645,
"logprob": -18.875,
"text": "user"
},
{
"id": 235292,
"logprob": -7.25,
"text": ":"
},
{
"id": 108,
"logprob": -4.78125,
"text": "\n"
},
{
"id": 5559,
"logprob": -10.0,
"text": "Write"
},
{
"id": 476,
"logprob": -0.111816406,
"text": " a"
},
{
"id": 19592,
"logprob": -2.46875,
"text": " poem"
},
{
"id": 577,
"logprob": -5.78125,
"text": " to"
},
{
"id": 1707,
"logprob": -6.375,
"text": " help"
},
{
"id": 682,
"logprob": -2.125,
"text": " me"
},
{
"id": 5434,
"logprob": -1.59375,
"text": " remember"
},
{
"id": 573,
"logprob": -0.62890625,
"text": " the"
},
{
"id": 1370,
"logprob": -6.625,
"text": " first"
},
{
"id": 235248,
"logprob": -1.7421875,
"text": " "
},
{
"id": 235274,
"logprob": -0.44921875,
"text": "1"
},
{
"id": 235276,
"logprob": -0.07128906,
"text": "0"
},
{
"id": 6635,
"logprob": -2.109375,
"text": " elements"
},
{
"id": 611,
"logprob": -0.40429688,
"text": " on"
},
{
"id": 573,
"logprob": -0.0009918213,
"text": " the"
},
{
"id": 26163,
"logprob": -0.03540039,
"text": " periodic"
},
{
"id": 3037,
"logprob": -0.00028800964,
"text": " table"
},
{
"id": 235269,
"logprob": -4.71875,
"text": ","
},
{
"id": 7385,
"logprob": -11.875,
"text": " giving"
},
{
"id": 1853,
"logprob": -4.875,
"text": " each"
},
{
"id": 5356,
"logprob": -0.38867188,
"text": " element"
},
{
"id": 1277,
"logprob": -3.65625,
"text": " its"
},
{
"id": 1997,
"logprob": -4.4375,
"text": " own"
},
{
"id": 2017,
"logprob": -0.3046875,
"text": " line"
},
{
"id": 235265,
"logprob": -0.16113281,
"text": "."
},
{
"id": 107,
"logprob": -25.625,
"text": "<end_of_turn>"
},
{
"id": 108,
"logprob": -6.75,
"text": "\n"
},
{
"id": 106,
"logprob": -39.25,
"text": "<start_of_turn>"
},
{
"id": 2516,
"logprob": -32.5,
"text": "model"
},
{
"id": 235292,
"logprob": -10.1875,
"text": ":"
},
{
"id": 108,
"logprob": -3.296875,
"text": "\n"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.640625,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.34375,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.6484375,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.3671875,
"text": " request"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,23 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.6484375,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.359375,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -93,23 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.6484375,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.34375,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -182,23 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.640625,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.3671875,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -271,23 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -9.6484375,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.359375,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,33 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2061,
"logprob": null,
"text": "What"
},
{
"id": 318,
"logprob": -3.1835938,
"text": " is"
},
{
"id": 2769,
"logprob": -9.171875,
"text": " deep"
},
{
"id": 4673,
"logprob": -1.6425781,
"text": " learning"
},
{
"id": 30,
"logprob": -0.7314453,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -4,33 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2061,
"logprob": null,
"text": "What"
},
{
"id": 318,
"logprob": -3.1835938,
"text": " is"
},
{
"id": 2769,
"logprob": -9.171875,
"text": " deep"
},
{
"id": 4673,
"logprob": -1.6425781,
"text": " learning"
},
{
"id": 30,
"logprob": -0.7314453,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -103,33 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2061,
"logprob": null,
"text": "What"
},
{
"id": 318,
"logprob": -3.1660156,
"text": " is"
},
{
"id": 2769,
"logprob": -9.1796875,
"text": " deep"
},
{
"id": 4673,
"logprob": -1.6376953,
"text": " learning"
},
{
"id": 30,
"logprob": -0.72216797,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -202,33 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2061,
"logprob": null,
"text": "What"
},
{
"id": 318,
"logprob": -3.1660156,
"text": " is"
},
{
"id": 2769,
"logprob": -9.1796875,
"text": " deep"
},
{
"id": 4673,
"logprob": -1.6376953,
"text": " learning"
},
{
"id": 30,
"logprob": -0.72216797,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -301,33 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2061,
"logprob": null,
"text": "What"
},
{
"id": 318,
"logprob": -3.1660156,
"text": " is"
},
{
"id": 2769,
"logprob": -9.1796875,
"text": " deep"
},
{
"id": 4673,
"logprob": -1.6376953,
"text": " learning"
},
{
"id": 30,
"logprob": -0.72216797,
"text": "?"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -13.90625,
"text": "Test"
},
{
"id": 2009,
"logprob": -12.328125,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,88 +3,7 @@
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 30,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 5235,
"logprob": -10.0625,
"text": "info"
},
{
"id": 29901,
"logprob": -3.2324219,
"text": ":"
},
{
"id": 13260,
"logprob": -10.625,
"text": "dav"
},
{
"id": 333,
"logprob": -0.08276367,
"text": "id"
},
{
"id": 8753,
"logprob": -7.5273438,
"text": "hol"
},
{
"id": 17559,
"logprob": -3.8476562,
"text": "tz"
},
{
"id": 763,
"logprob": -10.140625,
"text": "like"
},
{
"id": 10697,
"logprob": -10.1953125,
"text": "trees"
},
{
"id": 322,
"logprob": -2.5742188,
"text": "and"
},
{
"id": 756,
"logprob": -7.4882812,
"text": "has"
},
{
"id": 1023,
"logprob": -5.0507812,
"text": "two"
},
{
"id": 274,
"logprob": -5.3164062,
"text": "c"
},
{
"id": 1446,
"logprob": -0.6694336,
"text": "ats"
},
{
"id": 29889,
"logprob": -0.9995117,
"text": "."
},
{
"id": 29871,
"logprob": -4.2421875,
"text": ""
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -4,53 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -123,53 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.03125,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04244995,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4863281,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32714844,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -242,53 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -361,53 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,43 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 806,
"logprob": -11.890625,
"text": "Wh"
},
{
"id": 1446,
"logprob": -3.6699219,
"text": "ats"
},
{
"id": 2921,
"logprob": -7.8203125,
"text": "Go"
},
{
"id": 468,
"logprob": -8.0703125,
"text": "og"
},
{
"id": 793,
"logprob": -2.1875,
"text": "les"
},
{
"id": 16332,
"logprob": -9.7109375,
"text": "DNS"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "stop_sequence",
"generated_tokens": 5,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

View File

@ -4,23 +4,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -93,23 +77,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -182,23 +150,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{
@ -271,23 +223,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,23 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.546875,
"text": "request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,18 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2323,
"logprob": null,
"text": "Test"
},
{
"id": 1715,
"logprob": -11.4375,
"text": " request"
}
],
"prefill": [],
"seed": null,
"tokens": [
{

View File

@ -3,18 +3,7 @@
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2323,
"logprob": null,
"text": "Test"
},
{
"id": 1715,
"logprob": -11.453125,
"text": " request"
}
],
"prefill": [],
"seed": 0,
"tokens": [
{

Some files were not shown because too many files have changed in this diff.