add initial Dockerfile for TRTLLM backend

Morgan Funtowicz 2024-07-19 22:08:12 +00:00
parent 6300bab8b4
commit d5464d2f80
5 changed files with 137 additions and 10 deletions

Cargo.lock (generated, 76 lines changed)

@@ -2278,6 +2278,20 @@ dependencies = [
  "urlencoding",
 ]
 
+[[package]]
+name = "opentelemetry"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+ "js-sys",
+ "once_cell",
+ "pin-project-lite",
+ "thiserror",
+]
+
 [[package]]
 name = "opentelemetry-otlp"
 version = "0.13.0"
@@ -2377,6 +2391,26 @@ dependencies = [
  "thiserror",
 ]
 
+[[package]]
+name = "opentelemetry_sdk"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd"
+dependencies = [
+ "async-trait",
+ "futures-channel",
+ "futures-executor",
+ "futures-util",
+ "glob",
+ "lazy_static",
+ "once_cell",
+ "opentelemetry 0.23.0",
+ "ordered-float 4.2.0",
+ "percent-encoding",
+ "rand",
+ "thiserror",
+]
+
 [[package]]
 name = "option-ext"
 version = "0.2.0"
@@ -3464,11 +3498,15 @@ dependencies = [
  "cmake",
  "cxx",
  "cxx-build",
+ "log",
  "text-generation-router",
  "thiserror",
  "tokenizers",
  "tokio",
  "tokio-stream",
+ "tracing",
+ "tracing-opentelemetry 0.24.0",
+ "tracing-subscriber",
 ]
 
 [[package]]
@@ -3583,18 +3621,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.61"
+version = "1.0.62"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
+checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.61"
+version = "1.0.62"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
+checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4025,7 +4063,25 @@ dependencies = [
  "tracing-core",
  "tracing-log 0.2.0",
  "tracing-subscriber",
- "web-time",
+ "web-time 0.2.4",
 ]
 
+[[package]]
+name = "tracing-opentelemetry"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4"
+dependencies = [
+ "js-sys",
+ "once_cell",
+ "opentelemetry 0.23.0",
+ "opentelemetry_sdk 0.23.0",
+ "smallvec",
+ "tracing",
+ "tracing-core",
+ "tracing-log 0.2.0",
+ "tracing-subscriber",
+ "web-time 1.1.0",
+]
+
 [[package]]
@@ -4404,6 +4460,16 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "webpki"
 version = "0.22.4"

backends/trtllm/CMakeLists.txt

@@ -9,9 +9,9 @@ include(ExternalProject)
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
 option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
 set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
-set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
-set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
-set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
+set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
+set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
+set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 
 #### External dependencies ####
 include(cmake/fmt.cmake)
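
All three TensorRT locations remain ordinary cache variables, so they can still be overridden at configure time. A minimal sketch of such an override, assuming the backend's CMake project sits under backends/trtllm and TensorRT was unpacked to /opt/tensorrt (both paths are illustrative, not taken from the commit):

    # Hypothetical configure step; the -D names match the cache entries above.
    cmake -B build backends/trtllm \
        -DTGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST="75-real;90-real" \
        -DTGI_TRTLLM_BACKEND_TRT_ROOT=/opt/tensorrt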

Dockerfile (TRTLLM backend, new file)

@@ -0,0 +1,49 @@
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"

# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest as chef
WORKDIR /usr/src/text-generation-inference

FROM chef as planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# CUDA dependent dependencies resolver stage
FROM nvcr.io/nvidia/pytorch:24.05-py3 as cuda-builder
RUN apt update && apt install -y \
    cmake \
    gcc \
    g++ \
    git \
    git-lfs \
    ninja-build

# Install TensorRT
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
    /opt/install_tensorrt.sh

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
    chmod -R a+w $HOME/.rustup && \
    chmod -R a+w $HOME/.cargo

ENV PATH="$HOME/.cargo/bin:$PATH"
RUN $HOME/.cargo/bin/cargo install cargo-chef

# Backend build step
WORKDIR /usr/src/text-generation-inference

# Cache dependencies
COPY --from=planner /usr/src/text-generation-inference/recipe.json .
RUN $HOME/.cargo/bin/cargo chef cook --release --recipe-path recipe.json

# Build actual TGI
COPY . .
RUN $HOME/.cargo/bin/cargo build --release --bin text-generation-backends-trtllm

FROM nvcr.io/nvidia/pytorch:24.05-py3
WORKDIR /opt

COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=cuda-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /opt/text-generation-launcher
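
For reference, a hedged invocation of this Dockerfile; the Dockerfile_trtllm filename and the image tag are assumptions, not taken from the commit. Note that the CUDA_ARCH_LIST build arg is declared before the first FROM but not yet consumed by any stage, so overriding it is a no-op at this point:

    # Assumed filename and tag; run from the repository root so COPY . . can see the sources.
    docker build --file Dockerfile_trtllm --tag tgi-trtllm:dev .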

backends/trtllm/build.rs

@@ -4,6 +4,8 @@ use std::path::PathBuf;
 use cxx_build::CFG;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
+const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
 
 fn main() {
     // Misc variables
@@ -22,7 +24,15 @@ fn main() {
             true => "Debug",
             false => "Release",
         })
+        .define(
+            "TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST",
+            CUDA_ARCH_LIST.unwrap_or("90-real"), // Hopper by default
+        )
+        .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
+        .define(
+            "TGI_TRTLLM_BACKEND_TRT_ROOT",
+            TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"),
+        )
         .build();
 
     // Additional transitive CMake dependencies
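
Because option_env! is expanded when the build script itself is compiled, both variables must be visible in cargo's environment; left unset, the defaults above apply (90-real and /usr/local/tensorrt). A minimal sketch with illustrative values:

    # Compile-time overrides read by build.rs through option_env!; the values are examples.
    CUDA_ARCH_LIST="75-real;86-real" \
    TENSORRT_ROOT_DIR="/opt/tensorrt" \
    cargo build --release --bin text-generation-backends-trtllm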

backends/trtllm/cmake/trtllm.cmake

@@ -1,3 +1,6 @@
+set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
+
 set(USE_CXX11_ABI ON)
 set(NVTX_DISABLE OFF)
 set(BUILD_PYT OFF)
@@ -5,8 +8,6 @@ set(BUILD_PYBIND OFF)
 set(BUILD_MICRO_BENCHMARKS OFF)
 set(BUILD_BENCHMARKS OFF)
 set(BUILD_TESTS OFF)
-set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -15,6 +16,7 @@ if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
     set(FAST_BUILD ON)
 else ()
     set(FAST_BUILD OFF)
+    set(FAST_MATH ON)
 endif ()
 
 fetchcontent_declare(