mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
add initial Dockerfile for TRTLLM backend
This commit is contained in:
parent
6300bab8b4
commit
d5464d2f80
76
Cargo.lock
generated
76
Cargo.lock
generated
@ -2278,6 +2278,20 @@ dependencies = [
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.23.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.13.0"
|
||||
@ -2377,6 +2391,26 @@ dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.23.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"lazy_static",
|
||||
"once_cell",
|
||||
"opentelemetry 0.23.0",
|
||||
"ordered-float 4.2.0",
|
||||
"percent-encoding",
|
||||
"rand",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
@ -3464,11 +3498,15 @@ dependencies = [
|
||||
"cmake",
|
||||
"cxx",
|
||||
"cxx-build",
|
||||
"log",
|
||||
"text-generation-router",
|
||||
"thiserror",
|
||||
"tokenizers",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-opentelemetry 0.24.0",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3583,18 +3621,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.61"
|
||||
version = "1.0.62"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
|
||||
checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.61"
|
||||
version = "1.0.62"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
|
||||
checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@ -4025,7 +4063,25 @@ dependencies = [
|
||||
"tracing-core",
|
||||
"tracing-log 0.2.0",
|
||||
"tracing-subscriber",
|
||||
"web-time",
|
||||
"web-time 0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"opentelemetry 0.23.0",
|
||||
"opentelemetry_sdk 0.23.0",
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log 0.2.0",
|
||||
"tracing-subscriber",
|
||||
"web-time 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -4404,6 +4460,16 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki"
|
||||
version = "0.22.4"
|
||||
|
@ -9,9 +9,9 @@ include(ExternalProject)
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
|
||||
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
|
||||
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
|
||||
|
||||
#### External dependencies ####
|
||||
include(cmake/fmt.cmake)
|
||||
|
49
backends/trtllm/Dockerfile
Normal file
49
backends/trtllm/Dockerfile
Normal file
@ -0,0 +1,49 @@
|
||||
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
|
||||
|
||||
# Build dependencies resolver stage
|
||||
FROM lukemathwalker/cargo-chef:latest as chef
|
||||
WORKDIR /usr/src/text-generation-inference
|
||||
|
||||
FROM chef as planner
|
||||
COPY . .
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
# CUDA dependent dependencies resolver stage
|
||||
FROM nvcr.io/nvidia/pytorch:24.05-py3 as cuda-builder
|
||||
|
||||
RUN apt update && apt install -y \
|
||||
cmake \
|
||||
gcc \
|
||||
g++ \
|
||||
git \
|
||||
git-lfs \
|
||||
ninja-build
|
||||
|
||||
# Install TensorRT
|
||||
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
|
||||
RUN chmod +x /opt/install_tensorrt.sh && \
|
||||
/opt/install_tensorrt.sh
|
||||
|
||||
# Install Rust
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
|
||||
chmod -R a+w $HOME/.rustup && \
|
||||
chmod -R a+w $HOME/.cargo
|
||||
|
||||
ENV PATH="$HOME/.cargo/bin:$PATH"
|
||||
RUN $HOME/.cargo/bin/cargo install cargo-chef
|
||||
|
||||
# Backend build step
|
||||
WORKDIR /usr/src/text-generation-inference
|
||||
|
||||
# Cache dependencies
|
||||
COPY --from=planner /usr/src/text-generation-inference/recipe.json .
|
||||
RUN $HOME/.cargo/bin/cargo chef cook --release --recipe-path recipe.json
|
||||
|
||||
# Build actual TGI
|
||||
COPY . .
|
||||
RUN $HOME/.cargo/bin/cargo build --release --bin text-generation-backends-trtllm
|
||||
|
||||
FROM nvcr.io/nvidia/pytorch:24.05-py3
|
||||
WORKDIR /opt
|
||||
COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt
|
||||
COPY --from=cuda-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /opt/text-generation-launcher
|
@ -4,6 +4,8 @@ use std::path::PathBuf;
|
||||
use cxx_build::CFG;
|
||||
|
||||
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
|
||||
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
|
||||
const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
|
||||
|
||||
fn main() {
|
||||
// Misc variables
|
||||
@ -22,7 +24,15 @@ fn main() {
|
||||
true => "Debug",
|
||||
false => "Release",
|
||||
})
|
||||
.define(
|
||||
"TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST",
|
||||
CUDA_ARCH_LIST.unwrap_or("90-real"), // Hopper by default
|
||||
)
|
||||
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
|
||||
.define(
|
||||
"TGI_TRTLLM_BACKEND_TRT_ROOT",
|
||||
TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"),
|
||||
)
|
||||
.build();
|
||||
|
||||
// Additional transitive CMake dependencies
|
||||
|
@ -1,3 +1,6 @@
|
||||
set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
|
||||
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
|
||||
|
||||
set(USE_CXX11_ABI ON)
|
||||
set(NVTX_DISABLE OFF)
|
||||
set(BUILD_PYT OFF)
|
||||
@ -5,8 +8,6 @@ set(BUILD_PYBIND OFF)
|
||||
set(BUILD_MICRO_BENCHMARKS OFF)
|
||||
set(BUILD_BENCHMARKS OFF)
|
||||
set(BUILD_TESTS OFF)
|
||||
set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
|
||||
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
|
||||
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
|
||||
|
||||
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@ -15,6 +16,7 @@ if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
set(FAST_BUILD ON)
|
||||
else ()
|
||||
set(FAST_BUILD OFF)
|
||||
set(FAST_MATH ON)
|
||||
endif ()
|
||||
|
||||
fetchcontent_declare(
|
||||
|
Loading…
Reference in New Issue
Block a user