From d5464d2f8026372ee15a092510fddfa3bc5cd11d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 19 Jul 2024 22:08:12 +0000 Subject: [PATCH] add initial Dockerfile for TRTLLM backend --- Cargo.lock | 76 ++++++++++++++++++++++++++++-- backends/trtllm/CMakeLists.txt | 6 +-- backends/trtllm/Dockerfile | 49 +++++++++++++++++++ backends/trtllm/build.rs | 10 ++++ backends/trtllm/cmake/trtllm.cmake | 6 ++- 5 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 backends/trtllm/Dockerfile diff --git a/Cargo.lock b/Cargo.lock index 30187dff..cbcd1955 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2278,6 +2278,20 @@ dependencies = [ "urlencoding", ] +[[package]] +name = "opentelemetry" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + [[package]] name = "opentelemetry-otlp" version = "0.13.0" @@ -2377,6 +2391,26 @@ dependencies = [ "thiserror", ] +[[package]] +name = "opentelemetry_sdk" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "lazy_static", + "once_cell", + "opentelemetry 0.23.0", + "ordered-float 4.2.0", + "percent-encoding", + "rand", + "thiserror", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -3464,11 +3498,15 @@ dependencies = [ "cmake", "cxx", "cxx-build", + "log", "text-generation-router", "thiserror", "tokenizers", "tokio", "tokio-stream", + "tracing", + "tracing-opentelemetry 0.24.0", + "tracing-subscriber", ] [[package]] @@ -3583,18 +3621,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.62" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.62" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c" dependencies = [ "proc-macro2", "quote", @@ -4025,7 +4063,25 @@ dependencies = [ "tracing-core", "tracing-log 0.2.0", "tracing-subscriber", - "web-time", + "web-time 0.2.4", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry 0.23.0", + "opentelemetry_sdk 0.23.0", + "smallvec", + "tracing", + "tracing-core", + "tracing-log 0.2.0", + "tracing-subscriber", + "web-time 1.1.0", ] [[package]] @@ -4404,6 +4460,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webpki" version = "0.22.4" diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 5eb8c937..f9fe5a41 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -9,9 +9,9 @@ include(ExternalProject) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA 
architectures to support") -set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located") -set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located") -set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located") +set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located") +set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located") +set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") #### External dependencies #### include(cmake/fmt.cmake) diff --git a/backends/trtllm/Dockerfile b/backends/trtllm/Dockerfile new file mode 100644 index 00000000..5eb04815 --- /dev/null +++ b/backends/trtllm/Dockerfile @@ -0,0 +1,49 @@ +ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" + +# Build dependencies resolver stage +FROM lukemathwalker/cargo-chef:latest as chef +WORKDIR /usr/src/text-generation-inference + +FROM chef as planner +COPY . . 
+RUN cargo chef prepare --recipe-path recipe.json + +# CUDA dependent dependencies resolver stage +FROM nvcr.io/nvidia/pytorch:24.05-py3 as cuda-builder + +RUN apt update && apt install -y \ + cmake \ + gcc \ + g++ \ + git \ + git-lfs \ + ninja-build + +# Install TensorRT +COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh +RUN chmod +x /opt/install_tensorrt.sh && \ + /opt/install_tensorrt.sh + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ + chmod -R a+w $HOME/.rustup && \ + chmod -R a+w $HOME/.cargo + +ENV PATH="/root/.cargo/bin:$PATH" +RUN $HOME/.cargo/bin/cargo install cargo-chef + +# Backend build step +WORKDIR /usr/src/text-generation-inference + +# Cache dependencies +COPY --from=planner /usr/src/text-generation-inference/recipe.json . +RUN $HOME/.cargo/bin/cargo chef cook --release --recipe-path recipe.json + +# Build actual TGI +COPY . . +RUN $HOME/.cargo/bin/cargo build --release --bin text-generation-backends-trtllm + +FROM nvcr.io/nvidia/pytorch:24.05-py3 +WORKDIR /opt +COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt +COPY --from=cuda-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /opt/text-generation-launcher \ No newline at end of file diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index f8f2a2c0..0fa62ec9 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -4,6 +4,8 @@ use std::path::PathBuf; use cxx_build::CFG; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); +const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); fn main() { // Misc variables @@ -22,7 +24,15 @@ fn main() { true => "Debug", false => "Release", }) + .define( + "TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", + CUDA_ARCH_LIST.unwrap_or("90-real"), // Hopper by default + ) .define("CMAKE_CUDA_COMPILER", 
"/usr/local/cuda/bin/nvcc") + .define( + "TGI_TRTLLM_BACKEND_TRT_ROOT", + TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"), + ) .build(); // Additional transitive CMake dependencies diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index c3042bb6..f3f1bbea 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -1,3 +1,6 @@ +set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) +set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}) + set(USE_CXX11_ABI ON) set(NVTX_DISABLE OFF) set(BUILD_PYT OFF) @@ -5,8 +8,6 @@ set(BUILD_PYBIND OFF) set(BUILD_MICRO_BENCHMARKS OFF) set(BUILD_BENCHMARKS OFF) set(BUILD_TESTS OFF) -set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) -set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}) set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST}) message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -15,6 +16,7 @@ if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(FAST_BUILD ON) else () set(FAST_BUILD OFF) + set(FAST_MATH ON) endif () fetchcontent_declare(