diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm
index 2a0636ff..14a74c00 100644
--- a/Dockerfile_trtllm
+++ b/Dockerfile_trtllm
@@ -1,12 +1,14 @@
-ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
+ARG cuda_base=12.8.0
 ARG build_type=release
 ARG ompi_version=4.1.7
 ARG sccache_gha_enabled=off
 ARG actions_cache_url=""
 ARG actions_runtime_token=""
+
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     build-essential \
@@ -98,14 +100,16 @@ COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 
 ENV RUSTC_WRAPPER=sccache
 ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX
-RUN export CMAKE_C_COMPILER_LAUNCHER=sccache && \
+RUN export CC=gcc-14 \
+    export CXX=g++-14 \
+    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
     mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
     cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
     sccache --show-stats
 
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
 RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
@@ -124,7 +128,7 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
 
 # This is used only for the CI/CD
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 26af80be..e54fd116 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -59,7 +59,9 @@ target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugi
 
 # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
 install(TARGETS tgi_trtllm_backend_impl)
-install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
+#install(TARGETS cutlass_src fb_gemm_src fpA_intB_gemm_src gemm_swiglu_sm90_src kernels_src)
+install(TARGETS decoder_attention_0 decoder_attention_1)
+install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention_src executorWorker)
 install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
 if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
     install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
@@ -82,8 +84,9 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
     check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
     if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
         message(STATUS "Enabling non-NVRO detection")
-        target_compile_options(tgi_trtllm_backend_impl "-Wnvro")
+        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
     endif ()
+    target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wall)
 
     cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
     message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")
diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index 4d559fd4..c9918e2c 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -7,7 +7,7 @@ use std::sync::LazyLock;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.6";
+const CUDA_REQUIRED_VERSION: &str = "12.8";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@@ -25,11 +25,12 @@ const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
 // Dependencies
 const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
 const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
     ("dylib", "tensorrt_llm"),
     ("dylib", "tensorrt_llm_nvrtc_wrapper"),
     ("dylib", "nvinfer_plugin_tensorrt_llm"),
-    ("dylib", "decoder_attention"),
+    ("dylib", "decoder_attention_0"),
+    ("dylib", "decoder_attention_1"),
 ];
 
 macro_rules! probe {
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
index 3e9712c0..95a99e9b 100644
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -28,7 +28,7 @@ find_package(Python3 REQUIRED Interpreter)
 fetchcontent_declare(
         trtllm
         GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git
-        GIT_TAG v0.16.0
+        GIT_TAG v0.17.0
         GIT_SHALLOW ON
         DOWNLOAD_EXTRACT_TIMESTAMP
 )
diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh
index f3e7270a..e09db6b1 100755
--- a/backends/trtllm/scripts/install_tensorrt.sh
+++ b/backends/trtllm/scripts/install_tensorrt.sh
@@ -2,13 +2,13 @@
 
 set -ex
 
-TRT_VER_BASE="10.7.0"
-TRT_VER_FULL="${TRT_VER_BASE}.23"
-CUDA_VER="12.6"
-CUDNN_VER="9.5.0.50-1"
-NCCL_VER="2.22.3-1+cuda12.6"
-CUBLAS_VER="12.6.3.3-1"
-NVRTC_VER="12.6.77-1"
+TRT_VER_BASE="10.8.0"
+TRT_VER_FULL="${TRT_VER_BASE}.43"
+CUDA_VER="12.8"
+CUDNN_VER="9.7.0.66-1"
+NCCL_VER="2.25.1-1+cuda${CUDA_VER}"
+CUBLAS_VER="${CUDA_VER}.3.14-1"
+NVRTC_VER="${CUDA_VER}.61-1"
 
 for i in "$@"; do
     case $i in
@@ -73,7 +73,7 @@ install_centos_requirements() {
 install_tensorrt() {
     #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
     #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
-    TRT_CUDA_VERSION="12.6"
+    TRT_CUDA_VERSION="12.8"
 
     if [ -z "$RELEASE_URL_TRT" ];then
         ARCH=${TRT_TARGETARCH}
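
As a quick sanity check for the changes above, a minimal sketch of a build invocation that exercises the new cuda_base and cuda_arch_list arguments; the image tag tgi-trtllm:cuda12.8 and the trimmed arch list are illustrative assumptions, not part of this diff:

# Hypothetical example: build the runtime stage against the CUDA 12.8 base image,
# compiling kernels only for Hopper (90) and Blackwell (100/120) architectures.
docker build \
    -f Dockerfile_trtllm \
    --target runtime \
    --build-arg cuda_base=12.8.0 \
    --build-arg cuda_arch_list="90-real;100-real;120-real" \
    --build-arg build_type=release \
    -t tgi-trtllm:cuda12.8 \
    .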