diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index d5c09998..46f5d079 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -7,6 +7,7 @@ include(FetchContent) include(ExternalProject) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) +option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located") set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located") @@ -32,6 +33,12 @@ target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/ target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm CUDA::nvml) target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) +if (${TGI_TRTLLM_BACKEND_BUILD_EXAMPLES}) + add_executable(tgi_trtllm_backend_example bin/example.cpp) + target_link_libraries(tgi_trtllm_backend_example PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tgi_trtllm_backend_impl) + target_link_libraries(tgi_trtllm_backend_example PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) +endif () + install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm spdlog) #### Unit Tests #### diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index c040c6ab..ee7fd7f2 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -1,35 +1,9 @@ +use cxx_build::CFG; use std::env; use std::path::PathBuf; -use cxx_build::CFG; - const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; -// fn build_tensort_llm>(tensorrt_llm_root_dir: P, is_debug: bool) -> PathBuf { -// let build_wheel_path = tensorrt_llm_root_dir -// .as_ref() -// .join("/scripts") -// .join("build_wheel.py"); -// -// let build_wheel_path_str = build_wheel_path.display().to_string(); -// let mut build_wheel_args = vec![ -// build_wheel_path_str.as_ref(), -// "--cpp_only", -// "--extra-cmake-vars BUILD_TESTS=OFF", -// "--extra-cmake-vars BUILD_BENCHMARKS=OFF", -// ]; -// -// if is_debug { -// build_wheel_args.push("--fast_build"); -// } -// -// let out = Command::new("python3") -// .args(build_wheel_args) -// .output() -// .expect("Failed to compile TensorRT-LLM"); -// PathBuf::new().join(tensorrt_llm_root_dir) -// } - fn main() { // Misc variables let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); @@ -39,12 +13,6 @@ fn main() { _ => false, }; - // Compile TensorRT-LLM (as of today, it cannot be compiled from CMake) - // let trtllm_path = build_tensort_llm( - // backend_path.join("build").join("_deps").join("trtllm-src"), - // is_debug, - // ); - // Build the backend implementation through CMake let backend_path = cmake::Config::new(".") .uses_cxx11() @@ -53,13 +21,30 @@ fn main() { true => "Debug", false => "Release", }) - .build_target("tgi_trtllm_backend_impl") + .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") .build(); + // Additional transitive CMake dependencies + let deps_folder = out_dir.join("build").join("_deps"); + + for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { + let dep_name = match build_profile.as_ref() { + "debug" => format!("{}d", dependency), + _ => String::from(dependency), + }; + println!("cargo:rustc-link-lib=static={}", dep_name); + } + // Build the FFI layer calling the backend above CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) + .include(deps_folder.join("fmt-src").join("include")) + .include(deps_folder.join("spdlog-src").join("include")) + .include(deps_folder.join("json-src").join("include")) + .include(deps_folder.join("trtllm-src").join("cpp").join("include")) + .include("/usr/local/cuda/include") + .include("/usr/local/tensorrt/include") .file("src/ffi.cpp") .std("c++20") .compile("tgi_trtllm_backend"); @@ -69,31 +54,55 @@ fn main() { println!("cargo:rerun-if-changed=lib/backend.cpp"); println!("cargo:rerun-if-changed=src/ffi.cpp"); - // Additional transitive CMake dependencies - for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { - let dep_folder = out_dir - .join("build") - .join("_deps") - .join(format!("{}-build", dependency)); - - let dep_name = match build_profile.as_ref() { - "debug" => format!("{}d", dependency), - _ => String::from(dependency), - }; - println!("cargo:warning={}", dep_folder.display()); - println!("cargo:rustc-link-search=native={}", dep_folder.display()); - println!("cargo:rustc-link-lib=static={}", dep_name); - } - // Emit linkage information // - tgi_trtllm_backend (i.e. FFI layer - src/ffi.cpp) - println!(r"cargo:rustc-link-search=native={}", backend_path.display()); - println!("cargo:rustc-link-lib=static=tgi_trtllm_backend"); + let trtllm_lib_path = deps_folder + .join("trtllm-src") + .join("cpp") + .join("tensorrt_llm"); - // - tgi_trtllm_backend_impl (i.e. C++ code base to run inference include/backend.h) + let trtllm_executor_linker_search_path = + trtllm_lib_path.join("executor").join("x86_64-linux-gnu"); + + // TRTLLM libtensorrt_llm_nvrtc_wrapper.so + let trtllm_nvrtc_linker_search_path = trtllm_lib_path + .join("kernels") + .join("decoderMaskedMultiheadAttention") + .join("decoderXQAImplJIT") + .join("nvrtcWrapper") + .join("x86_64-linux-gnu"); + + println!(r"cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + println!(r"cargo:rustc-link-search=native=/usr/local/cuda/lib64/stubs"); + println!(r"cargo:rustc-link-search=native=/usr/local/tensorrt/lib"); + println!(r"cargo:rustc-link-search=native={}", backend_path.display()); + // println!( + // r"cargo:rustc-link-search=native={}/build", + // backend_path.display() + // ); println!( - r"cargo:rustc-link-search=native={}/build", - backend_path.display() + r"cargo:rustc-link-search=native={}", + backend_path.join("lib").display() ); + println!( + r"cargo:rustc-link-search=native={}", + trtllm_executor_linker_search_path.display() + ); + println!( + r"cargo:rustc-link-search=native={}", + trtllm_nvrtc_linker_search_path.display() + ); + println!("cargo:rustc-link-lib=dylib=cuda"); + println!("cargo:rustc-link-lib=dylib=cudart"); + println!("cargo:rustc-link-lib=dylib=cublas"); + println!("cargo:rustc-link-lib=dylib=cublasLt"); + println!("cargo:rustc-link-lib=dylib=mpi"); + println!("cargo:rustc-link-lib=dylib=nvidia-ml"); + println!("cargo:rustc-link-lib=dylib=nvinfer"); + println!("cargo:rustc-link-lib=dylib=nvinfer_plugin_tensorrt_llm"); + println!("cargo:rustc-link-lib=dylib=tensorrt_llm_nvrtc_wrapper"); + println!("cargo:rustc-link-lib=static=tensorrt_llm_executor_static"); + println!("cargo:rustc-link-lib=dylib=tensorrt_llm"); println!("cargo:rustc-link-lib=static=tgi_trtllm_backend_impl"); + println!("cargo:rustc-link-lib=static=tgi_trtllm_backend"); }