From f57f2a4521f2040bc4088bd079f53033c8c28b87 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Wed, 3 Jul 2024 21:12:24 +0000
Subject: [PATCH] First version loading engines and making it ready for
 inference

---
 backends/trtllm/CMakeLists.txt       | 34 +++++++-----
 backends/trtllm/cmake/json.cmake     |  5 ++
 backends/trtllm/cmake/trtllm.cmake   | 12 ----
 backends/trtllm/include/backend.h    | 15 ++++-
 backends/trtllm/lib/backend.cpp      | 82 ++++++++++++++++++----------
 backends/trtllm/src/ffi.cpp          |  6 +-
 backends/trtllm/src/lib.rs           | 16 +++++-
 backends/trtllm/tests/infer_test.cpp |  7 ++-
 8 files changed, 114 insertions(+), 63 deletions(-)
 create mode 100644 backends/trtllm/cmake/json.cmake

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 77eb1ad0..ff0cb766 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -7,11 +7,27 @@ include(FetchContent)
 include(ExternalProject)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
 set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
 
+#### External dependencies ####
+include(cmake/json.cmake)
+include(cmake/spdlog.cmake)
+include(cmake/trtllm.cmake)
+
+# TGI TRTLLM Backend definition
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+target_include_directories(tgi_trtllm_backend_impl PRIVATE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include>
+)
+include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+
+
 #### Unit Tests ####
 if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
     message(STATUS "Building tests")
@@ -23,22 +39,10 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
     FetchContent_MakeAvailable(Catch2)
 
     add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
-    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog)
 
     list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
     include(CTest)
     include(Catch)
     catch_discover_tests(tgi_trtllm_backend_tests)
-endif ()
-
-#### External dependencies ####
-include(cmake/spdlog.cmake)
-include(cmake/trtllm.cmake)
-
-# TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
-target_include_directories(tgi_trtllm_backend_impl PRIVATE
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-        $<INSTALL_INTERFACE:include>
-)
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
\ No newline at end of file
+endif ()
\ No newline at end of file
diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake
new file mode 100644
index 00000000..a6a53589
--- /dev/null
+++ b/backends/trtllm/cmake/json.cmake
@@ -0,0 +1,5 @@
+fetchcontent_declare(
+        json
+        URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+)
+fetchcontent_makeavailable(json)
\ No newline at end of file
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
index 1003e88e..e965fb3e 100644
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -9,18 +9,6 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 
-#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
-#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
-#else ()
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
-#    endif ()
-#
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
-#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
-#    endif ()
-#endif ()
-
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index de4409f3..0703e8cc 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -8,18 +8,30 @@
 #include
 #include
+#include <fstream>
+#include <nlohmann/json.hpp>
+
 #include
 #include
+#include <tensorrt_llm/plugins/api/tllmPlugin.h>
 
+using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
 
 namespace huggingface::tgi::backends {
+
+    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
+
     class TensorRtLlmBackend {
     private:
+        const json config;
         tle::Executor executor;
 
     public:
-        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(
+                const std::filesystem::path &engineFolder,
+                const std::filesystem::path &executorWorker
+        );
 
         /***
          * Indicate if the backend is ready to accept incoming request
@@ -58,4 +70,5 @@ namespace huggingface::tgi::backends {
     };
 }
 
+
 #endif //TGI_TRTLLM_BACKEND_H
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index dcc128a1..e7a5b969 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -3,9 +3,31 @@
 
 #include "backend.h"
 
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
-        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
-    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
+    tle::ExecutorConfig execConfig(
+            config["/build_config/max_beam_width"_json_pointer].get<uint32_t>()
+    );
+
+    execConfig.setParallelConfig(tle::ParallelConfig(
+            tle::CommunicationType::kMPI,
+            tle::CommunicationMode::kORCHESTRATOR,
+            std::nullopt,
+            std::nullopt,
+            tle::OrchestratorConfig(true, workerPath)
+    ));
+
+    return execConfig;
+}
+
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
+        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &executorWorker
+):
+    config(json::parse(std::ifstream(engineFolder / "config.json"))),
+    executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+{
+    initTrtLlmPlugins();
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
 }
 
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
@@ -20,32 +42,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const std::optional seed,
         const std::optional nTopTokens
 ) {
-    if (IsReady()) {
-        spdlog::debug(
-                "Submitting inference over {:d} tokens to the executor {:d}",
-                tokens.size(),
-                executor.getLatestIterationStats().back().numActiveRequests
-        );
-
-        const auto sampling = tle::SamplingConfig{
-                1,
-                topK,
-                topP,
-                std::nullopt,
-                std::nullopt,
-                std::nullopt,
-                seed,
-                temperature,
-                minLength,
-                std::nullopt,
-                repetitionPenalty.value_or(0.0),
-                std::nullopt,
-                frequencePenalty.value_or(1.0),
-        };
-        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-
-        return executor.enqueueRequest(request);
-    }
+//    if (IsReady()) {
+//        spdlog::debug(
+//                "Submitting inference over {:d} tokens to the executor {:d}",
+//                tokens.size(),
+//                executor.getLatestIterationStats().back().numActiveRequests
+//        );
+//
+//        const auto sampling = tle::SamplingConfig{
+//                1,
+//                topK,
+//                topP,
+//                std::nullopt,
+//                std::nullopt,
+//                std::nullopt,
+//                seed,
+//                temperature,
+//                minLength,
+//                std::nullopt,
+//                repetitionPenalty.value_or(0.0),
+//                std::nullopt,
+//                frequencePenalty.value_or(1.0),
+//        };
+//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+//
+//        return executor.enqueueRequest(request);
+//    }
     return 0;
 }
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 0e68c71f..142d3a50 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -12,9 +12,9 @@ namespace huggingface::tgi::backends {
      * @param engineFolder
      * @return
      */
-    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
+    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
         const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
+        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
+        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath), std::move(executorPath));
     }
-
 }
\ No newline at end of file
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b92fc159..7f7d5442 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -10,7 +10,21 @@ mod ffi {
 
         type TensorRtLlmBackend;
 
-        fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+        /// Create an instance backed by an std::unique_ptr to manage the lifespan of the backend
+        ///
+        /// # Arguments
+        ///
+        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
+        /// * `executor_worker`: Path to the TRTLLM executor worker
+        ///
+        /// returns:
+        ///
+        /// # Examples
+        ///
+        /// ```
+        ///
+        /// ```
+        fn create_trtllm_backend(engine_folder: &str, executor_worker: &str) -> UniquePtr<TensorRtLlmBackend>;
 
         #[rust_name = "is_ready"]
         fn IsReady(&self) -> bool;
diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp
index d59d0466..35167766 100644
--- a/backends/trtllm/tests/infer_test.cpp
+++ b/backends/trtllm/tests/infer_test.cpp
@@ -2,8 +2,13 @@
 // Created by mfuntowicz on 7/2/24.
 //
 #include <catch2/catch_all.hpp>
+#include <spdlog/spdlog.h>
 
 #include "../include/backend.h"
 
 TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
-    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
+    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
+
+    spdlog::info("Loading config from: {}", absolute(engines).string());
+    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
 }
\ No newline at end of file
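
For reviewers who want to exercise the updated binding from the Rust side, here is a minimal usage sketch (not part of the patch) of the `create_trtllm_backend` bridge declared in `backends/trtllm/src/lib.rs`. It assumes the code lives inside the same crate, so the `ffi` module is in scope, and both paths are placeholders for a locally built engine folder and the TensorRT-LLM `executorWorker` binary; neither path ships with this change.

```rust
// Hypothetical call site inside the trtllm backend crate; the two paths below
// are placeholders used only for illustration.
fn load_backend_sketch() {
    // The bridge now requires the engine folder *and* the executorWorker path.
    let backend = ffi::create_trtllm_backend(
        "/data/trtllm/llama3-8b-instruct/engines", // folder holding the engines and config.json
        "/opt/tensorrt_llm/bin/executorWorker",    // orchestrator worker spawned by the executor
    );

    // IsReady() is re-exported to Rust as is_ready() through the #[rust_name] attribute.
    if backend.is_ready() {
        println!("TensorRT-LLM engines loaded and executor ready");
    }
}
```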