mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-10-08 22:45:23 +00:00)

First version loading engines and making it ready for inference

commit f57f2a4521 (parent f8a1463915)
@@ -7,11 +7,27 @@ include(FetchContent)
include(ExternalProject)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")

#### External dependencies ####
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
target_include_directories(tgi_trtllm_backend_impl PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:include>
)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)

#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    message(STATUS "Building tests")
@@ -23,22 +39,10 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    FetchContent_MakeAvailable(Catch2)

    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog)

    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
    include(CTest)
    include(Catch)
    catch_discover_tests(tgi_trtllm_backend_tests)
endif ()

#### External dependencies ####
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
target_include_directories(tgi_trtllm_backend_impl PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:include>
)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
endif ()
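In the hunks above, catch_discover_tests(tgi_trtllm_backend_tests) registers every Catch2 TEST_CASE linked into the test binary as an individual CTest entry. A minimal sketch of such a case, hypothetical and separate from the real tests/infer_test.cpp shown further down:

#include <catch2/catch_all.hpp>

// Any TEST_CASE linked into tgi_trtllm_backend_tests is discovered by
// catch_discover_tests and exposed to CTest as its own test.
TEST_CASE("trtllm backend smoke", "[trtllm][smoke]") {
    REQUIRE(1 + 1 == 2);
}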
backends/trtllm/cmake/json.cmake (new file, 5 lines)
@@ -0,0 +1,5 @@
fetchcontent_declare(
        json
        URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
)
fetchcontent_makeavailable(json)
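json.cmake pulls in nlohmann/json via FetchContent; backend.cpp then uses it to read the engine's config.json through JSON pointers. A minimal sketch of that access pattern follows; the config.json path and field layout are assumptions based on the fields read later in backend.cpp:

#include <cstdint>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;

    // Hypothetical location of a TRT-LLM engine's config.json.
    std::ifstream file("engines/config.json");
    const json config = json::parse(file);

    // A JSON pointer walks nested objects in a single lookup, exactly like the
    // "/build_config/max_beam_width"_json_pointer access in GetExecutorConfig.
    const auto maxBeamWidth = config["/build_config/max_beam_width"_json_pointer].get<int32_t>();
    const auto version = config["version"].get<std::string>();

    std::cout << "engine version=" << version << ", max_beam_width=" << maxBeamWidth << std::endl;
    return 0;
}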
@@ -9,18 +9,6 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})

#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
#else ()
#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
#    endif ()
#
#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
#    endif ()
#endif ()

message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")

if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@@ -8,18 +8,30 @@
#include <filesystem>
#include <span>

#include <nlohmann/json.hpp>
#include <fmt/format.h>

#include <tensorrt_llm/runtime/common.h>
#include <tensorrt_llm/executor/executor.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>

using json = nlohmann::json;
namespace tle = tensorrt_llm::executor;

namespace huggingface::tgi::backends {

    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

    class TensorRtLlmBackend {
    private:
        const json config;
        tle::Executor executor;

    public:
        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
        explicit TensorRtLlmBackend(
                const std::filesystem::path &engineFolder,
                const std::filesystem::path &executorWorker
        );

        /***
         * Indicate if the backend is ready to accept incoming request
@@ -58,4 +70,5 @@ namespace huggingface::tgi::backends {
    };
}

#endif //TGI_TRTLLM_BACKEND_H
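A minimal sketch, not part of the commit, of how the public surface declared above is meant to be driven: construct the backend with the engine folder and the executor worker binary, then gate submissions on IsReady(). Both paths and the flat "backend.h" include are hypothetical placeholders:

#include <filesystem>
#include "backend.h"

int main() {
    // Hypothetical paths: a folder of compiled TRT-LLM engines (with its config.json)
    // and the executorWorker binary shipped with TensorRT-LLM.
    const std::filesystem::path engines = "/data/trtllm/engines";
    const std::filesystem::path worker = "/opt/tensorrt_llm/bin/executorWorker";

    // The two-argument constructor parses <engines>/config.json, builds the executor
    // configuration via GetExecutorConfig and initializes the TRT-LLM plugins.
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, worker);

    if (backend.IsReady()) {
        // Requests would be enqueued here through Submit(...).
    }
    return 0;
}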
@@ -3,9 +3,31 @@

#include "backend.h"

huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);

tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(
            config["/build_config/max_beam_width"_json_pointer].get<int32_t>()
    );

    execConfig.setParallelConfig(tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kORCHESTRATOR,
            std::nullopt,
            std::nullopt,
            tle::OrchestratorConfig(true, workerPath)
    ));

    return execConfig;
}

huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &engineFolder,
        const std::filesystem::path &executorWorker
):
        config(json::parse(std::ifstream(engineFolder / "config.json"))),
        executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
{
    initTrtLlmPlugins();
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
}

tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
@@ -20,32 +42,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::optional<uint32_t> seed,
        const std::optional<uint32_t> nTopTokens
) {
    if (IsReady()) {
        spdlog::debug(
                "Submitting inference over {:d} tokens to the executor {:d}",
                tokens.size(),
                executor.getLatestIterationStats().back().numActiveRequests
        );

        const auto sampling = tle::SamplingConfig{
                1,
                topK,
                topP,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                seed,
                temperature,
                minLength,
                std::nullopt,
                repetitionPenalty.value_or(0.0),
                std::nullopt,
                frequencePenalty.value_or(1.0),
        };
        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};

        return executor.enqueueRequest(request);
    }
//    if (IsReady()) {
//        spdlog::debug(
//                "Submitting inference over {:d} tokens to the executor {:d}",
//                tokens.size(),
//                executor.getLatestIterationStats().back().numActiveRequests
//        );
//
//        const auto sampling = tle::SamplingConfig{
//                1,
//                topK,
//                topP,
//                std::nullopt,
//                std::nullopt,
//                std::nullopt,
//                seed,
//                temperature,
//                minLength,
//                std::nullopt,
//                repetitionPenalty.value_or(0.0),
//                std::nullopt,
//                frequencePenalty.value_or(1.0),
//        };
//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
//
//        return executor.enqueueRequest(request);
//    }
    return 0;
}
@@ -12,9 +12,9 @@ namespace huggingface::tgi::backends {
     * @param engineFolder
     * @return
     */
    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
        const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath), std::move(executorPath));
    }
}
@@ -10,7 +10,21 @@ mod ffi {

        type TensorRtLlmBackend;

        fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
        /// Create an instance backed behind an std::unique_ptr to manage the lifespan of the backend
        ///
        /// # Arguments
        ///
        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
        /// * `executor_worker`: Path to the TRTLLM executor worker
        ///
        /// returns: <unknown>
        ///
        /// # Examples
        ///
        /// ```
        ///
        /// ```
        fn create_trtllm_backend(engine_folder: &str, executor_worker: &str) -> UniquePtr<TensorRtLlmBackend>;

        #[rust_name = "is_ready"]
        fn IsReady(&self) -> bool;
@@ -2,8 +2,13 @@
// Created by mfuntowicz on 7/2/24.
//
#include <catch2/catch_all.hpp>
#include <spdlog/spdlog.h>
#include "../include/backend.h"

TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");

    spdlog::info("Loading config from: {}", absolute(engines).string());
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
}
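The paths in the test above are hard-coded to the author's machine. A sketch of the same check with the locations supplied through environment variables instead; the variable names TGI_TRTLLM_TEST_ENGINES and TGI_TRTLLM_TEST_EXECUTOR_WORKER are hypothetical, not part of the commit:

#include <cstdlib>
#include <filesystem>
#include <catch2/catch_all.hpp>
#include <spdlog/spdlog.h>
#include "../include/backend.h"

TEST_CASE("Load TRTLLM Engine on the TGI Backend (env-provided paths)", "[trtllm][engine][load]") {
    // Hypothetical environment variables pointing at the engines folder and the executor worker.
    const char *engines = std::getenv("TGI_TRTLLM_TEST_ENGINES");
    const char *worker = std::getenv("TGI_TRTLLM_TEST_EXECUTOR_WORKER");
    if (engines == nullptr || worker == nullptr) {
        SUCCEED("TGI_TRTLLM_TEST_ENGINES / TGI_TRTLLM_TEST_EXECUTOR_WORKER not set, nothing to load");
        return;
    }

    spdlog::info("Loading engines from: {}", engines);
    huggingface::tgi::backends::TensorRtLlmBackend backend(
            std::filesystem::path(engines),
            std::filesystem::path(worker)
    );
}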