First version loading engines and making it ready for inference

2025-10-09 15:05:24 +00:00 · 2024-07-03 21:12:24 +00:00 · 2024-07-03 21:12:24 +00:00 · f57f2a4521
commit f57f2a4521
parent f8a1463915
8 changed files with 114 additions and 63 deletions
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@ -7,11 +7,27 @@ include(FetchContent)
 include(ExternalProject)
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
 set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
 #### External dependencies ####
 include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 # TGI TRTLLM Backend definition
 # Static library wrapping the TensorRT-LLM executor behind the TGI backend API.
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:include>
 )
 # TensorRT headers are attached to the backend target rather than to the whole directory,
 # so unrelated targets declared later do not silently pick them up.
 # PUBLIC because targets linking against the backend also need to see these headers.
 target_include_directories(tgi_trtllm_backend_impl PUBLIC "${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
 target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
 #### Unit Tests ####
 if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    message(STATUS "Building tests")
@ -23,22 +39,10 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
    FetchContent_MakeAvailable(Catch2)
    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
-    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog)
    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
    include(CTest)
    include(Catch)
    catch_discover_tests(tgi_trtllm_backend_tests)
 endif ()
 #### External dependencies ####
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:include>
 )
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
--- a/backends/trtllm/cmake/json.cmake
+++ b/backends/trtllm/cmake/json.cmake
@ -0,0 +1,5 @@
 # Declare the nlohmann/json v3.11.3 release archive and make it available to the build.
 FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
 FetchContent_MakeAvailable(json)
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@ -9,18 +9,6 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 #if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
 #    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
 #else ()
 #    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 #        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
 #    endif ()
 #
 #    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 #        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
 #    endif ()
 #endif ()
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -8,18 +8,30 @@
 #include <filesystem>
 #include <span>
 #include <nlohmann/json.hpp>
 #include <fmt/format.h>
 #include <tensorrt_llm/runtime/common.h>
 #include <tensorrt_llm/executor/executor.h>
 #include <tensorrt_llm/plugins/api/tllmPlugin.h>
 using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
    /***
     * Build the executor configuration handed to the TensorRT-LLM executor
     * @param config Parsed engine configuration; `/build_config/max_beam_width` is read from it
     * @param workerPath Path to the executor worker, forwarded to the orchestrator configuration
     * @return Executor configuration using MPI communication in orchestrator mode
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
    class TensorRtLlmBackend {
    private:
        const json config;
        tle::Executor executor;
    public:
-        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(
                const std::filesystem::path &engineFolder,
                const std::filesystem::path &executorWorker
        );
        /***
         * Indicate if the backend is ready to accept incoming request
@ -58,4 +70,5 @@ namespace huggingface::tgi::backends {
    };
 }
 #endif //TGI_TRTLLM_BACKEND_H
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -3,9 +3,31 @@
 #include "backend.h"
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
+    tle::ExecutorConfig execConfig(
-    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
+            config["/build_config/max_beam_width"_json_pointer].get<int32_t>()
    );
    execConfig.setParallelConfig(tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kORCHESTRATOR,
            std::nullopt,
            std::nullopt,
            tle::OrchestratorConfig(true, workerPath)
    ));
    return execConfig;
 }
 /***
  * Create the backend: parse the engine configuration, create the executor, then register the plugins
  * @param engineFolder Folder holding the engines and their `config.json`
  * @param executorWorker Path to the executor worker, passed on to GetExecutorConfig as a string
  */
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &engineFolder,
        const std::filesystem::path &executorWorker
 ):
    // Parsed from <engineFolder>/config.json; read again below for the executor config and the log line
    config(json::parse(std::ifstream(engineFolder / "config.json"))),
    // `config` is declared before `executor` in the class, so it is initialized by the time it is read here
    executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
 {
    // NOTE(review): this runs after `executor` was constructed in the initializer list above;
    // confirm the plugins do not have to be registered before the engines are loaded
    initTrtLlmPlugins();
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
 }
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
@ -20,32 +42,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::optional<uint32_t> seed,
        const std::optional<uint32_t> nTopTokens
 ) {
-    if (IsReady()) {
+//    if (IsReady()) {
-        spdlog::debug(
+//        spdlog::debug(
-                "Submitting inference over {:d} tokens to the executor {:d}",
+//                "Submitting inference over {:d} tokens to the executor {:d}",
-                tokens.size(),
+//                tokens.size(),
-                executor.getLatestIterationStats().back().numActiveRequests
+//                executor.getLatestIterationStats().back().numActiveRequests
-        );
+//        );
-
+//
-        const auto sampling = tle::SamplingConfig{
+//        const auto sampling = tle::SamplingConfig{
-                1,
+//                1,
-                topK,
+//                topK,
-                topP,
+//                topP,
-                std::nullopt,
+//                std::nullopt,
-                std::nullopt,
+//                std::nullopt,
-                std::nullopt,
+//                std::nullopt,
-                seed,
+//                seed,
-                temperature,
+//                temperature,
-                minLength,
+//                minLength,
-                std::nullopt,
+//                std::nullopt,
-                repetitionPenalty.value_or(0.0),
+//                repetitionPenalty.value_or(0.0),
-                std::nullopt,
+//                std::nullopt,
-                frequencePenalty.value_or(1.0),
+//                frequencePenalty.value_or(1.0),
-        };
+//        };
-        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-
+//
-        return executor.enqueueRequest(request);
+//        return executor.enqueueRequest(request);
-    }
+//    }
    return 0;
 }
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@ -12,9 +12,9 @@ namespace huggingface::tgi::backends {
    * @param engineFolder
    * @return
    */
-    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
+    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
        const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
+        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath), std::move(executorPath));
    }
 }
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@ -10,7 +10,21 @@ mod ffi {
        type TensorRtLlmBackend;
-        fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+        /// Create a backend instance held behind a `UniquePtr` (C++ `std::unique_ptr`) that manages its lifespan
        ///
        /// # Arguments
        ///
        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
        /// * `executor_worker`: Path to the TRTLLM executor worker
        ///
        /// # Returns
        ///
        /// A `UniquePtr<TensorRtLlmBackend>` holding the created backend
        fn create_trtllm_backend(engine_folder: &str, executor_worker: &str) -> UniquePtr<TensorRtLlmBackend>;
        #[rust_name = "is_ready"]
        fn IsReady(&self) -> bool;
--- a/backends/trtllm/tests/infer_test.cpp
+++ b/backends/trtllm/tests/infer_test.cpp
@ -2,8 +2,13 @@
 // Created by mfuntowicz on 7/2/24.
 //
 #include <catch2/catch_all.hpp>
 #include <spdlog/spdlog.h>
 #include "../include/backend.h"
 // Constructs a backend from an engines folder and an executor worker path; there are no assertions.
 TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
    // NOTE(review): both paths below are absolute paths under /home/mfuntowicz, so this test
    // depends on that exact filesystem layout — consider making them configurable
-    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
    spdlog::info("Loading config from: {}", absolute(engines).string());
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
 }