From f57f2a4521f2040bc4088bd079f53033c8c28b87 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Wed, 3 Jul 2024 21:12:24 +0000
Subject: [PATCH] First version loading engines and making it ready for
 inference

---
 backends/trtllm/CMakeLists.txt       | 34 +++++++-----
 backends/trtllm/cmake/json.cmake     |  5 ++
 backends/trtllm/cmake/trtllm.cmake   | 12 ----
 backends/trtllm/include/backend.h    | 15 ++++-
 backends/trtllm/lib/backend.cpp      | 82 ++++++++++++++++++----------
 backends/trtllm/src/ffi.cpp          |  6 +-
 backends/trtllm/src/lib.rs           | 16 +++++-
 backends/trtllm/tests/infer_test.cpp |  7 ++-
 8 files changed, 114 insertions(+), 63 deletions(-)
 create mode 100644 backends/trtllm/cmake/json.cmake

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 77eb1ad0..ff0cb766 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -7,11 +7,27 @@ include(FetchContent)
 include(ExternalProject)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
 set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
 
+#### External dependencies ####
+include(cmake/json.cmake)
+include(cmake/spdlog.cmake)
+include(cmake/trtllm.cmake)
+
+# TGI TRTLLM Backend definition
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+target_include_directories(tgi_trtllm_backend_impl PRIVATE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include>
+)
+include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+
+
 #### Unit Tests ####
 if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
     message(STATUS "Building tests")
@@ -23,22 +39,10 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
     FetchContent_MakeAvailable(Catch2)
 
     add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
-    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog)
 
     list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
     include(CTest)
     include(Catch)
     catch_discover_tests(tgi_trtllm_backend_tests)
-endif ()
-
-#### External dependencies ####
-include(cmake/spdlog.cmake)
-include(cmake/trtllm.cmake)
-
-# TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
-target_include_directories(tgi_trtllm_backend_impl PRIVATE
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-        $<INSTALL_INTERFACE:include>
-)
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
\ No newline at end of file
+endif ()
\ No newline at end of file
diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake
new file mode 100644
index 00000000..a6a53589
--- /dev/null
+++ b/backends/trtllm/cmake/json.cmake
@@ -0,0 +1,5 @@
+fetchcontent_declare(
+        json
+        URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+)
+fetchcontent_makeavailable(json)
\ No newline at end of file
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
index 1003e88e..e965fb3e 100644
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -9,18 +9,6 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 
-#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
-#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
-#else ()
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
-#    endif ()
-#
-#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
-#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
-#    endif ()
-#endif ()
-
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index de4409f3..0703e8cc 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -8,18 +8,30 @@
 #include
 #include
+#include <fstream>
+#include <nlohmann/json.hpp>
+
 #include
 #include
+#include <tensorrt_llm/plugins/api/tllmPlugin.h>
 
+using json = nlohmann::json;
 namespace tle = tensorrt_llm::executor;
 
 namespace huggingface::tgi::backends {
+
+    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
+
     class TensorRtLlmBackend {
     private:
+        const json config;
         tle::Executor executor;
 
     public:
-        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(
+                const std::filesystem::path &engineFolder,
+                const std::filesystem::path &executorWorker
+        );
 
         /***
          * Indicate if the backend is ready to accept incoming request
@@ -58,4 +70,5 @@ namespace huggingface::tgi::backends {
     };
 }
 
+
 #endif //TGI_TRTLLM_BACKEND_H
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index dcc128a1..e7a5b969 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -3,9 +3,31 @@
 
 #include "backend.h"
 
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
-        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
-    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
+    tle::ExecutorConfig execConfig(
+            config["/build_config/max_beam_width"_json_pointer].get<uint32_t>()
+    );
+
+    execConfig.setParallelConfig(tle::ParallelConfig(
+            tle::CommunicationType::kMPI,
+            tle::CommunicationMode::kORCHESTRATOR,
+            std::nullopt,
+            std::nullopt,
+            tle::OrchestratorConfig(true, workerPath)
+    ));
+
+    return execConfig;
+}
+
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
+        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &executorWorker
+):
+    config(json::parse(std::ifstream(engineFolder / "config.json"))),
+    executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+{
+    initTrtLlmPlugins();
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
 }
 
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
@@ -20,32 +42,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const std::optional seed,
         const std::optional nTopTokens
 ) {
-    if (IsReady()) {
-        spdlog::debug(
-                "Submitting inference over {:d} tokens to the executor {:d}",
-                tokens.size(),
-                executor.getLatestIterationStats().back().numActiveRequests
-        );
-
-        const auto sampling = tle::SamplingConfig{
-                1,
-                topK,
-                topP,
-                std::nullopt,
-                std::nullopt,
-                std::nullopt,
-                seed,
-                temperature,
-                minLength,
-                std::nullopt,
-                repetitionPenalty.value_or(0.0),
-                std::nullopt,
-                frequencePenalty.value_or(1.0),
-        };
-        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-
-        return executor.enqueueRequest(request);
-    }
+//    if (IsReady()) {
+//        spdlog::debug(
+//                "Submitting inference over {:d} tokens to the executor {:d}",
+//                tokens.size(),
+//                executor.getLatestIterationStats().back().numActiveRequests
+//        );
+//
+//        const auto sampling = tle::SamplingConfig{
+//                1,
+//                topK,
+//                topP,
+//                std::nullopt,
+//                std::nullopt,
+//                std::nullopt,
+//                seed,
+//                temperature,
+//                minLength,
+//                std::nullopt,
+//                repetitionPenalty.value_or(0.0),
+//                std::nullopt,
+//                frequencePenalty.value_or(1.0),
+//        };
+//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+//
+//        return executor.enqueueRequest(request);
+//    }
     return 0;
 }
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 0e68c71f..142d3a50 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -12,9 +12,9 @@ namespace huggingface::tgi::backends {
      * @param engineFolder
      * @return
      */
-    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
+    std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
         const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
+        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
+        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath), std::move(executorPath));
     }
-
 }
\ No newline at end of file
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b92fc159..7f7d5442 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -10,7 +10,21 @@ mod ffi {
 
         type TensorRtLlmBackend;
 
-        fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+        /// Create an instance backed by an std::unique_ptr to manage the lifespan of the backend
+        ///
+        /// # Arguments
+        ///
+        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
+        /// * `executor_worker`: Path to the TRTLLM executor worker
+        ///
+        /// returns:
+        ///
+        /// # Examples
+        ///
+        /// ```
+        ///
+        /// ```
+        fn create_trtllm_backend(engine_folder: &str, executor_worker: &str) -> UniquePtr<TensorRtLlmBackend>;
 
         #[rust_name = "is_ready"]
         fn IsReady(&self) -> bool;
diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp
index d59d0466..35167766 100644
--- a/backends/trtllm/tests/infer_test.cpp
+++ b/backends/trtllm/tests/infer_test.cpp
@@ -2,8 +2,13 @@
 // Created by mfuntowicz on 7/2/24.
 //
 #include <catch2/catch_all.hpp>
+#include <spdlog/spdlog.h>
 
 #include "../include/backend.h"
 
 TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
-    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
+    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");
+
+    spdlog::info("Loading config from: {}", absolute(engines).string());
+    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
 }
\ No newline at end of file
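
For reviewers who want to exercise the updated binding from the Rust side, here is a minimal usage sketch (not part of the patch) of the `create_trtllm_backend` bridge declared in `backends/trtllm/src/lib.rs`. It assumes the code lives inside the same crate, so the `ffi` module is in scope, and both paths are placeholders for a locally built engine folder and the TensorRT-LLM `executorWorker` binary; neither path ships with this change.

```rust
// Hypothetical call site inside the trtllm backend crate; the two paths below
// are placeholders used only for illustration.
fn load_backend_sketch() {
    // The bridge now requires the engine folder *and* the executorWorker path.
    let backend = ffi::create_trtllm_backend(
        "/data/trtllm/llama3-8b-instruct/engines", // folder holding the engines and config.json
        "/opt/tensorrt_llm/bin/executorWorker",    // orchestrator worker spawned by the executor
    );

    // IsReady() is re-exported to Rust as is_ready() through the #[rust_name] attribute.
    if backend.is_ready() {
        println!("TensorRT-LLM engines loaded and executor ready");
    }
}
```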