From 25c6bbe142f668ac86e05c0c00db3135c7c7a1bb Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Sun, 1 Dec 2024 00:35:04 +0100
Subject: [PATCH] feat(backend): introduce backend_workspace_t to store
 precomputed information from the engine folder

---
 backends/trtllm/csrc/backend.cpp | 40 +++++++++++++-
 backends/trtllm/csrc/backend.hpp | 94 +++++++++++++++++++++++++++++++-
 backends/trtllm/csrc/ffi.hpp     | 10 +++-
 3 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp
index 2b2e0239..bc3e33de 100644
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@@ -1,9 +1,47 @@
 #include

 #include "backend.hpp"
+#include
 #include

 namespace huggingface::tgi::backends::trtllm {
+    constexpr tle::ParallelConfig backend_workspace_t::parallel_config() const {
+        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
+        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
+
+        auto mode = tle::CommunicationMode::kLEADER;
+        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+        if (world_size > 1) {
+            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+            mode = tle::CommunicationMode::kORCHESTRATOR;
+            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+        } else {
+            SPDLOG_INFO("Detected single engine deployment, using leader mode");
+        }
+
+        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+    }
+
+    constexpr tle::ExecutorConfig backend_workspace_t::executor_config() const {
+        // Retrieve the compute capabilities to enable some options at runtime
+        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
+
+        // Allocate the config
+        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);
+
+        // Set the parallel config as inferred
+        executor_config.setParallelConfig(parallel_config());
+
+        // Define some configuration variables
+        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
+        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
+        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
+        return executor_config;
+    }
+
+    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
+        : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

     size_t backend_t::num_tokens_ready() const noexcept {
         return executor_.getNumResponsesReady();
@@ -22,7 +60,7 @@
                 std::nullopt,
                 std::nullopt,
                 std::nullopt,
-                stop_words_
+                workspace.generation_config().stop_words
         });
     }

diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp
index d17a344c..69724187 100644
--- a/backends/trtllm/csrc/backend.hpp
+++ b/backends/trtllm/csrc/backend.hpp
@@ -2,15 +2,19 @@
 #include
 #include
 #include
+#include

 #include
 #include
+#include

 #include
 #include
+#include
+
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
-
+    using json = nlohmann::json;

     using request_id_t = uint32_t;
     using token_id_t = tle::TokenIdType;
@@ -33,7 +37,7 @@ namespace huggingface::tgi::backends::trtllm {
         float_t temperature;
         uint64_t seed;

-        explicit operator tle::SamplingConfig() const {
+        constexpr explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig {
                1,
                top_k,
@@ -53,6 +57,79 @@ namespace huggingface::tgi::backends::trtllm {
         }
     };

+    /**
+     * Generation parameters sourced from the model's generation_config.json (top_p, temperature, stop words)
+     */
+    struct generation_config_t {
+        float_t top_p;
+        float_t temperature;
+        std::list<std::vector<int32_t>> stop_words;
+
+        explicit generation_config_t(const json &config):
+            top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
+            if(config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
+                const auto& eos_token_id = config["/eos_token_id"_json_pointer];
+                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) {
+                    stop_words.push_back({token_id});
+                });
+            }
+        }
+    };
+
+    /**
+     * Materializes the content of a TensorRT-LLM engine folder: engine location, executor worker path
+     * and the parsed engine config / generation config
+     */
+    class backend_workspace_t {
+    private:
+        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
+            std::ifstream config_f(path);
+            return json::parse(config_f);
+        };
+
+        std::filesystem::path engines_folder_;
+        std::filesystem::path executor_worker_path_;
+        json config_;
+        generation_config_t generation_config_;
+
+    public:
+        backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        /**
+         * Path to the folder containing the TensorRT-LLM engines
+         * @return local filesystem path to the folder
+         */
+        [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; }
+
+        /**
+         * Generation parameters (top_p, temperature, stop words) parsed from generation_config.json
+         * @return const reference to the parsed generation_config_t
+         */
+        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }
+
+        /**
+         * Parallelism settings (leader vs orchestrator mode) inferred from the engine's world size
+         * @return tle::ParallelConfig to forward to the executor
+         */
+        [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const;
+
+        /**
+         * Executor settings (KV cache, chunked context, scheduler policy) derived from the workspace
+         * @return tle::ExecutorConfig used to instantiate the executor
+         */
+        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
+    };
+

     /**
      *
      */
@@ -63,10 +140,14 @@ namespace huggingface::tgi::backends::trtllm {
      */
     class backend_t {
     private:
+        backend_workspace_t workspace;
         tle::Executor executor_;
-        std::list<std::vector<int32_t>> stop_words_;

     public:
+        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
+        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
+            : backend_t(engines_folder, executor_worker_path) {};
+
         /**
          * Submit a new request to the executor
          * @param token_ids
@@ -98,6 +179,13 @@ namespace huggingface::tgi::backends::trtllm {
          */
         void cancel(request_id_t) noexcept;
     };
+
+    /**
+     * Create a TensorRT-LLM executor from a workspace
+     */
+    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
+        return { workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY, workspace.executor_config() };
+    };
 }

 template <> struct fmt::formatter: formatter {
diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index 311a7981..b964a064 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,4 +1,4 @@
-
+#include
 #include

 namespace rust::behavior {
@@ -13,6 +13,7 @@ namespace rust::behavior {
 #include
 #include
 #include
+#include

 namespace huggingface::tgi::backends::trtllm {
@@ -21,7 +22,8 @@ namespace huggingface::tgi::backends::trtllm {
         backend_t inner_;

     public:
-        tensorrt_llm_backend_t(std::filesystem::path &engine_folder): inner_(engine_folder) {}
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
+            : inner_(engine_folder, executor_worker_path) {}

         size_t num_tokens_ready() const noexcept {
             return inner_.num_tokens_ready();
@@ -61,4 +63,8 @@ namespace huggingface::tgi::backends::trtllm {
             inner_.cancel(requestId);
         }
     };
+
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+        return std::make_unique<tensorrt_llm_backend_t>(std::filesystem::path(std::string(engines_folder)), std::filesystem::path(std::string(executor_worker_path)));
+    }
 }
\ No newline at end of file
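
For context, a minimal usage sketch of the construction path this patch introduces (not part of the patch itself): backend_workspace_t reads config.json and generation_config.json from the engine folder, and executor_factory_initializer turns that workspace into a tle::Executor. Both filesystem paths below are purely illustrative.

// Sketch only: exercising the backend_t(engines_folder, executor_worker_path)
// constructor added by this patch. The paths are hypothetical examples.
#include <filesystem>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    // The workspace precomputes the engine config and generation config,
    // then the executor is created through executor_factory_initializer.
    backend_t backend(
            std::filesystem::path("/repository/engines/llama-3-8b-instruct"),
            std::filesystem::path("/usr/local/tgi/bin/executorWorker")
    );

    return backend.num_tokens_ready() == 0 ? 0 : 1;
}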
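Similarly, a small self-contained sketch (again not part of the patch, assuming a top-level eos_token_id array in the document) of what generation_config_t extracts from a minimal generation_config.json: each entry of the eos_token_id array becomes a single-token stop word that submit() forwards to the executor. The inline JSON document is a made-up example.

// Sketch only: expected behaviour of generation_config_t on a hand-written config.
#include <cassert>
#include <nlohmann/json.hpp>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    const auto config = nlohmann::json::parse(R"({
        "top_p": 0.9,
        "temperature": 0.6,
        "eos_token_id": [128001, 128009]
    })");

    const generation_config_t generation_config(config);
    assert(generation_config.stop_words.size() == 2);         // one stop word per eos token id
    assert(generation_config.stop_words.front().size() == 1); // each stop word is a single token
    return 0;
}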