From 7217cafadb277b936a97d8b685a215beee881b98 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 21 Oct 2024 23:38:42 +0200
Subject: [PATCH] chore(trtllm): create specific parallelconfig factory and
 logging init methods

---
 backends/trtllm/include/backend.h | 15 +++++++++++++++
 backends/trtllm/lib/backend.cpp   | 48 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 1793b2dd..cbfaacf1 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
     using TokenId = tle::TokenIdType;
 
     const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
     constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
             "Submitting inference [{}] to the executor ({:d} already in-flight)");
     constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@@ -36,6 +38,11 @@ namespace huggingface::tgi::backends {
      */
     void InitializeBackend();
 
+    /**
+     * Initialize the logging mechanism (honours the TRTLLM_LOG_LEVEL environment variable in release builds)
+     */
+    void InitializeLogging();
+
     /**
      *
      * @param config TensorRT-LLM configuration object
@@ -44,6 +51,14 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Create the parallel configuration matching the way the engine was deployed (single GPU or sharded)
+     * @param worldSize Number of ranks the TensorRT-LLM engine was built for
+     * @param workerPath Path to the worker binary used when running in orchestrator mode
+     * @return
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
     /**
      * Get the sampling configuration from the parameters provided by TGI
      * @param topK
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 2750b423..a9d37bc1 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -9,22 +9,12 @@
 #include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
+    InitializeLogging();
+
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
 
     SPDLOG_INFO("Backend Executor Version: {}", tle::version());
     const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
     if (numGpus.has_value()) {
@@ -34,6 +24,40 @@ void huggingface::tgi::backends::InitializeBackend() {
     }
 }
 
+void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+            return std::tolower(c);
+        });
+
+        if (log_level == "debug")
+            spdlog::set_level(spdlog::level::debug);
+        else
+            spdlog::set_level(spdlog::level::info);
+    }
+#else
+    spdlog::set_level(spdlog::level::debug);
+#endif
+}
+
+[[nodiscard]] tle::ParallelConfig
+huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]] tle::ExecutorConfig
 huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
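
The last hunk ends on the unchanged opening lines of GetExecutorConfig, so the patch itself does not show where the new GetParallelConfig factory gets consumed. A minimal sketch of the expected call site, assuming config is an nlohmann::json document exposing the engine's world size under pretrained_config.mapping.world_size and that tle::ExecutorConfig provides a setParallelConfig() setter (neither assumption is visible in this diff):

    // Hypothetical call site inside huggingface::tgi::backends::GetExecutorConfig -- not part of this patch.
    // Assumes `config` carries pretrained_config.mapping.world_size and that
    // tle::ExecutorConfig exposes a setParallelConfig() setter.
    const auto worldSize = config.at("pretrained_config").at("mapping").at("world_size").get<size_t>();
    execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));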