chore(trtllm): create specific parallelconfig factory and logging init methods

2025-10-19 20:05:24 +00:00 · 2024-10-21 23:38:42 +02:00 · 2024-10-21 23:38:42 +02:00 · 7217cafadb
commit 7217cafadb
parent 421a17544e
2 changed files with 49 additions and 12 deletions
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
    using TokenId = tle::TokenIdType;

    const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(  // args: detected GPU count, required GPU count (both {:d})
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
    constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
            "Submitting inference [{}] to the executor ({:d} already in-flight)");
    constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@ -36,6 +38,28 @@ namespace huggingface::tgi::backends {
     */
    void InitializeBackend();

+    /**
+     * Initialize logging: release builds (NDEBUG) honour TRTLLM_LOG_LEVEL ("debug", case-insensitive,
+     * enables debug; any other value selects info; unset leaves the level untouched), others use debug.
+     * Defined `inline` and unqualified: it lives in a header, inside the namespace it belongs to.
+     */
+    inline void InitializeLogging() {
+#ifdef NDEBUG
+        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+                return static_cast<char>(std::tolower(c));
+            });
+
+            // Only "debug" raises verbosity, everything else falls back to info
+            spdlog::set_level(log_level == "debug" ? spdlog::level::debug : spdlog::level::info);
+        }
+#else
+        spdlog::set_level(spdlog::level::debug);
+#endif
+    }
+
+
    /**
     *
     * @param config TensorRT-LLM configuration object
@ -44,6 +68,14 @@ namespace huggingface::tgi::backends {
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

+    /**
+     * Build the parallel configuration: leader mode by default, orchestrator mode for sharded engines
+     * @param worldSize World size of the deployment; a value greater than 1 selects orchestrator mode
+     * @param workerPath Path forwarded to the orchestrator configuration (only used when worldSize > 1)
+     * @return Parallel configuration using MPI as communication type
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
    /**
     * Get the sampling configuration from the parameters provided by TGI
     * @param topK
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -9,22 +9,12 @@
 #include "hardware.h"

 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
    SPDLOG_INFO("Initializing Backend...");
    nvmlInit_v2();
    initTrtLlmPlugins();

+    InitializeLogging();
+
    SPDLOG_INFO("Backend Executor Version: {}", tle::version());
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    if (numGpus.has_value()) {
@ -34,6 +24,21 @@ void huggingface::tgi::backends::InitializeBackend() {
    }
 }

+// Leader mode for a single engine; orchestrator mode (workerPath goes to the OrchestratorConfig) if sharded.
+[[nodiscard]] tle::ParallelConfig
+huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, std::string workerPath) noexcept {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);