Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 00:12:08 +00:00)
chore(trtllm): create specific parallelconfig factory and logging init methods
parent 421a17544e
commit 7217cafadb
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
     using TokenId = tle::TokenIdType;

     const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
     constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
             "Submitting inference [{}] to the executor ({:d} already in-flight)");
     constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@@ -36,6 +38,28 @@ namespace huggingface::tgi::backends {
      */
     void InitializeBackend();

+    /**
+     * Initialize logging mechanism
+     */
+    void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+                return std::tolower(c);
+            });
+
+            if (log_level == "debug")
+                spdlog::set_level(spdlog::level::debug);
+            else
+                spdlog::set_level(spdlog::level::info);
+        }
+#else
+        spdlog::set_level(spdlog::level::debug);
+#endif
+    }
+
+
     /**
      *
      * @param config TensorRT-LLM configuration object
@@ -44,6 +68,14 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

+    /**
+     *
+     * @param worldSize
+     * @param workerPath
+     * @return
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
     /**
      * Get the sampling configuration from the parameters provided by TGI
      * @param topK
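Note on the logging change above: TRTLLM_LOG_LEVEL is only consulted in release (NDEBUG) builds, while debug builds always force spdlog to the debug level. A minimal caller sketch, assuming the backend library and its backend.h header are available; the main function and the setenv call are illustrative, not part of this commit:

// Illustrative caller, not part of this commit: exercises the new logging path.
#include <cstdlib>      // setenv (POSIX)
#include "backend.h"    // huggingface::tgi::backends::InitializeBackend()

int main() {
    // Only honoured in release (NDEBUG) builds; debug builds force spdlog::level::debug.
    setenv("TRTLLM_LOG_LEVEL", "debug", /* overwrite = */ 1);

    // InitializeBackend() now delegates log-level setup to InitializeLogging()
    // (see the implementation hunks below).
    huggingface::tgi::backends::InitializeBackend();
    return 0;
}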
@@ -9,22 +9,12 @@
 #include "hardware.h"

 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();

+    InitializeLogging();
+
     SPDLOG_INFO("Backend Executor Version: {}", tle::version());
     const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
     if (numGpus.has_value()) {
@@ -34,6 +24,21 @@ void huggingface::tgi::backends::InitializeBackend() {
     }
 }

+[[nodiscard]] tle::ParallelConfig GetParallelConfig(const size_t worldSize, std::string workerPath) {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
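The call site for the new GetParallelConfig factory is outside this diff's context lines. A hedged sketch of how it would typically be consumed, assuming the standard tle::ExecutorConfig::setParallelConfig setter from the TensorRT-LLM executor API and a worldSize taken from the engine configuration; BuildExecutorConfig is an illustrative name, not code from this commit:

// Illustrative only: attach the ParallelConfig produced by the new factory to the
// ExecutorConfig before the executor is created.
#include <string>
#include "backend.h"   // tle alias, GetParallelConfig and GetExecutorConfig declarations

tle::ExecutorConfig BuildExecutorConfig(const size_t worldSize, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);

    // Leader mode for a single engine; orchestrator mode (spawning `workerPath`
    // executor workers over MPI) when the engine is sharded across more than one GPU.
    execConfig.setParallelConfig(huggingface::tgi::backends::GetParallelConfig(worldSize, workerPath));
    return execConfig;
}

Keeping the ParallelConfig construction in a dedicated, [[nodiscard]]-annotated factory keeps the leader/orchestrator decision in one place instead of inlining it into GetExecutorConfig.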