From 7217cafadb277b936a97d8b685a215beee881b98 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 21 Oct 2024 23:38:42 +0200
Subject: [PATCH] chore(trtllm): create specific parallelconfig factory and
 logging init methods

---
 backends/trtllm/include/backend.h | 15 +++++++++++++++
 backends/trtllm/lib/backend.cpp   | 48 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 1793b2dd..cbfaacf1 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
     using TokenId = tle::TokenIdType;
 
     const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
     constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
             "Submitting inference [{}] to the executor ({:d} already in-flight)");
     constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@@ -36,6 +38,11 @@ namespace huggingface::tgi::backends {
      */
     void InitializeBackend();
 
+    /**
+     * Initialize the logging mechanism (honours the TRTLLM_LOG_LEVEL environment variable in release builds)
+     */
+    void InitializeLogging();
+
     /**
      *
      * @param config TensorRT-LLM configuration object
@@ -44,6 +51,14 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Create the parallel configuration matching the way the engine was deployed (single GPU or sharded)
+     * @param worldSize Number of ranks the TensorRT-LLM engine was built for
+     * @param workerPath Path to the worker binary used when running in orchestrator mode
+     * @return
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
     /**
      * Get the sampling configuration from the parameters provided by TGI
      * @param topK
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 2750b423..a9d37bc1 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -9,22 +9,12 @@
 #include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
+    InitializeLogging();
+
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
 
     SPDLOG_INFO("Backend Executor Version: {}", tle::version());
     const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
     if (numGpus.has_value()) {
@@ -34,6 +24,40 @@ void huggingface::tgi::backends::InitializeBackend() {
     }
 }
 
+void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+            return std::tolower(c);
+        });
+
+        if (log_level == "debug")
+            spdlog::set_level(spdlog::level::debug);
+        else
+            spdlog::set_level(spdlog::level::info);
+    }
+#else
+    spdlog::set_level(spdlog::level::debug);
+#endif
+}
+
+[[nodiscard]] tle::ParallelConfig
+huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]] tle::ExecutorConfig
 huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
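
The last hunk ends on the unchanged opening lines of GetExecutorConfig, so the patch itself does not show where the new GetParallelConfig factory gets consumed. A minimal sketch of the expected call site, assuming config is an nlohmann::json document exposing the engine's world size under pretrained_config.mapping.world_size and that tle::ExecutorConfig provides a setParallelConfig() setter (neither assumption is visible in this diff):

    // Hypothetical call site inside huggingface::tgi::backends::GetExecutorConfig -- not part of this patch.
    // Assumes `config` carries pretrained_config.mapping.world_size and that
    // tle::ExecutorConfig exposes a setParallelConfig() setter.
    const auto worldSize = config.at("pretrained_config").at("mapping").at("world_size").get<size_t>();
    execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));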