chore(trtllm): create specific parallelconfig factory and logging init methods

Morgan Funtowicz 2024-10-21 23:38:42 +02:00
parent 421a17544e
commit 7217cafadb
2 changed files with 49 additions and 12 deletions

View File

@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
     using TokenId = tle::TokenIdType;

     const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
     constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
             "Submitting inference [{}] to the executor ({:d} already in-flight)");
     constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@@ -36,6 +38,28 @@ namespace huggingface::tgi::backends {
      */
     void InitializeBackend();

+    /**
+     * Initialize logging mechanism
+     */
+    void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+                return std::tolower(c);
+            });
+
+            if (log_level == "debug")
+                spdlog::set_level(spdlog::level::debug);
+            else
+                spdlog::set_level(spdlog::level::info);
+        }
+#else
+        spdlog::set_level(spdlog::level::debug);
+#endif
+    }
+
     /**
      *
      * @param config TensorRT-LLM configuration object
@@ -44,6 +68,14 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

+    /**
+     * Create the parallel configuration to pass to the executor
+     * @param worldSize Number of ranks (GPUs) the engine was built to run on
+     * @param workerPath Path to the executorWorker binary used in orchestrator mode
+     * @return Leader-mode config for a single engine, orchestrator-mode config for a sharded one
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
     /**
      * Get the sampling configuration from the parameters provided by TGI
      * @param topK

View File

@@ -9,22 +9,12 @@
 #include "hardware.h"

 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();

+    InitializeLogging();
+
     SPDLOG_INFO("Backend Executor Version: {}", tle::version());
     const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
     if (numGpus.has_value()) {
@@ -34,6 +24,21 @@ void huggingface::tgi::backends::InitializeBackend() {
     }
 }

+[[nodiscard]] tle::ParallelConfig huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, std::string workerPath) noexcept {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
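
GetParallelConfig passes the orchestrator settings positionally, so the intent of each argument is easy to miss. Below is a commented restatement of the two shapes it can return, assuming the TensorRT-LLM constructors ParallelConfig(commType, commMode, deviceIds, participantIds, orchestratorConfig) and OrchestratorConfig(isOrchestrator, workerExecutablePath, orchLeaderComm, spawnProcesses); the argument roles reflect the executor API as understood here, not something the diff spells out:

#include <optional>
#include <string>
#include <utility>

#include <tensorrt_llm/executor/executor.h>

namespace tle = tensorrt_llm::executor;

// Builds the two shapes GetParallelConfig can return, for illustration only.
std::pair<tle::ParallelConfig, tle::ParallelConfig> ExampleParallelConfigs(const std::string &workerPath) {
    // worldSize == 1: leader mode, the executor runs in-process, no orchestrator involved.
    auto leader = tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kLEADER,
            std::nullopt,   // deviceIds: let TRT-LLM pick
            std::nullopt,   // participantIds
            std::nullopt);  // no OrchestratorConfig

    // worldSize > 1: orchestrator mode, one executorWorker process per rank spawned over MPI.
    auto orchestrator = tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kORCHESTRATOR,
            std::nullopt,
            std::nullopt,
            tle::OrchestratorConfig(
                    /* isOrchestrator = */ true,
                    /* workerExecutablePath = */ workerPath,
                    /* orchLeaderComm = */ nullptr,
                    /* spawnProcesses = */ true));

    return {std::move(leader), std::move(orchestrator)};
}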