Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 00:12:08 +00:00)
chore(trtllm): create specific parallelconfig factory and logging init methods
parent 421a17544e
commit 7217cafadb
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends {
     using TokenId = tle::TokenIdType;

     const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
+    constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
+            "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
     constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
             "Submitting inference [{}] to the executor ({:d} already in-flight)");
     constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
@@ -36,6 +38,28 @@ namespace huggingface::tgi::backends {
      */
     void InitializeBackend();

+    /**
+     * Initialize logging mechanism
+     */
+    void huggingface::tgi::backends::InitializeLogging() {
+#ifdef NDEBUG
+        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+                return std::tolower(c);
+            });
+
+            if (log_level == "debug")
+                spdlog::set_level(spdlog::level::debug);
+            else
+                spdlog::set_level(spdlog::level::info);
+        }
+#else
+        spdlog::set_level(spdlog::level::debug);
+#endif
+    }
+
+
     /**
      *
      * @param config TensorRT-LLM configuration object
@@ -44,6 +68,14 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

+    /**
+     *
+     * @param worldSize
+     * @param workerPath
+     * @return
+     */
+    tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
+
     /**
      * Get the sampling configuration from the parameters provided by TGI
      * @param topK
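Note on the logging change above: TRTLLM_LOG_LEVEL is only consulted in release (NDEBUG) builds, while debug builds always force spdlog to the debug level. A minimal caller sketch, assuming the backend library and its backend.h header are available; the main function and the setenv call are illustrative, not part of this commit:

// Illustrative caller, not part of this commit: exercises the new logging path.
#include <cstdlib>      // setenv (POSIX)
#include "backend.h"    // huggingface::tgi::backends::InitializeBackend()

int main() {
    // Only honoured in release (NDEBUG) builds; debug builds force spdlog::level::debug.
    setenv("TRTLLM_LOG_LEVEL", "debug", /* overwrite = */ 1);

    // InitializeBackend() now delegates log-level setup to InitializeLogging()
    // (see the implementation hunks below).
    huggingface::tgi::backends::InitializeBackend();
    return 0;
}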
@@ -9,22 +9,12 @@
 #include "hardware.h"

 void huggingface::tgi::backends::InitializeBackend() {
-    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
-        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-
-        if (log_level == "debug")
-            spdlog::set_level(spdlog::level::debug);
-        else
-            spdlog::set_level(spdlog::level::info);
-    }
-
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();

+    InitializeLogging();
+
     SPDLOG_INFO("Backend Executor Version: {}", tle::version());
     const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
     if (numGpus.has_value()) {
@@ -34,6 +24,21 @@ void huggingface::tgi::backends::InitializeBackend() {
     }
 }

+[[nodiscard]] tle::ParallelConfig GetParallelConfig(const size_t worldSize, std::string workerPath) {
+    auto mode = tle::CommunicationMode::kLEADER;
+    std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+    if (worldSize > 1) {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        mode = tle::CommunicationMode::kORCHESTRATOR;
+        orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+    } else {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+    }
+
+    return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+}
+
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
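The call site for the new GetParallelConfig factory is outside this diff's context lines. A hedged sketch of how it would typically be consumed, assuming the standard tle::ExecutorConfig::setParallelConfig setter from the TensorRT-LLM executor API and a worldSize taken from the engine configuration; BuildExecutorConfig is an illustrative name, not code from this commit:

// Illustrative only: attach the ParallelConfig produced by the new factory to the
// ExecutorConfig before the executor is created.
#include <string>
#include "backend.h"   // tle alias, GetParallelConfig and GetExecutorConfig declarations

tle::ExecutorConfig BuildExecutorConfig(const size_t worldSize, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);

    // Leader mode for a single engine; orchestrator mode (spawning `workerPath`
    // executor workers over MPI) when the engine is sharded across more than one GPU.
    execConfig.setParallelConfig(huggingface::tgi::backends::GetParallelConfig(worldSize, workerPath));
    return execConfig;
}

Keeping the ParallelConfig construction in a dedicated, [[nodiscard]]-annotated factory keeps the leader/orchestrator decision in one place instead of inlining it into GetExecutorConfig.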