diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 0703e8cc..b5d0711b 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -8,8 +8,8 @@
 #include
 #include
-#include
 #include
+#include
 #include
 #include
@@ -20,8 +20,24 @@
 namespace tle = tensorrt_llm::executor;
 
 namespace huggingface::tgi::backends {
+    /**
+     * Initialize all the components required by TRTLLM.
+     * This function must be called before attempting to load any engine.
+     */
+    void InitializeBackend();
+
+    /**
+     * Build the executor configuration matching the engine's config.json.
+     * @param config Parsed content of the engine's config.json
+     * @param workerPath Path to the executorWorker binary used in orchestrator mode
+     * @return Executor configuration used to instantiate the tle::Executor
+     */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Backend wrapping a TensorRT-LLM executor for a decoder-only engine.
+     */
     class TensorRtLlmBackend {
     private:
         const json config;
@@ -50,23 +66,30 @@ namespace huggingface::tgi::backends {
          * @param temperature
          * @param minLength
          * @param repetitionPenalty
-         * @param frequencePenalty
+         * @param frequencyPenalty
          * @param seed
          * @param nTopTokens
          * @return
          */
         [[nodiscard]] tle::IdType Submit(
-                std::vector &tokens,
+                const std::vector &tokens,
                 int32_t maxNewTokens,
-                float_t topK,
+                int32_t topK,
                 float_t topP,
                 float_t temperature,
                 int32_t minLength,
                 std::optional repetitionPenalty = std::nullopt,
-                std::optional frequencePenalty = std::nullopt,
+                std::optional frequencyPenalty = std::nullopt,
                 std::optional seed = std::nullopt,
                 std::optional nTopTokens = std::nullopt
         );
+
+        /**
+         * Retrieve the responses currently available for an in-flight request.
+         * @param reqId Request identifier returned by Submit
+         * @return Responses returned by the executor for this request
+         */
+        std::vector Poll(tle::IdType reqId);
     };
 }
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index fc5d4594..ed9c685a 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -1,77 +1,98 @@
-#include
 #include
+#include
 
 #include "backend.h"
 
+void huggingface::tgi::backends::InitializeBackend() {
+    SPDLOG_INFO("Initializing Backend...");
+
+    initTrtLlmPlugins();
+}
+
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-    tle::ExecutorConfig execConfig(
-            config["/build_config/max_beam_width"_json_pointer].get()
-    );
-
-    execConfig.setParallelConfig(tle::ParallelConfig(
-            tle::CommunicationType::kMPI,
-            tle::CommunicationMode::kORCHESTRATOR,
-            std::nullopt,
-            std::nullopt,
-            tle::OrchestratorConfig(true, workerPath)
-    ));
-
+    tle::ExecutorConfig execConfig(1);
 
     // TODO : Need to check for >= sm_80 (ampere)
     // execConfig.setEnableChunkedContext(true)
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
+
+    if (config["/pretrained_config/mapping/world_size"_json_pointer].get() == 1) {
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+        execConfig.setParallelConfig(tle::ParallelConfig(
+                tle::CommunicationType::kMPI,
+                tle::CommunicationMode::kLEADER,
+                std::nullopt,
+                std::nullopt,
+                std::nullopt
+        ));
+    } else {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+        execConfig.setParallelConfig(tle::ParallelConfig(
+                tle::CommunicationType::kMPI,
+                tle::CommunicationMode::kORCHESTRATOR,
+                std::nullopt,
+                std::nullopt,
+                tle::OrchestratorConfig(true, workerPath)
+        ));
+    }
     return execConfig;
 }
 
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
-        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
 ):
-        config(json::parse(std::ifstream(engineFolder / "config.json"))),
-        executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
+        executor(
+                enginesFolder,
+                tensorrt_llm::executor::ModelType::kDECODER_ONLY,
+                GetExecutorConfig(config, executorWorker.string())
+        )
 {
-    initTrtLlmPlugins();
-    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get());
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref());
 }
 
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
-        std::vector &tokens,
+        const std::vector &tokens,
         const int32_t maxNewTokens,
-        const float_t topK,
+        const int32_t topK,
         const float_t topP,
         const float_t temperature,
         const int32_t minLength,
-        const std::optional repetitionPenalty,
-        const std::optional frequencePenalty,
-        const std::optional seed,
-        const std::optional nTopTokens
+        std::optional repetitionPenalty,
+        std::optional frequencyPenalty,
+        std::optional seed,
+        std::optional nTopTokens
 ) {
-//    if (IsReady()) {
-//        spdlog::debug(
-//                "Submitting inference over {:d} tokens to the executor {:d}",
-//                tokens.size(),
-//                executor.getLatestIterationStats().back().numActiveRequests
-//        );
-//
-//        const auto sampling = tle::SamplingConfig{
-//                1,
-//                topK,
-//                topP,
-//                std::nullopt,
-//                std::nullopt,
-//                std::nullopt,
-//                seed,
-//                temperature,
-//                minLength,
-//                std::nullopt,
-//                repetitionPenalty.value_or(0.0),
-//                std::nullopt,
-//                frequencePenalty.value_or(1.0),
-//        };
-//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-//
-//        return executor.enqueueRequest(request);
-//    }
-    return 0;
+    spdlog::debug(
+            "Submitting inference over {:d} tokens to the executor {:d}",
+            tokens.size(),
+            executor.getLatestIterationStats().back().numActiveRequests
+    );
+
+    const auto sampling = tle::SamplingConfig{
+            1,
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            temperature,
+            minLength,
+            std::nullopt,
+            repetitionPenalty,
+            std::nullopt,
+            frequencyPenalty,
+    };
+    const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+    const auto request = tle::Request{tokens, maxNewTokens, true, sampling, output};
+
+    return executor.enqueueRequest(request);
 }
+
+std::vector huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
+    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    const auto responses = executor.awaitResponses(reqId);
+    return responses;
+}
\ No newline at end of file
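Not part of the diff itself, but a minimal sketch of how a caller might drive the new interface end to end (InitializeBackend, construction, Submit, Poll). The engine and worker paths are placeholders, and the token element type is assumed to be `tle::TokenIdType`; in TGI this sequence is expected to be driven from the Rust side of the backend rather than a standalone binary.

```cpp
#include <cstdint>
#include <filesystem>
#include <vector>

#include "backend.h"

int main() {
    // Must run once before any engine is loaded (initializes the TRT-LLM plugins).
    huggingface::tgi::backends::InitializeBackend();

    // Placeholder paths: a compiled engine folder and the executorWorker
    // binary shipped with TensorRT-LLM.
    huggingface::tgi::backends::TensorRtLlmBackend backend(
            std::filesystem::path("/repository/engines"),
            std::filesystem::path("/usr/local/bin/executorWorker")
    );

    // Token ids would normally come from the tokenizer on the router side;
    // the element type is assumed to be tle::TokenIdType here.
    const std::vector<tle::TokenIdType> tokens{1, 15043, 3186};

    // maxNewTokens=128, topK=10, topP=0.95, temperature=0.8, minLength=2;
    // the optional penalties, seed and nTopTokens keep their defaults.
    const auto reqId = backend.Submit(tokens, 128, 10, 0.95f, 0.8f, 2);

    // Blocks until the executor has responses for this request id.
    const auto responses = backend.Poll(reqId);
    return responses.empty() ? 1 : 0;
}
```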