Make leader executor mode work

2025-09-09 03:14:53 +00:00 · 2024-07-08 22:08:49 +00:00 · 2024-07-08 22:08:49 +00:00 · da926feaa1
commit da926feaa1
parent f53ffa886d
2 changed files with 101 additions and 57 deletions
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -8,8 +8,8 @@
 #include <filesystem>
 #include <span>

-#include <nlohmann/json.hpp>
 #include <fmt/format.h>
+#include <nlohmann/json.hpp>

 #include <tensorrt_llm/runtime/common.h>
 #include <tensorrt_llm/executor/executor.h>
@ -20,8 +20,24 @@ namespace tle = tensorrt_llm::executor;

 namespace huggingface::tgi::backends {

+    /**
+     * Initialize all the components required by TRTLLM.
+     * This function must be called before attempting to load any engine,
+     * i.e. before constructing a TensorRtLlmBackend.
+     */
+    void InitializeBackend();
+
+    /**
+     * Build the TensorRT-LLM executor configuration for an engine.
+     * A world size of 1 in the engine configuration selects leader mode; any other
+     * value selects orchestrator mode, which is handed `workerPath`.
+     * @param config Parsed JSON configuration of the engine; `/pretrained_config/mapping/world_size` is read from it
+     * @param workerPath Path forwarded to the orchestrator configuration (only used in orchestrator mode);
+     *                   presumably the executor worker binary - TODO confirm
+     * @return Executor configuration to construct the executor with
+     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);

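+    /**
+     * Backend driving a TensorRT-LLM executor: requests are enqueued with Submit()
+     * and their responses are retrieved with Poll().
+     */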
    class TensorRtLlmBackend {
    private:
        const json config;
@ -50,23 +66,30 @@ namespace huggingface::tgi::backends {
         * @param temperature
         * @param minLength
         * @param repetitionPenalty
-         * @param frequencePenalty
+         * @param frequencyPenalty
         * @param seed
         * @param nTopTokens
         * @return
         */
        [[nodiscard]] tle::IdType Submit(
-                std::vector<tle::TokenIdType> &tokens,
+                const std::vector<tle::TokenIdType> &tokens,
                int32_t maxNewTokens,
-                float_t topK,
+                int32_t topK,
                float_t topP,
                float_t temperature,
                int32_t minLength,
                std::optional<float_t> repetitionPenalty = std::nullopt,
-                std::optional<float_t> frequencePenalty = std::nullopt,
+                std::optional<float_t> frequencyPenalty = std::nullopt,
                std::optional<uint32_t> seed = std::nullopt,
                std::optional<uint32_t> nTopTokens = std::nullopt
        );
+
+        /**
+         * Retrieve the responses the executor produced for a previously submitted request.
+         * NOTE(review): this presumably blocks until at least one response is available - confirm.
+         * @param reqId Request identifier, as returned by Submit()
+         * @return Responses returned by the executor for this request
+         */
+        std::vector<tle::Response> Poll(tle::IdType reqId);
    };
 }

--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -1,13 +1,32 @@
-#include <spdlog/spdlog.h>
 #include <fmt/std.h>
+#include <spdlog/spdlog.h>

 #include "backend.h"

-tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-    tle::ExecutorConfig execConfig(
-            config["/build_config/max_beam_width"_json_pointer].get<int32_t>()
-    );
+void huggingface::tgi::backends::InitializeBackend() {
+    SPDLOG_INFO("Initializing Backend...");

+    initTrtLlmPlugins();  // no longer invoked by the TensorRtLlmBackend constructor, so callers must run InitializeBackend() first
+}
+
+tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
+    tle::ExecutorConfig execConfig(1);  // beam width is now fixed to 1; it was previously read from /build_config/max_beam_width
+
+    // TODO : Need to check for >= sm_80 (ampere)
+    // execConfig.setEnableChunkedContext(true)
+    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
+
+    if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){  // world_size == 1: single engine, run in leader mode
+        SPDLOG_INFO("Detected single engine deployment, using leader mode");
+        execConfig.setParallelConfig(tle::ParallelConfig(
+                tle::CommunicationType::kMPI,
+                tle::CommunicationMode::kLEADER,
+                std::nullopt,
+                std::nullopt,
+                std::nullopt
+        ));
+    } else {
+        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kORCHESTRATOR,
@ -15,63 +34,65 @ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
                std::nullopt,
                tle::OrchestratorConfig(true, workerPath)
        ));
-
-
-    // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
-    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
+    }
    return execConfig;
 }

 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
-        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
 ):
-    config(json::parse(std::ifstream(engineFolder / "config.json"))),
-    executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+    config(json::parse(std::ifstream(enginesFolder / "config.json"))),  // NOTE(review): the stream is never checked; a missing config.json presumably surfaces as a JSON parse error - confirm
+    executor(
+        enginesFolder,
+        tensorrt_llm::executor::ModelType::kDECODER_ONLY,
+        GetExecutorConfig(config, executorWorker.string()  // reads `config`; NOTE(review): relies on `config` being declared before `executor` in the class - confirm
+    ))
 {
-    initTrtLlmPlugins();
-    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string&>());
 }

 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
-        std::vector<tle::TokenIdType> &tokens,
+        const std::vector<tle::TokenIdType> &tokens,
        const int32_t maxNewTokens,
-        const float_t topK,
+        const int32_t topK,
        const float_t topP,
        const float_t temperature,
        const int32_t minLength,
-        const std::optional<float_t> repetitionPenalty,
-        const std::optional<float_t> frequencePenalty,
-        const std::optional<uint32_t> seed,
-        const std::optional<uint32_t> nTopTokens
+        std::optional<float_t> repetitionPenalty,
+        std::optional<float_t> frequencyPenalty,
+        std::optional<uint32_t> seed,
+        std::optional<uint32_t> nTopTokens
 ) {
-//    if (IsReady()) {
-//        spdlog::debug(
-//                "Submitting inference over {:d} tokens to the executor {:d}",
-//                tokens.size(),
-//                executor.getLatestIterationStats().back().numActiveRequests
-//        );
-//
-//        const auto sampling = tle::SamplingConfig{
-//                1,
-//                topK,
-//                topP,
-//                std::nullopt,
-//                std::nullopt,
-//                std::nullopt,
-//                seed,
-//                temperature,
-//                minLength,
-//                std::nullopt,
-//                repetitionPenalty.value_or(0.0),
-//                std::nullopt,
-//                frequencePenalty.value_or(1.0),
-//        };
-//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
-//
-//        return executor.enqueueRequest(request);
-//    }
-    return 0;
+    spdlog::debug(
+            "Submitting inference over {:d} tokens to the executor {:d}",
+            tokens.size(),
+            executor.getLatestIterationStats().back().numActiveRequests  // NOTE(review): .back() assumes the returned stats are non-empty - confirm or guard
+    );
+
+    const auto sampling = tle::SamplingConfig{
+            1,
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            temperature,
+            minLength,
+            std::nullopt,
+            repetitionPenalty,
+            std::nullopt,
+            frequencyPenalty,
+    };
+    const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+    const auto request = tle::Request{tokens, maxNewTokens, true, sampling, output};  // tokens is a const reference here (previously std::move'd), so the request is presumably built from a copy
+
+    return executor.enqueueRequest(request);
+}
+
+std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
+    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    const auto responses = executor.awaitResponses(reqId);  // NOTE(review): awaitResponses presumably blocks until a response is available, although "Poll" suggests a non-blocking call - confirm
+    return responses;
+}