Make leader executor mode work

2025-06-19 15:52:08 +00:00 · 2024-07-08 22:08:49 +00:00 · 2024-07-08 22:08:49 +00:00 · da926feaa1
commit da926feaa1
parent f53ffa886d
2 changed files with 101 additions and 57 deletions
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@ -8,8 +8,8 @@
 #include <filesystem>
 #include <span>
 #include <nlohmann/json.hpp>
 #include <fmt/format.h>
 #include <nlohmann/json.hpp>
 #include <tensorrt_llm/runtime/common.h>
 #include <tensorrt_llm/executor/executor.h>
@ -20,8 +20,24 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
    /**
     * Initialize all the components required by TRTLLM.
     * It is required to call this function before attempting to load any engine
     */
    void InitializeBackend();
    /**
     *
     * @param config
     * @param workerPath
     * @param channel
     * @return
     */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
    /**
     *
     */
    class TensorRtLlmBackend {
    private:
        const json config;
@ -50,23 +66,30 @@ namespace huggingface::tgi::backends {
         * @param temperature
         * @param minLength
         * @param repetitionPenalty
-         * @param frequencePenalty
+         * @param frequencyPenalty
         * @param seed
         * @param nTopTokens
         * @return
         */
        [[nodiscard]] tle::IdType Submit(
-                std::vector<tle::TokenIdType> &tokens,
+                const std::vector<tle::TokenIdType> &tokens,
                int32_t maxNewTokens,
-                float_t topK,
+                int32_t topK,
                float_t topP,
                float_t temperature,
                int32_t minLength,
                std::optional<float_t> repetitionPenalty = std::nullopt,
-                std::optional<float_t> frequencePenalty = std::nullopt,
+                std::optional<float_t> frequencyPenalty = std::nullopt,
                std::optional<uint32_t> seed = std::nullopt,
                std::optional<uint32_t> nTopTokens = std::nullopt
        );
        /***
         *
         * @param reqId
         * @return
         */
        std::vector<tle::Response> Poll(tle::IdType reqId);
    };
 }
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@ -1,77 +1,98 @@
 #include <spdlog/spdlog.h>
 #include <fmt/std.h>
 #include <spdlog/spdlog.h>
 #include "backend.h"
 void huggingface::tgi::backends::InitializeBackend() {
    SPDLOG_INFO("Initializing Backend...");
    initTrtLlmPlugins();
 }
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
-    tle::ExecutorConfig execConfig(
+    tle::ExecutorConfig execConfig(1);
            config["/build_config/max_beam_width"_json_pointer].get<int32_t>()
    );
    execConfig.setParallelConfig(tle::ParallelConfig(
            tle::CommunicationType::kMPI,
            tle::CommunicationMode::kORCHESTRATOR,
            std::nullopt,
            std::nullopt,
            tle::OrchestratorConfig(true, workerPath)
    ));
    // TODO : Need to check for >= sm_80 (ampere)
    // execConfig.setEnableChunkedContext(true)
    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
    if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){
        SPDLOG_INFO("Detected single engine deployment, using leader mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kLEADER,
                std::nullopt,
                std::nullopt,
                std::nullopt
        ));
    } else {
        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kORCHESTRATOR,
                std::nullopt,
                std::nullopt,
                tle::OrchestratorConfig(true, workerPath)
        ));
    }
    return execConfig;
 }
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
-        const std::filesystem::path &engineFolder,
+        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
 ):
-    config(json::parse(std::ifstream(engineFolder / "config.json"))),
+    config(json::parse(std::ifstream(enginesFolder / "config.json"))),
-    executor(engineFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, GetExecutorConfig(config, executorWorker.string()))
+    executor(
        enginesFolder,
        tensorrt_llm::executor::ModelType::kDECODER_ONLY,
        GetExecutorConfig(config, executorWorker.string()
    ))
 {
-    initTrtLlmPlugins();
+    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string&>());
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["version"].get<std::string>());
 }
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
-        std::vector<tle::TokenIdType> &tokens,
+        const std::vector<tle::TokenIdType> &tokens,
        const int32_t maxNewTokens,
-        const float_t topK,
+        const int32_t topK,
        const float_t topP,
        const float_t temperature,
        const int32_t minLength,
-        const std::optional<float_t> repetitionPenalty,
+        std::optional<float_t> repetitionPenalty,
-        const std::optional<float_t> frequencePenalty,
+        std::optional<float_t> frequencyPenalty,
-        const std::optional<uint32_t> seed,
+        std::optional<uint32_t> seed,
-        const std::optional<uint32_t> nTopTokens
+        std::optional<uint32_t> nTopTokens
 ) {
-//    if (IsReady()) {
+    spdlog::debug(
-//        spdlog::debug(
+            "Submitting inference over {:d} tokens to the executor {:d}",
-//                "Submitting inference over {:d} tokens to the executor {:d}",
+            tokens.size(),
-//                tokens.size(),
+            executor.getLatestIterationStats().back().numActiveRequests
-//                executor.getLatestIterationStats().back().numActiveRequests
+    );
-//        );
+
-//
+    const auto sampling = tle::SamplingConfig{
-//        const auto sampling = tle::SamplingConfig{
+            1,
-//                1,
+            topK,
-//                topK,
+            topP,
-//                topP,
+            std::nullopt,
-//                std::nullopt,
+            std::nullopt,
-//                std::nullopt,
+            std::nullopt,
-//                std::nullopt,
+            seed,
-//                seed,
+            temperature,
-//                temperature,
+            minLength,
-//                minLength,
+            std::nullopt,
-//                std::nullopt,
+            repetitionPenalty,
-//                repetitionPenalty.value_or(0.0),
+            std::nullopt,
-//                std::nullopt,
+            frequencyPenalty,
-//                frequencePenalty.value_or(1.0),
+    };
-//        };
+    const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
-//        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+    const auto request = tle::Request{tokens, maxNewTokens, true, sampling, output};
-//        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+
-//
+    return executor.enqueueRequest(request);
-//        return executor.enqueueRequest(request);
+}
-//    }
+
-    return 0;
+std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
    SPDLOG_DEBUG("Polling request {:d}", reqId);
    const auto responses = executor.awaitResponses(reqId);
    return responses;
 }