compute the maximum number of new tokens for each request independently

Morgan Funtowicz 2024-07-17 13:55:29 +00:00
parent a01cd030d4
commit 9220340ff7
4 changed files with 73 additions and 47 deletions


@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
namespace huggingface::tgi::backends {
using RequestId = tle::IdType;
using TokenId = tle::TokenIdType;
using TokenStreamingCallback = void(tle::TokenIdType);
/**
* Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
*/
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
/**
* Get the sampling configuration from the parameters provided by TGI
* @param topK
* @param topP
* @param temperature
* @param seed
* @param beamWidth
* @return
*/
tle::SamplingConfig GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
uint64_t seed,
std::optional<int32_t> beamWidth
);
/**
*
*/
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
const std::filesystem::path &executorWorker
);
/***
/**
* Indicate if the backend is ready to accept incoming request
* @return true if ready, false otherwise
*/
[[nodiscard]] bool IsReady() const;
/***
/**
* Query the executor for the number of token available for pulling
* @return
*/
[[nodiscard]] size_t NumResponsesReady() const;
/***
/**
* Submit a new generation task to the executor
* @param tokens
* @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
uint64_t seed
);
/***
/**
*
* @param requestId The request id to poll the generation results
* @return
*/
std::vector<tle::Response> Poll(RequestId requestId);
/***
/**
* Stop the underlying executor
*/
void Shutdown();
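
For orientation, the declarations in this header already sketch the intended calling pattern. Below is a minimal, hypothetical driver assuming only the constructor and methods shown above; the include path, token ids and sampling values are placeholders, not part of the commit.

#include <filesystem>
#include "backend.h"  // hypothetical include pulling in the declarations above

// Hypothetical driver; signatures are taken from the header hunks in this commit.
void RunSingleRequest(const std::filesystem::path &engines, const std::filesystem::path &worker) {
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, worker);
    if (!backend.IsReady()) return;

    // Submit(tokens, topK, topP, temperature, seed) returns the executor request id.
    const auto requestId = backend.Submit({1, 2, 3}, /*topK=*/50, /*topP=*/0.9f, /*temperature=*/0.7f, /*seed=*/42);
    while (backend.NumResponsesReady() == 0) { /* wait, yield or sleep */ }
    for (const auto &response : backend.Poll(requestId)) {
        // Each tle::Response carries the generated token ids and a final flag.
    }
    backend.Shutdown();
}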


@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
// Get the compute capabilities of the current hardware
nvmlDevice_t device;
int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
NVML_SUCCESS) {
SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
cudaComputeCapabilitiesMinor);
if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
}
}
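
For reference, the capability probe above assumes NVML has already been initialised elsewhere in the backend. A self-contained sketch of the same detection, hypothetical and not part of this commit, which also derives the chunked-context flag the way the next hunk does:

#include <nvml.h>

// Hypothetical standalone probe; the backend itself acquires the NVML handle as shown above.
static bool SupportsChunkedContext() {
    int major = 0, minor = 0;
    if (nvmlInit_v2() == NVML_SUCCESS) {
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
            // Fills e.g. 8/0 for A100 (sm_80) or 9/0 for H100 (sm_90).
            nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
        }
        nvmlShutdown();
    }
    return major >= 8;  // chunked context is only enabled on Ampere or newer
}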
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
// Define some configuration variables
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
return execConfig;
}
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
uint64_t seed,
std::optional<int32_t> beamWidth = std::nullopt) {
return tle::SamplingConfig(
beamWidth.value_or(1),
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
std::nullopt,
temperature,
std::nullopt
);
}
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
const std::filesystem::path &enginesFolder,
const std::filesystem::path &executorWorker
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
const float_t temperature,
const uint64_t seed
) {
#ifndef NDEBUG
SPDLOG_INFO(
#ifdef NDEBUG
SPDLOG_DEBUG(
FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
tokens.size(),
executor.getLatestIterationStats().back().numActiveRequests
);
#else
SPDLOG_INFO(
SPDLOG_DEBUG(
FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
fmt::join(tokens, ", "),
executor.getLatestIterationStats().back().numActiveRequests
executor.getLatestIterationStats().front().numActiveRequests
);
#endif
const auto sampling = tle::SamplingConfig{
1,
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
std::nullopt,
temperature,
std::nullopt,
};
const auto output = tle::OutputConfig{false, false, false};
const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
const auto output = tle::OutputConfig(false, false, false, true, false);
return executor.enqueueRequest(
tle::Request{tokens, std::numeric_limits<tle::SizeType32>::max(), true, sampling, output});
tle::Request{tokens, maxNewTokens, true, sampling, output});
}
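
The two added lines above are the point of the commit: rather than enqueueing every request with std::numeric_limits<tle::SizeType32>::max(), each request now receives max_num_tokens from the engine config minus its own prompt length, clamped to at least one token. A worked example of that arithmetic with hypothetical numbers:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Mirrors the added computation; max_num_tokens is read from config["max_num_tokens"] in the real code.
int32_t ComputeMaxNewTokens(size_t maxNumTokens, size_t promptLength) {
    // Never request fewer than one new token, exactly as the hunk above does.
    return static_cast<int32_t>(std::max(1ul, maxNumTokens - promptLength));
}
// ComputeMaxNewTokens(4096, 4000) == 96 and ComputeMaxNewTokens(4096, 4096) == 1.
// The subtraction is unsigned, so prompts longer than max_num_tokens are presumably rejected upstream.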
[[nodiscard("Generated tokens result must be used")]]
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
SPDLOG_INFO("Polling status for request {}", requestId);
SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
return executor.awaitResponses(requestId);
}


@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
);
info!("Releasing lock for submit");
return request_id;
request_id
})
.await;


@@ -4,7 +4,9 @@
#pragma once
#include <cmath>
#include <exception>
#include <filesystem>
#include <iterator>
#include <vector>
#include <spdlog/spdlog.h>
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(tokens.size());
tokens_.assign(tokens.begin(), tokens.end());
std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
}
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
const uint64_t requestId,
rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
SPDLOG_INFO("Entering StreamTokens");
size_t numTokens = 0;
for (const auto &item: Poll(requestId)) {
if (!item.hasError()) {
SPDLOG_INFO("\tStreamTokens -> Decoding token...");
SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
const auto decoded = item.getResult();
SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
const auto token = decoded.outputTokenIds[0][0];
const auto isFinal = decoded.isFinal;
// const auto logProb = decoded.logProbs.value()[0][0];
const auto logProb = 0.0;
const auto logProb = decoded.logProbs.value()[0][0];
SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
++numTokens;
SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
callback(std::move(ctx), token, logProb, isFinal);
SPDLOG_INFO("\tStreamTokens -> Post callback");
SPDLOG_DEBUG("\tStreamTokens -> Post callback");
} else {
// TODO : Return rest::Result with error
SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
}
}
SPDLOG_INFO("Exiting StreamTokens");
return 0;
return numTokens;
}
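
With this change StreamTokens reports how many tokens it actually forwarded instead of a hard-coded 0, and reads the real per-token log-probability now that the OutputConfig requests it. As a hedged illustration of the same consumption pattern without the cxx-specific rust::Box/rust::Fn types, a plain-C++ helper might look like the sketch below; the function name and the std::function callback are made up for the example and rely only on the tle::Response accessors used in this file.

#include <cstdint>
#include <functional>

// Hypothetical helper mirroring the loop above.
size_t DrainReadyTokens(huggingface::tgi::backends::TensorRtLlmBackend &backend,
                        uint64_t requestId,
                        const std::function<void(uint32_t token, float logProb, bool isFinal)> &onToken) {
    size_t numTokens = 0;
    for (const auto &item : backend.Poll(requestId)) {
        if (item.hasError()) continue;  // the real code logs item.getErrorMsg() instead
        const auto decoded = item.getResult();
        const auto token = decoded.outputTokenIds[0][0];
        const auto logProb = decoded.logProbs.value()[0][0];  // available because log-probs are requested
        onToken(token, logProb, decoded.isFinal);
        ++numTokens;
    }
    return numTokens;
}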
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>