Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-06-19 15:52:08 +00:00

Commit 9220340ff7 (parent a01cd030d4): compute the maximum number of new tokens for each request independently
Backend header (C++ interface declarations):

@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
-    using TokenStreamingCallback = void(tle::TokenIdType);
 
     /**
      * Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Get the sampling configuration from the parameters provided by TGI
+     * @param topK
+     * @param topP
+     * @param temperature
+     * @param seed
+     * @param beamWidth
+     * @return
+     */
+    tle::SamplingConfig GetSamplingConfig(
+            uint32_t topK,
+            float_t topP,
+            float_t temperature,
+            uint64_t seed,
+            std::optional<int32_t> beamWidth
+    );
+
     /**
      *
      */
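For illustration only, a call site for the helper declared above could look like the sketch below. The numeric values, the function name, and the "backend.h" include are assumptions, not part of the diff; beamWidth is passed as std::nullopt so the implementation's single-beam default applies.

    // Sketch of a caller building a sampling config from TGI request parameters.
    // "backend.h" is an assumed name for the header shown in this hunk.
    #include <cstdint>
    #include <optional>
    #include "backend.h"

    tle::SamplingConfig BuildSamplingForRequest(uint64_t seed) {
        // topK = 10, topP = 0.95, temperature = 0.8 are illustrative request values;
        // beamWidth is left unset, so the helper falls back to a single beam.
        return huggingface::tgi::backends::GetSamplingConfig(10, 0.95f, 0.8f, seed, std::nullopt);
    }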
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
-        /***
+        /**
          * Indicate if the backend is ready to accept incoming request
          * @return true if ready, false otherwise
          */
         [[nodiscard]] bool IsReady() const;
 
-        /***
+        /**
         * Query the executor for the number of token available for pulling
         * @return
         */
        [[nodiscard]] size_t NumResponsesReady() const;
 
-        /***
+        /**
         * Submit a new generation task to the executor
         * @param tokens
         * @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
             uint64_t seed
         );
 
-        /***
+        /**
         *
        * @param requestId The request id to poll the generation results
        * @return
        */
        std::vector<tle::Response> Poll(RequestId requestId);
 
-        /***
+        /**
        * Stop the underlying executor
        */
        void Shutdown();
Backend implementation (C++):

@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Get the compute capabilities of the current hardware
     nvmlDevice_t device;
-    int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
+    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
     if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
         SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
-            NVML_SUCCESS) {
-            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
-                        cudaComputeCapabilitiesMinor);
+        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
+            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
         }
     }
 
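As a standalone reference (not part of the diff), the same compute-capability probe can be exercised on its own with the NVML API. This minimal sketch initializes NVML itself and links against libnvidia-ml; the backend above relies on NVML being initialized elsewhere.

    // Minimal NVML sketch: query the CUDA compute capability of GPU 0,
    // mirroring the calls used in GetExecutorConfig above.
    // Build with: g++ probe.cc -lnvidia-ml   (file name is illustrative)
    #include <cstdio>
    #include <nvml.h>

    int main() {
        if (nvmlInit_v2() != NVML_SUCCESS) return 1;

        nvmlDevice_t device;
        int major = 0, minor = 0;
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
            nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
            std::printf("Detected sm_%d%d\n", major, minor);   // e.g. sm_90 on a Hopper GPU
        }

        nvmlShutdown();
        return 0;
    }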
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
     return execConfig;
 }
 
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+        uint32_t topK,
+        float_t topP,
+        float_t temperature,
+        uint64_t seed,
+        std::optional<int32_t> beamWidth = std::nullopt) {
+    return tle::SamplingConfig(
+            beamWidth.value_or(1),
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            std::nullopt,
+            temperature,
+            std::nullopt
+    );
+}
+
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
         const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
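The chunked-context switch above keys on the compute major version only: it is enabled for sm_80 and newer (Ampere-class and later) and disabled otherwise. A trivial standalone restatement of that gate, for reference (names are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Same condition as execConfig.setEnableChunkedContext(cudaComputeMajor >= 8):
    // chunked context is only enabled on compute capability 8.x or newer.
    constexpr bool ChunkedContextSupported(int32_t computeMajor) {
        return computeMajor >= 8;
    }

    int main() {
        std::printf("major 7 (e.g. sm_75): %s\n", ChunkedContextSupported(7) ? "enabled" : "disabled");
        std::printf("major 8 (e.g. sm_80): %s\n", ChunkedContextSupported(8) ? "enabled" : "disabled");
        std::printf("major 9 (e.g. sm_90): %s\n", ChunkedContextSupported(9) ? "enabled" : "disabled");
        return 0;
    }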
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const float_t temperature,
         const uint64_t seed
 ) {
-#ifndef NDEBUG
-    SPDLOG_INFO(
+#ifdef NDEBUG
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
 #else
-    SPDLOG_INFO(
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
             fmt::join(tokens, ", "),
-            executor.getLatestIterationStats().back().numActiveRequests
+            executor.getLatestIterationStats().front().numActiveRequests
     );
 #endif
 
-    const auto sampling = tle::SamplingConfig{
-            1,
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            std::nullopt,
-            temperature,
-            std::nullopt,
-    };
-    const auto output = tle::OutputConfig{false, false, false};
+    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
+    const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
+
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
+    const auto output = tle::OutputConfig(false, false, false, true, false);
     return executor.enqueueRequest(
-            tle::Request{tokens, std::numeric_limits<tle::SizeType32>::max(), true, sampling, output});
+            tle::Request{tokens, maxNewTokens, true, sampling, output});
 }
 
 [[nodiscard("Generated tokens result must be used")]]
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
-    SPDLOG_INFO("Polling status for request {}", requestId);
+    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
     return executor.awaitResponses(requestId);
 }
 
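This hunk is the heart of the commit: instead of asking the executor for an unbounded number of new tokens (std::numeric_limits<tle::SizeType32>::max()), each request now receives whatever budget remains once its own prompt is subtracted from the engine's max_num_tokens. A minimal, self-contained sketch of that arithmetic follows; the values 4096 and 1000 and the helper name are hypothetical, not from the diff.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors: maxNewTokens = max(1, maxNumTokens - tokens.size()).
    // Assumes the prompt is shorter than maxNumTokens, as in the diff
    // (the subtraction is performed on unsigned values).
    int32_t MaxNewTokensFor(size_t maxNumTokens, size_t promptLength) {
        return static_cast<int32_t>(std::max<size_t>(1, maxNumTokens - promptLength));
    }

    int main() {
        // Hypothetical engine limit of 4096 tokens and a 1000-token prompt:
        // the request may generate at most 4096 - 1000 = 3096 new tokens.
        std::printf("%d\n", MaxNewTokensFor(4096, 1000));
        return 0;
    }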
Rust backend:

@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
                 );
 
                 info!("Releasing lock for submit");
-                return request_id;
+                request_id
             })
             .await;
 
FFI layer between the Rust frontend and the C++ backend:

@@ -4,7 +4,9 @@
 #pragma once
 
 #include <cmath>
+#include <exception>
 #include <filesystem>
+#include <iterator>
 #include <vector>
 
 #include <spdlog/spdlog.h>
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
         rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
 
     // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.size());
-    tokens_.assign(tokens.begin(), tokens.end());
+    std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
 
     return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
 }
 
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
-        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
-        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
+size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+        const uint64_t requestId,
+        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
+        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
 
-    SPDLOG_INFO("Entering StreamTokens");
+    size_t numTokens = 0;
     for (const auto &item: Poll(requestId)) {
         if (!item.hasError()) {
-            SPDLOG_INFO("\tStreamTokens -> Decoding token...");
+            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
             const auto decoded = item.getResult();
-            SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
 
             const auto token = decoded.outputTokenIds[0][0];
             const auto isFinal = decoded.isFinal;
-            // const auto logProb = decoded.logProbs.value()[0][0];
-            const auto logProb = 0.0;
+            const auto logProb = decoded.logProbs.value()[0][0];
 
-            SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+            ++numTokens;
 
+            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
             callback(std::move(ctx), token, logProb, isFinal);
-            SPDLOG_INFO("\tStreamTokens -> Post callback");
+            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
         } else {
             // TODO : Return rest::Result with error
             SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
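For reference, a self-contained sketch of the slice-to-vector conversion performed in Submit in the hunk above. cxx's rust::Slice is a non-owning view over the Rust &[u32] and exposes begin()/end(), so the conversion is an element-wise copy into the int32_t vector the backend expects (moving const uint32_t elements, as the diff does, degrades to the same copy). The function name is illustrative.

    #include <cstdint>
    #include <vector>
    #include "rust/cxx.h"   // provides rust::Slice (shipped with the cxx bridge)

    // Copy the u32 token ids handed over from Rust into the int32_t vector
    // expected by TensorRtLlmBackend::Submit.
    std::vector<int32_t> CopyTokens(rust::Slice<const uint32_t> tokens) {
        return std::vector<int32_t>(tokens.begin(), tokens.end());
    }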
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
         }
     }
 
-    SPDLOG_INFO("Exiting StreamTokens");
-    return 0;
+    return numTokens;
 }
 
 std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>