feat(backend): impl missing generation_step_t as return value of pull_tokens

Morgan Funtowicz 2024-12-02 23:28:25 +01:00
parent a7bad25c41
commit 2f8634ec01
7 changed files with 75 additions and 120 deletions
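
For orientation, here is a sketch of the shared decoding-step type as it is exposed on the Rust side of the cxx bridge after this change. Only `request_id` is visible in the hunks below, so the remaining field names and types are inferred from the C++ initializer order in `pull_tokens` and should be treated as assumptions, not as verbatim source:

#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
mod ffi {
    // Shared struct between Rust and C++; C++ sees it as `generation_step_t`.
    #[cxx_name = "generation_step_t"]
    #[derive(Debug, Clone)]
    pub struct GenerationStep {
        request_id: u64,   // executor request id (request_id_t is now u64)
        token_id: u32,     // assumed: token decoded during this step
        log_prob: f32,     // assumed: log-probability of that token
        is_final: bool,    // assumed: true once the request has finished
        has_error: bool,   // assumed: true when the executor reported an error
        error_msg: String, // assumed: error message forwarded from C++
    }
}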

View File

@@ -4,7 +4,7 @@ use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
 
-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");

View File

@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }
 
     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<tle::TokenIdType> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
                 {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens

View File

@@ -1,3 +1,4 @@
+#pragma once
 #include <cmath>
 #include <cstdint>
 #include <exception>
@@ -17,7 +18,7 @@
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;
 
     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;
@@ -54,7 +54,7 @@ namespace huggingface::tgi::backends::trtllm {
                     repetition_penalty,
                     std::nullopt,
                     frequency_penalty,
-                    length_penalty
+                    std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@ namespace huggingface::tgi::backends::trtllm {
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
-       submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
 
        /**
+       submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
        * Query the number of tokens available across all in-flight generations
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_p
    auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
        return fmt::format_to(
                ctx.out(),
-               "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-               c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+               "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+               c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

View File

@@ -1,9 +1,18 @@
+#ifndef TGI_BACKEND_TRTLLM_FFI
+#define TGI_BACKEND_TRTLLM_FFI
+
 #include <memory>
 #include <thread>
 
 #include <tensorrt_llm/common/tllmException.h>
 #include <tensorrt_llm/plugins/api/tllmPlugin.h>
 
+#include <spdlog/spdlog.h>
+#include <spdlog/pattern_formatter.h>
+#include <spdlog/fmt/fmt.h>
+
+#include <backend.hpp>
+
 namespace rust::behavior {
     template<typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
@@ -13,11 +22,11 @@ namespace rust::behavior {
     }
 }
 
-#include <spdlog/spdlog.h>
-#include <spdlog/pattern_formatter.h>
-#include <spdlog/fmt/fmt.h>
-
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
+
+#include "backends/trtllm/src/lib.rs.h"
 
 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
 
             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                    {tokens.data(), tokens.size()},
+                    signed_tokens,
                     {max_new_tokens},
                     {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@ namespace huggingface::tgi::backends::trtllm {
             }
         }
+
+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                                reqId,
+                                static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                                result.logProbs.value()[0][0],
+                                result.isFinal,
+                                false,
+                                std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                                reqId,
+                                0,
+                                0.0,
+                                true,
+                                true,
+                                std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
 
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
@@ -104,7 +151,7 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }
 
-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
                 std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@ namespace huggingface::tgi::backends::trtllm {
         );
     }
 }
+#endif
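
On the Rust side, the new `pull_tokens` surfaces as `Result<UniquePtr<CxxVector<GenerationStep>>>`. A rough sketch of draining it follows; this is an illustration only, not code from the commit: `backend` is assumed to be a `UniquePtr<TensorRtLlmBackendImpl>` as declared in the bridge further down, the `GenerationStep` field names are inferred from the C++ initializer order above, and the logging assumes the crate's existing tracing dependency.

// Illustration only; field names on GenerationStep are assumptions inferred
// from the C++ initializer order of generation_step_t.
if backend.num_tokens_ready() > 0 {
    match backend.pin_mut().pull_tokens() {
        Ok(steps) => {
            for step in steps.iter() {
                if step.has_error {
                    // Error branch: the C++ side zeroes token_id/log_prob, so
                    // only request_id and error_msg carry information here.
                    tracing::warn!("request {} failed: {}", step.request_id, step.error_msg);
                } else {
                    // Success branch: forward (request_id, token_id, log_prob, is_final)
                    // to whatever consumes decoded tokens.
                }
            }
        }
        Err(e) => tracing::error!("pull_tokens() failed: {}", e.what()),
    }
}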

View File

@ -1,89 +0,0 @@
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once
#include <algorithm>
#include <exception>
#include <filesystem>
#include <functional>
#include <limits>
#include <iterator>
#include <ranges>
#include <vector>
#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
const std::string_view &engineFolder,
const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens,
uint32_t maxNewTokens,
int32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
return TensorRtLlmBackend::Submit(
std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
const auto responses = TensorRtLlmBackend::PullNewTokens();
auto steps = std::make_unique<std::vector<GenerationStep>>();
steps->reserve(responses.size());
#ifndef NDEBUG
SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
#endif
// Transform tle::Response to GenerationStep
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
const auto reqId = r.getRequestId();
if (!r.hasError()) {
const auto result = r.getResult();
return GenerationStep{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
result.logProbs.value()[0][0],
result.isFinal,
false,
std::string()
};
} else {
return GenerationStep{
reqId,
0,
0.0,
true,
true,
std::move(r.getErrorMsg())
};
}
});
return steps;
}
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
SPDLOG_INFO("Creating TensorRT-LLM Backend");
// Unconditionally call this to initialize and discover TRTLLM plugins
InitializeBackend();
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}

View File

@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;
 
-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }
 
     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");
 
         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;
 
         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
         ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
 
-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
 
-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@ mod ffi {
             seed: u64,
         ) -> Result<u64>;
 
-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
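
With the `#[rust_name]` shims gone, the bridge exposes a plain snake_case API. A minimal sketch of driving it from the same crate follows; this is not one of the commit's call sites: the engine paths, prompt tokens and sampling values are placeholders, and the parameter between repetition_penalty and seed is assumed to be frequency_penalty, following the old Submit signature.

// Sketch only: placeholder paths and values; argument order after top_p is an
// assumption based on the removed Submit() signature.
fn drive_backend() -> Result<u64, cxx::Exception> {
    let mut backend = ffi::create_backend_from_engine_folder("/path/to/engines", "/path/to/executorWorker")?;

    // submit() now takes top_k as u32, so the `as i32` cast at the call site goes away.
    let request_id = backend.pin_mut().submit(
        &[1u32, 15043, 3186], // prompt token ids
        128,                  // max_new_tokens
        10,                   // top_k
        0.95,                 // top_p
        1.0,                  // temperature
        1.0,                  // repetition_penalty
        0.0,                  // frequency_penalty (assumed position)
        42,                   // seed
    )?;
    Ok(request_id)
}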

View File

@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};
 
 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;
 
 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@ fn executor_status_looper(
                 match backend.pin_mut().submit(
                     &input_ids.unwrap(), // This is checked beforehand in validate()
                     stopping_params.max_new_tokens,
-                    generation_params.top_k as i32,
+                    generation_params.top_k,
                     generation_params.top_p,
                     generation_params.temperature,
                     generation_params.repetition_penalty,
@@ -120,7 +120,7 @@ fn executor_status_looper(
             }
         }
 
-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
             match backend.pin_mut().pull_tokens() {
                 Ok(responses) => {
                     // Iterate through all the decoded token
@@ -298,7 +298,7 @@ impl TensorRtLlmBackendV2 {
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();
 
         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
             .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval