mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 14:52:20 +00:00

feat(backend): impl missing generation_step_t as return value of pull_tokens

This commit is contained in:
parent a7bad25c41
commit 2f8634ec01
backends/trtllm/build.rs
@@ -4,7 +4,7 @@ use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};

-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");
backends/trtllm/csrc/backend.cpp
@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }

     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<tle::TokenIdType> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
             {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
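Since submit returns std::expected<request_id_t, backend_error_t> (C++23), callers branch on the carried value instead of catching exceptions. A minimal self-contained sketch of that calling pattern — request_id_t matches backend.hpp, but submit_stub and the backend_error_t values are invented stand-ins, not the real backend:

#include <cstdint>
#include <cstdio>
#include <expected>

using request_id_t = uint64_t;
enum class backend_error_t { executor_not_ready, unknown };

// Stand-in for backend_t::submit(...)
static std::expected<request_id_t, backend_error_t> submit_stub(bool ok) {
    if (ok) return request_id_t{42};                              // success: executor-assigned id
    return std::unexpected(backend_error_t::executor_not_ready);  // failure: typed error, no throw
}

int main() {
    if (const auto maybe_id = submit_stub(true); maybe_id.has_value()) {
        std::printf("scheduled request %llu\n", static_cast<unsigned long long>(*maybe_id));
    } else {
        std::printf("submit failed: error %d\n", static_cast<int>(maybe_id.error()));
    }
}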
backends/trtllm/csrc/backend.hpp
@@ -1,3 +1,4 @@
 #pragma once
+#include <cmath>
 #include <cstdint>
 #include <exception>
@@ -17,7 +18,7 @@
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;

     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;

@@ -54,7 +54,7 @@ namespace huggingface::tgi::backends::trtllm {
                 repetition_penalty,
                 std::nullopt,
                 frequency_penalty,
-                length_penalty
+                std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@ namespace huggingface::tgi::backends::trtllm {
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
-        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
+        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;

        /**
         * Query the number of tokens available across all in-flight generations
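A note on the signature change: std::span<const token_id_t> lets one declaration accept both mutable and immutable token buffers while promising not to modify them. A small self-contained sketch, assuming token_id_t is a 32-bit signed integer standing in for tle::TokenIdType:

#include <cstddef>
#include <cstdint>
#include <span>
#include <vector>

using token_id_t = int32_t;  // assumption, stand-in for tle::TokenIdType

// Read-only view: binds to const and non-const contiguous storage alike
static std::size_t count_tokens(std::span<const token_id_t> tokens) {
    return tokens.size();
}

int main() {
    std::vector<token_id_t> mutable_tokens{1, 2, 3};
    const std::vector<token_id_t> const_tokens{4, 5};
    return static_cast<int>(count_tokens(mutable_tokens) + count_tokens(const_tokens));
}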
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> {
     auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
         return fmt::format_to(
             ctx.out(),
-            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
         );
     }
 };
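To see the formatter change in isolation, here is a self-contained sketch of the same fmt::formatter pattern with length_penalty dropped; the struct below is a simplified stand-in for the real sampling_params_t (field types are assumptions):

#include <cstdint>
#include <fmt/format.h>

struct sampling_params_t {   // simplified stand-in, not the real header
    uint32_t top_k;
    float top_p;
    float repetition_penalty;
    float frequency_penalty;
    float temperature;
    uint64_t seed;
};

// Inherit parse() from a base formatter; only format() needs providing
template <> struct fmt::formatter<sampling_params_t> : fmt::formatter<fmt::string_view> {
    auto format(sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
        return fmt::format_to(
            ctx.out(),
            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

int main() {
    fmt::print("{}\n", sampling_params_t{10, 0.95f, 1.1f, 0.0f, 0.8f, 42});
}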
backends/trtllm/csrc/ffi.hpp
@@ -1,9 +1,18 @@
 #ifndef TGI_BACKEND_TRTLLM_FFI
 #define TGI_BACKEND_TRTLLM_FFI

 #include <memory>
 #include <thread>

+#include <tensorrt_llm/common/tllmException.h>
+#include <tensorrt_llm/plugins/api/tllmPlugin.h>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/pattern_formatter.h>
+#include <spdlog/fmt/fmt.h>
+
+#include <backend.hpp>
+
 namespace rust::behavior {
     template<typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
@@ -13,11 +22,11 @@ namespace rust::behavior {
         }
     }

-#include <spdlog/spdlog.h>
-#include <spdlog/pattern_formatter.h>
-#include <spdlog/fmt/fmt.h>
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
+
 #include "backends/trtllm/src/lib.rs.h"

 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
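The rust::behavior::trycatch template above is the hook cxx uses to translate C++ exceptions into Result::Err on the Rust side; the diff elides its body, which conventionally follows this shape (a sketch of the standard cxx pattern, not the exact elided code):

#include <exception>

namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();             // run the bridged call
    } catch (const std::exception &e) {
        fail(e.what());     // surfaces as Err(cxx::Exception) in Rust
    }
}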
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                {tokens.data(), tokens.size()},
+                signed_tokens,
                 {max_new_tokens},
                 {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@ namespace huggingface::tgi::backends::trtllm {
             }
         }

+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                            reqId,
+                            static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                            result.logProbs.value()[0][0],
+                            result.isFinal,
+                            false,
+                            std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                            reqId,
+                            0,
+                            0.0,
+                            true,
+                            true,
+                            std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
+
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
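For orientation, the positional brace-initializers in the transform above line up with the fields of the shared generation_step_t struct declared in src/lib.rs. A hypothetical C++ mirror of that layout (the real type is generated by cxx, and its string field is rust::String rather than std::string):

#include <cstdint>
#include <string>

// Hypothetical mirror of the cxx-generated shared struct; field order matches
// the brace-initializers in pull_tokens() above.
struct generation_step_t {
    uint64_t request_id;    // executor request this step belongs to
    uint32_t token_id;      // sampled token (0 on the error path)
    float log_prob;         // log-probability of the sampled token
    bool is_final;          // true once the request finished generating
    bool has_error;         // mirrors the else-branch above
    std::string error_msg;  // empty unless has_error (rust::String in generated code)
};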
@@ -104,7 +151,7 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }

-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
             std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@ namespace huggingface::tgi::backends::trtllm {
         );
     }
 }
+#endif
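The std::call_once above guarantees the one-time TensorRT-LLM initialization runs exactly once no matter how many backends are created. A minimal self-contained sketch of the pattern (initialize_backend_stub stands in for initialize_tensorrt_llm_backend):

#include <cstdio>
#include <mutex>

static std::once_flag backend_initialized_flag;

static void initialize_backend_stub() {
    std::puts("initialized exactly once");  // stands in for plugin/logger setup
}

int main() {
    std::call_once(backend_initialized_flag, initialize_backend_stub);
    std::call_once(backend_initialized_flag, initialize_backend_stub);  // second call is a no-op
}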
backends/trtllm/src/ffi.cpp (deleted)
@@ -1,89 +0,0 @@
-//
-// Created by mfuntowicz on 6/30/24.
-//
-#pragma once
-
-#include <algorithm>
-#include <exception>
-#include <filesystem>
-#include <functional>
-#include <limits>
-#include <iterator>
-#include <ranges>
-#include <vector>
-
-#include <spdlog/spdlog.h>
-#include "backends/trtllm/include/ffi.h"
-
-
-huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
-        const std::string_view &engineFolder,
-        const std::string_view &executorWorker
-) : TensorRtLlmBackend(engineFolder, executorWorker) {}
-
-
-uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
-        rust::Slice<const uint32_t> tokens,
-        uint32_t maxNewTokens,
-        int32_t topK,
-        float_t topP,
-        float_t temperature,
-        float_t repetition_penalty,
-        float_t frequency_penalty,
-        uint64_t seed) {
-
-    // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
-    return TensorRtLlmBackend::Submit(
-            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
-}
-
-std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
-huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
-    const auto responses = TensorRtLlmBackend::PullNewTokens();
-
-    auto steps = std::make_unique<std::vector<GenerationStep>>();
-    steps->reserve(responses.size());
-
-#ifndef NDEBUG
-    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
-#endif
-
-    // Transform tle::Response to GenerationStep
-    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
-        const auto reqId = r.getRequestId();
-        if (!r.hasError()) {
-            const auto result = r.getResult();
-            return GenerationStep{
-                    reqId,
-                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
-                    result.logProbs.value()[0][0],
-                    result.isFinal,
-                    false,
-                    std::string()
-            };
-        } else {
-            return GenerationStep{
-                    reqId,
-                    0,
-                    0.0,
-                    true,
-                    true,
-                    std::move(r.getErrorMsg())
-            };
-        }
-    });
-
-    return steps;
-}
-
-std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
-huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
-    SPDLOG_INFO("Creating TensorRT-LLM Backend");
-    // Unconditionally call this to initialize and discover TRTLLM plugins
-    InitializeBackend();
-
-    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
-    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
-}
backends/trtllm/src/lib.rs
@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;

-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }

     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");

         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;

         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
         ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;

-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;

-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@ mod ffi {
             seed: u64,
         ) -> Result<u64>;

-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
backends/trtllm/src/looper.rs
@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};

 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;

 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@ fn executor_status_looper(
             match backend.pin_mut().submit(
                 &input_ids.unwrap(), // This is checked beforehand in validate()
                 stopping_params.max_new_tokens,
-                generation_params.top_k as i32,
+                generation_params.top_k,
                 generation_params.top_p,
                 generation_params.temperature,
                 generation_params.repetition_penalty,
@@ -120,7 +120,7 @@ fn executor_status_looper(
             }
         }

-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
            match backend.pin_mut().pull_tokens() {
                Ok(responses) => {
                    // Iterate through all the decoded token
@@ -298,7 +298,7 @@ impl TensorRtLlmBackendV2 {
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();

         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;

        // Executor looper is responsible for scheduling and pulling requests state at regular interval