diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index fe60769fe..0a0f6e6bf 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -4,7 +4,7 @@
 use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");
diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp
index 362fe8783..e593f4396 100644
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }
 
     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
             {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp
index c8f8f21cd..376513589 100644
--- a/backends/trtllm/csrc/backend.hpp
+++ b/backends/trtllm/csrc/backend.hpp
@@ -1,3 +1,4 @@
+#pragma once
 #include <cmath>
 #include <cstdint>
 #include <expected>
@@ -17,7 +18,7 @@ namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
 
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;
 
     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;
 
@@ -54,7 +54,7 @@
                 repetition_penalty,
                 std::nullopt,
                 frequency_penalty,
-                length_penalty
+                std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@
          */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
         std::expected<request_id_t, backend_error_t>
-        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
+        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
         /**
          * Query the number of tokens available across all in-flight generations
          */
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
     auto format(huggingface::tgi::backends::trtllm::sampling_params_t c, format_context& ctx) const -> format_context::iterator {
         return fmt::format_to(
             ctx.out(),
-            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
         );
     }
 };
\ No newline at end of file
diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index b3f20b838..dc9fdd0fb 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,9 +1,18 @@
+#ifndef TGI_BACKEND_TRTLLM_FFI
+#define TGI_BACKEND_TRTLLM_FFI
+
 #include <exception>
 #include <memory>
 #include <thread>
 #include <utility>
+#include <algorithm>
+#include <ranges>
+#include <vector>
+
+#include <spdlog/spdlog.h>
+
 
 namespace rust::behavior {
     template <typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
         func();
@@ -13,11 +22,11 @@ namespace rust::behavior {
     }
 }
 
-#include <spdlog/spdlog.h>
-#include <tensorrt_llm/executor/executor.h>
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
 
-#include <backends/trtllm/src/lib.rs.h>
+#include "backends/trtllm/src/lib.rs.h"
 
 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
 
             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                {tokens.data(), tokens.size()},
+                signed_tokens,
                 {max_new_tokens},
                 {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@
             }
         }
 
+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                            reqId,
+                            static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                            result.logProbs.value()[0][0],
+                            result.isFinal,
+                            false,
+                            std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                            reqId,
+                            0,
+                            0.0,
+                            true,
+                            true,
+                            std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
+
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
         }
@@ -104,7 +151,7 @@
         }
     }
 
-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
             std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@
             std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
         );
     }
 }
+#endif
\ No newline at end of file
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
deleted file mode 100644
index 0a92c050f..000000000
--- a/backends/trtllm/src/ffi.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//
-// Created by mfuntowicz on 6/30/24.
-//
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <filesystem>
-#include <iterator>
-#include <memory>
-#include <vector>
-
-#include <spdlog/spdlog.h>
-#include "backends/trtllm/include/ffi.h"
-
-
-huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
-        const std::string_view &engineFolder,
-        const std::string_view &executorWorker
-) : TensorRtLlmBackend(engineFolder, executorWorker) {}
-
-
-uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
-        rust::Slice<const uint32_t> tokens,
-        uint32_t maxNewTokens,
-        int32_t topK,
-        float_t topP,
-        float_t temperature,
-        float_t repetition_penalty,
-        float_t frequency_penalty,
-        uint64_t seed) {
-
-    // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
-    return TensorRtLlmBackend::Submit(
-            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
-}
-
-std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
-huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
-    const auto responses = TensorRtLlmBackend::PullNewTokens();
-
-    auto steps = std::make_unique<std::vector<GenerationStep>>();
-    steps->reserve(responses.size());
-
-#ifndef NDEBUG
-    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
-#endif
-
-    // Transform tle::Response to GenerationStep
-    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
-        const auto reqId = r.getRequestId();
-        if (!r.hasError()) {
-            const auto result = r.getResult();
-            return GenerationStep{
-                    reqId,
-                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
-                    result.logProbs.value()[0][0],
-                    result.isFinal,
-                    false,
-                    std::string()
-            };
-        } else {
-            return GenerationStep{
-                    reqId,
-                    0,
-                    0.0,
-                    true,
-                    true,
-                    std::move(r.getErrorMsg())
-            };
-        }
-    });
-
-    return steps;
-}
-
-std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
-huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
-    SPDLOG_INFO("Creating TensorRT-LLM Backend");
-    // Unconditionally call this to initialize and discover TRTLLM plugins
-    InitializeBackend();
-
-    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
-    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
-}
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index edd8caff1..7b60593df 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;
 
-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }
 
     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");
 
         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;
 
         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
        ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
 
-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
 
-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@
             seed: u64,
         ) -> Result<u64>;
 
-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index e26155c16..8e9ff49d5 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};
 
 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;
 
 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@
                 match backend.pin_mut().submit(
                     &input_ids.unwrap(), // This is checked beforehand in validate()
                     stopping_params.max_new_tokens,
-                    generation_params.top_k as i32,
+                    generation_params.top_k,
                     generation_params.top_p,
                     generation_params.temperature,
                     generation_params.repetition_penalty,
@@ -120,7 +120,7 @@
             }
         }
 
-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
             match backend.pin_mut().pull_tokens() {
                 Ok(responses) => {
                     // Iterate through all the decoded token
@@ -298,7 +298,7 @@
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();
 
         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval
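
Usage sketch (illustrative, not part of the patch): the snippet below shows how the renamed bridge functions fit together after this change, mirroring the flow of `executor_status_looper`. It assumes in-crate code (`mod ffi` is private), and the function name, paths, token ids, and sampling values are all placeholders.

```rust
use crate::ffi::create_backend_from_engine_folder;

fn drive_one_request() -> Result<(), cxx::Exception> {
    // Placeholder paths: a compiled TensorRT-LLM engine folder and the
    // executorWorker binary shipped with TensorRT-LLM.
    let mut backend =
        create_backend_from_engine_folder("/data/engine", "/usr/local/bin/executorWorker")?;

    // submit() schedules one request; request_id_t is now u64 end-to-end,
    // and top_k is passed as u32 (the `as i32` cast in looper.rs is gone).
    let request_id: u64 = backend.pin_mut().submit(
        &[1u32, 2, 3], // prompt token ids (placeholders)
        128,           // max_new_tokens
        50,            // top_k
        0.95,          // top_p
        1.0,           // temperature
        1.0,           // repetition_penalty
        0.0,           // frequency_penalty
        2014,          // seed
    )?;

    // Poll-and-pull, as the looper does: only pull when tokens are ready.
    if backend.num_tokens_ready() > 0 {
        let steps = backend.pin_mut().pull_tokens()?;
        // Each GenerationStep (generation_step_t on the C++ side) carries one
        // decoded token, its log-prob, and the final/error flags for a request.
        println!("request {request_id}: pulled {} steps", steps.len());
    }
    Ok(())
}
```

Since `pull_tokens` drains steps for every in-flight request, a real caller would dispatch the returned steps by their `request_id`, as the looper's post-processing path does.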