From f7bd82a90ef02048b319c87e5bcc46e22f5260f5 Mon Sep 17 00:00:00 2001 From: Tzu-Yu Lee Date: Sun, 25 May 2025 17:40:45 +0800 Subject: [PATCH] feat(trtllm): get more accurate start time Get a more accurate inference start time from the trtllm response. Because `Instant` does not expose absolute value, create reference points on both sides and return duration relative to the reference point instead. --- backends/trtllm/csrc/backend.cpp | 9 +++++++- backends/trtllm/csrc/ffi.hpp | 36 ++++++++++++++++++++++++++------ backends/trtllm/src/lib.rs | 4 ++++ backends/trtllm/src/looper.rs | 35 ++++++++++++++++++++++++------- 4 files changed, 70 insertions(+), 14 deletions(-) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index 2151466b..4a131e31 100644 --- a/backends/trtllm/csrc/backend.cpp +++ b/backends/trtllm/csrc/backend.cpp @@ -59,7 +59,14 @@ namespace huggingface::tgi::backends::trtllm { static_cast(g_params.max_new_tokens), true, (tle::SamplingConfig) s_params, - tle::OutputConfig{ /* returnLogProbs= */ true}, + tle::OutputConfig{ + /* returnLogProbs= */ true, + false, + false, + false, + false, + /* returnPerfMetrics=*/ true, + }, std::nullopt, std::nullopt, std::nullopt, diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index d31743ee..624259cf 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -1,6 +1,7 @@ #ifndef TGI_BACKEND_TRTLLM_FFI #define TGI_BACKEND_TRTLLM_FFI +#include #include #include #include @@ -52,7 +53,7 @@ namespace huggingface::tgi::backends::trtllm { } } - static auto as_generation_step = [](const tle::Response &r) { + static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point created) { const auto reqId = r.getRequestId(); if (!r.hasError()) [[likely]] { const auto result = r.getResult(); @@ -66,14 +67,23 @@ namespace huggingface::tgi::backends::trtllm { log_prob = result.logProbs.value()[0].back(); } + std::optional first_scheduled_time_ns = std::nullopt; + if (result.requestPerfMetrics) { + const auto &t = result.requestPerfMetrics->timingMetrics; + const auto ns = std::chrono::duration_cast(t.firstScheduledTime - created).count(); + first_scheduled_time_ns = static_cast(ns); + } + return generation_step_t{ reqId, token_id.value_or(0), log_prob.value_or(0.0), + first_scheduled_time_ns.value_or(0), result.isFinal, as_finish_reason_t(result.finishReasons[0]), token_id.has_value(), log_prob.has_value(), + first_scheduled_time_ns.has_value(), false, std::string() }; @@ -82,10 +92,12 @@ namespace huggingface::tgi::backends::trtllm { reqId, 0, 0.0, + 0, true, finish_reason_t::kNOT_FINISHED, false, false, + false, true, std::move(r.getErrorMsg()) }; @@ -97,9 +109,16 @@ namespace huggingface::tgi::backends::trtllm { private: backend_t inner_; + // m_created_time is a reference point to convert time from c++ time_point + // to rust Instant. + std::chrono::time_point m_created_time; + + public: - tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder, executor_worker_path) {} + tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point& created_time) + : inner_(engine_folder, executor_worker_path), + m_created_time {created_time} + {} size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); } @@ -139,13 +158,16 @@ namespace huggingface::tgi::backends::trtllm { SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size()); + auto f = [this](const tle::Response &r){ + return as_generation_step(r, m_created_time); + }; // Transform tle::Response to generation_step_t #ifdef __cpp_lib_ranges_to_container - auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to(); + auto steps = responses | std::views::transform(f) | std::ranges::to(); #else auto steps = std::vector(); steps.reserve(responses.size()); - std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step); + std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f); #endif return std::make_unique>(steps); @@ -197,12 +219,14 @@ namespace huggingface::tgi::backends::trtllm { std::unique_ptr create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { + const auto created_time = std::chrono::steady_clock::now(); std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); return std::make_unique( std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), - std::filesystem::path::format::auto_format) + std::filesystem::path::format::auto_format), + created_time ); } } diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index b2a9274d..3a245151 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -42,10 +42,14 @@ mod ffi { request_id: u64, token_id: u32, log_prob: f32, + + /// The time of first schedule since the creation of the backend + first_scheduled_time_ns: i64, is_final: bool, finish_reason: FinishReason, token_id_valid: bool, log_prob_valid: bool, + first_scheduled_time_ns_valid: bool, has_error: bool, error_msg: String, } diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index a4b70ea9..34fcf34e 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -8,7 +8,7 @@ use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::TryAcquireError; use tokio::task::spawn_blocking; -use tokio::time::Instant; +use tokio::time::{Duration, Instant}; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, warn}; @@ -82,6 +82,7 @@ fn executor_status_looper( tokenizer: Tokenizer, mut backend: UniquePtr, mut backlog: UnboundedReceiver, + created_time: Instant, ) { // Track the tuple (request_id, stream) for each request let mut in_flights = @@ -144,12 +145,22 @@ fn executor_status_looper( for step in responses.deref() { if let Some(ctx) = in_flights.get_mut(&step.request_id) { // Update the starting timestamp if not set - // This value might not be the actual real starting time of the request - // on the executor side - Need to expose more info from the executor to - // retrieve this value - // TODO : Expose actual real starting time for a request on FFI layer if ctx.start.is_none() { - ctx.start = Some(Instant::now()); + if step.first_scheduled_time_ns_valid { + if step.first_scheduled_time_ns >= 0 { + ctx.start = created_time.checked_add(Duration::from_nanos( + step.first_scheduled_time_ns as u64, + )); + } else { + ctx.start = created_time.checked_sub(Duration::from_nanos( + -step.first_scheduled_time_ns as u64, + )); + } + } + + if ctx.start.is_none() { + ctx.start = Some(Instant::now()); + } } // Try to map the generation step to a DecodedToken @@ -348,13 +359,23 @@ impl TensorRtLlmBackendV2 { // Allocate the IPC layer to communicate with the backend let (executor_sender, executor_receiver) = unbounded_channel(); + // This is a reference point to convert time from c++ time_point + // to rust Instant. + let created_time = Instant::now(); + // Create the FFI backend let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path) .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?; // Executor looper is responsible for scheduling and pulling requests state at regular interval spawn_blocking(move || { - executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver) + executor_status_looper( + max_inflight_requests, + tokenizer, + backend, + executor_receiver, + created_time, + ) }); Ok(TensorRtLlmBackendV2(executor_sender))