From f7bd82a90ef02048b319c87e5bcc46e22f5260f5 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee <leejuyuu@gmail.com>
Date: Sun, 25 May 2025 17:40:45 +0800
Subject: [PATCH] feat(trtllm): get more accurate start time

Get a more accurate inference start time from the trtllm response.
Because `Instant` does not expose absolute value, create reference
points on both sides and return duration relative to the reference
point instead.
---
 backends/trtllm/csrc/backend.cpp |  9 +++++++-
 backends/trtllm/csrc/ffi.hpp     | 36 ++++++++++++++++++++++++++------
 backends/trtllm/src/lib.rs       |  4 ++++
 backends/trtllm/src/looper.rs    | 35 ++++++++++++++++++++++++-------
 4 files changed, 70 insertions(+), 14 deletions(-)
diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp
index 2151466b..4a131e31 100644
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@@ -59,7 +59,14 @@ namespace huggingface::tgi::backends::trtllm {
                 static_cast<tle::SizeType32>(g_params.max_new_tokens),
                 true,
                 (tle::SamplingConfig) s_params,
-                tle::OutputConfig{ /* returnLogProbs= */ true},
+                tle::OutputConfig{ 
+                    /* returnLogProbs= */ true,
+                    false,
+                    false,
+                    false,
+                    false,
+                    /* returnPerfMetrics=*/ true,
+                },
                 std::nullopt,
                 std::nullopt,
                 std::nullopt,
diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index d31743ee..624259cf 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,6 +1,7 @@
 #ifndef TGI_BACKEND_TRTLLM_FFI
 #define TGI_BACKEND_TRTLLM_FFI
 
+#include <chrono>
 #include <exception>
 #include <memory>
 #include <thread>
@@ -52,7 +53,7 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }
 
-    static auto as_generation_step = [](const tle::Response &r) {
+    static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point<std::chrono::steady_clock> created) {
         const auto reqId = r.getRequestId();
         if (!r.hasError()) [[likely]] {
             const auto result = r.getResult();
@@ -66,14 +67,23 @@ namespace huggingface::tgi::backends::trtllm {
                 log_prob = result.logProbs.value()[0].back();
             }
 
+            std::optional<int64_t> first_scheduled_time_ns = std::nullopt;
+            if (result.requestPerfMetrics) {
+                const auto &t = result.requestPerfMetrics->timingMetrics;
+                const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t.firstScheduledTime - created).count();
+                first_scheduled_time_ns = static_cast<int64_t>(ns);
+            }
+
             return generation_step_t{
                     reqId,
                     token_id.value_or(0),
                     log_prob.value_or(0.0),
+                    first_scheduled_time_ns.value_or(0),
                     result.isFinal,
                     as_finish_reason_t(result.finishReasons[0]),
                     token_id.has_value(),
                     log_prob.has_value(),
+                    first_scheduled_time_ns.has_value(),
                     false,
                     std::string()
             };
@@ -82,10 +92,12 @@ namespace huggingface::tgi::backends::trtllm {
                     reqId,
                     0,
                     0.0,
+                    0,
                     true,
                     finish_reason_t::kNOT_FINISHED,
                     false,
                     false,
+                    false,
                     true,
                     std::move(r.getErrorMsg())
             };
@@ -97,9 +109,16 @@ namespace huggingface::tgi::backends::trtllm {
     private:
         backend_t inner_;
 
+        // m_created_time is a reference point to convert time from c++ time_point
+        // to rust Instant.
+        std::chrono::time_point<std::chrono::steady_clock> m_created_time;
+
+
     public:
-        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
-                : inner_(engine_folder, executor_worker_path) {}
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time)
+                : inner_(engine_folder, executor_worker_path),
+                  m_created_time {created_time}
+        {}
 
         size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
 
@@ -139,13 +158,16 @@ namespace huggingface::tgi::backends::trtllm {
 
                 SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
 
+                auto f = [this](const tle::Response &r){
+                    return as_generation_step(r, m_created_time);
+                };
                 // Transform tle::Response to generation_step_t
 #ifdef __cpp_lib_ranges_to_container
-                auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
+                auto steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
 #else
                 auto steps = std::vector<generation_step_t>();
                 steps.reserve(responses.size());
-                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
+                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
 #endif
                 return std::make_unique<std::vector<generation_step_t>>(steps);
 
@@ -197,12 +219,14 @@ namespace huggingface::tgi::backends::trtllm {
 
     std::unique_ptr<tensorrt_llm_backend_t>
     create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
+        const auto created_time = std::chrono::steady_clock::now();
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
                 std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
                                       std::filesystem::path::format::auto_format),
                 std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
-                                      std::filesystem::path::format::auto_format)
+                                      std::filesystem::path::format::auto_format),
+                created_time
         );
     }
 }
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b2a9274d..3a245151 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -42,10 +42,14 @@ mod ffi {
         request_id: u64,
         token_id: u32,
         log_prob: f32,
+
+        /// The time of first schedule since the creation of the backend
+        first_scheduled_time_ns: i64,
         is_final: bool,
         finish_reason: FinishReason,
         token_id_valid: bool,
         log_prob_valid: bool,
+        first_scheduled_time_ns_valid: bool,
         has_error: bool,
         error_msg: String,
     }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index a4b70ea9..34fcf34e 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -8,7 +8,7 @@ use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
 use tokio::sync::TryAcquireError;
 use tokio::task::spawn_blocking;
-use tokio::time::Instant;
+use tokio::time::{Duration, Instant};
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, warn};
 
@@ -82,6 +82,7 @@ fn executor_status_looper(
     tokenizer: Tokenizer,
     mut backend: UniquePtr<TensorRtLlmBackendImpl>,
     mut backlog: UnboundedReceiver<GenerationContext>,
+    created_time: Instant,
 ) {
     // Track the tuple (request_id, stream) for each request
     let mut in_flights =
@@ -144,12 +145,22 @@ fn executor_status_looper(
                     for step in responses.deref() {
                         if let Some(ctx) = in_flights.get_mut(&step.request_id) {
                             // Update the starting timestamp if not set
-                            // This value might not be the actual real starting time of the request
-                            // on the executor side - Need to expose more info from the executor to
-                            // retrieve this value
-                            // TODO : Expose actual real starting time for a request on FFI layer
                             if ctx.start.is_none() {
-                                ctx.start = Some(Instant::now());
+                                if step.first_scheduled_time_ns_valid {
+                                    if step.first_scheduled_time_ns >= 0 {
+                                        ctx.start = created_time.checked_add(Duration::from_nanos(
+                                            step.first_scheduled_time_ns as u64,
+                                        ));
+                                    } else {
+                                        ctx.start = created_time.checked_sub(Duration::from_nanos(
+                                            -step.first_scheduled_time_ns as u64,
+                                        ));
+                                    }
+                                }
+
+                                if ctx.start.is_none() {
+                                    ctx.start = Some(Instant::now());
+                                }
                             }
 
                             // Try to map the generation step to a DecodedToken
@@ -348,13 +359,23 @@ impl TensorRtLlmBackendV2 {
         // Allocate the IPC layer to communicate with the backend
         let (executor_sender, executor_receiver) = unbounded_channel();
 
+        // This is a reference point to convert time from c++ time_point
+        // to rust Instant.
+        let created_time = Instant::now();
+
         // Create the FFI backend
         let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
             .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval
         spawn_blocking(move || {
-            executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
+            executor_status_looper(
+                max_inflight_requests,
+                tokenizer,
+                backend,
+                executor_receiver,
+                created_time,
+            )
         });
 
         Ok(TensorRtLlmBackendV2(executor_sender))