From c458d21d077d28bff36565204732405a93d3661d Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 11 May 2025 03:27:59 +0800
Subject: [PATCH 1/9] feat(trtllm): add new finish reasons

Add the new finish reasons introduced in TensorRT-LLM v0.16.0.
---
 backends/trtllm/csrc/ffi.hpp | 4 ++++
 backends/trtllm/src/lib.rs   | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index 840614bb..a877df5a 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -42,6 +42,10 @@ namespace huggingface::tgi::backends::trtllm {
                 return finish_reason_t::kEND_ID;
             case tle::FinishReason::kLENGTH:
                 return finish_reason_t::kLENGTH;
+            case tle::FinishReason::kTIMED_OUT:
+                return finish_reason_t::kTIMED_OUT;
+            case tle::FinishReason::kCANCELLED:
+                return finish_reason_t::kCANCELLED;
             default:
                 std::unreachable();
         }
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index 08507256..52e48f91 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -24,6 +24,14 @@ mod ffi {
         /// The request finished because the maximum number of tokens was reached.
         #[cxx_name = "kLENGTH"]
         MaxLength = 3u8,
+
+        #[cxx_name = "kTIMED_OUT"]
+        /// The request finished because it timed out (via the mAllotedTime parameter).
+        TimedOut = 4u8,
+
+        #[cxx_name = "kCANCELLED"]
+        /// The request was cancelled by calling cancelRequest.
+        Cancelled = 5u8,
     }
 
     /// Struct used as shared type between rust and C++ to represent the result

From cc4b5848b98975093d3ff4d937df381345cf0ea3 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Tue, 13 May 2025 00:05:56 +0800
Subject: [PATCH 2/9] fix: fix prometheus_port CLI short arg conflict

The short arg (`-p`) of `prometheus_port` conflicts with the one
derived for `port`. Remove the short arg variant.

Fixes https://github.com/huggingface/text-generation-inference/issues/3205
---
 backends/llamacpp/src/main.rs     | 2 +-
 backends/trtllm/src/main.rs       | 2 +-
 backends/v2/src/main.rs           | 2 +-
 backends/v3/src/main.rs           | 2 +-
 docs/source/reference/launcher.md | 2 +-
 launcher/src/main.rs              | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index 9ee61ce6..c193c868 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -119,7 +119,7 @@ struct Args {
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
 
-    #[clap(default_value = "9000", long, short, env)]
+    #[clap(default_value = "9000", long, env)]
     prometheus_port: u16,
 
     /// Enable JSON output format.
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 543f8e6e..81fca0e7 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -37,7 +37,7 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
-    #[clap(default_value = "9000", long, short, env)]
+    #[clap(default_value = "9000", long, env)]
     prometheus_port: u16,
     #[clap(long, env, required = true)]
     tokenizer_name: String,
diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs
index 60b5d52b..a0f3558c 100644
--- a/backends/v2/src/main.rs
+++ b/backends/v2/src/main.rs
@@ -36,7 +36,7 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
-    #[clap(default_value = "9000", long, short, env)]
+    #[clap(default_value = "9000", long, env)]
     prometheus_port: u16,
     #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
     master_shard_uds_path: String,
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 44e63853..75a20691 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -36,7 +36,7 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
-    #[clap(default_value = "9000", long, short, env)]
+    #[clap(default_value = "9000", long, env)]
     prometheus_port: u16,
     #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
     master_shard_uds_path: String,
diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md
index 51bd461f..c7ec90d9 100644
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -254,7 +254,7 @@ Options:
 ```
 ## PROMETHEUS_PORT
 ```shell
--  -p, --prometheus-port <PROMETHEUS_PORT>
+      --prometheus-port <PROMETHEUS_PORT>
           The Prometheus port to listen on
 
           [env: PROMETHEUS_PORT=]
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index ee80eb00..dcb0e16d 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -774,7 +774,7 @@ struct Args {
     port: u16,
 
     /// The Prometheus port to listen on.
-    #[clap(default_value = "9000", long, short, env)]
+    #[clap(default_value = "9000", long, env)]
     prometheus_port: u16,
 
     /// The name of the socket for gRPC communication between the webserver

From 0858af206f61527be04ce38c7680a456f2885b71 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 18 May 2025 02:22:53 +0800
Subject: [PATCH 3/9] fix(trtllm): fix segfault when canceling request

When a request is cancelled, the `tensorrt_llm::executor::Result`
contains `outputTokenIds` with size 1, but `outputTokenIds[0]` has
size 0. This causes `as_generation_step` to segfault.

Check the sizes of `outputTokenIds` and `logProbs` before attempting to
access the inner vector. The check on `finishReasons` can be skipped
because it has only one dimension and the minimum beam size is 1.
Because cxx has not added Option support yet, include two boolean flags
that denote whether each value is valid.

Change the log level to debug when a request is cancelled.
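
For illustration, a minimal sketch of the flag pattern this enables on
the Rust side (hypothetical standalone names; the real fields live on
the cxx-shared GenerationStep struct in backends/trtllm/src/lib.rs):

    // cxx cannot pass Option<T> across the bridge yet, so an optional
    // value travels as a (value, valid) pair and is re-wrapped in Rust.
    struct Step {
        token_id: u32,
        token_id_valid: bool,
    }

    fn token_id(step: &Step) -> Option<u32> {
        // The C++ side fills 0 as a placeholder when the flag is unset.
        step.token_id_valid.then_some(step.token_id)
    }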
---
 backends/trtllm/csrc/ffi.hpp  | 19 +++++++++++++---
 backends/trtllm/src/lib.rs    |  2 ++
 backends/trtllm/src/looper.rs | 41 ++++++++++++++++++++++++++---------
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index a877df5a..90d1b9d1 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -55,13 +55,24 @@ namespace huggingface::tgi::backends::trtllm {
         const auto reqId = r.getRequestId();
         if (!r.hasError()) [[likely]] {
             const auto result = r.getResult();
-            const auto logits = result.logProbs.value()[0];
+            std::optional<uint32_t> token_id = std::nullopt;
+            if (!result.outputTokenIds.empty() && !result.outputTokenIds[0].empty()) {
+                token_id = static_cast<uint32_t>(result.outputTokenIds[0][0]);
+            }
+
+            std::optional<float> log_prob = std::nullopt;
+            if (result.logProbs && !result.logProbs->empty() && !result.logProbs.value()[0].empty()) {
+                log_prob = result.logProbs.value()[0].back();
+            }
+
             return generation_step_t{
                     reqId,
-                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
-                    logits.back(),
+                    token_id.value_or(0),
+                    log_prob.value_or(0.0),
                     result.isFinal,
                     as_finish_reason_t(result.finishReasons[0]),
+                    token_id.has_value(),
+                    log_prob.has_value(),
                     false,
                     std::string()
             };
@@ -72,6 +83,8 @@ namespace huggingface::tgi::backends::trtllm {
                     0.0,
                     true,
                     finish_reason_t::kNOT_FINISHED,
+                    false,
+                    false,
                     true,
                     std::move(r.getErrorMsg())
             };
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index 52e48f91..b2a9274d 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -44,6 +44,8 @@ mod ffi {
         log_prob: f32,
         is_final: bool,
         finish_reason: FinishReason,
+        token_id_valid: bool,
+        log_prob_valid: bool,
         has_error: bool,
         error_msg: String,
     }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index 5fed954f..fd0bc967 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -49,16 +49,28 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
     type Error = InferError;
 
     fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
-        if !step.has_error {
-            Ok(Self {
-                id: step.token_id,
-                log_prob: step.log_prob,
-                is_final: step.is_final,
-                finish_reason: step.finish_reason,
-            })
-        } else {
-            Err(GenerationError(step.error_msg.clone()))
+        if step.has_error {
+            return Err(GenerationError(step.error_msg.clone()));
         }
+
+        if !step.token_id_valid {
+            return Err(GenerationError(
+                "GenerationStep contains no token_id".to_string(),
+            ));
+        }
+
+        if !step.log_prob_valid {
+            return Err(GenerationError(
+                "GenerationStep contains no log_prob".to_string(),
+            ));
+        }
+
+        Ok(Self {
+            id: step.token_id,
+            log_prob: step.log_prob,
+            is_final: step.is_final,
+            finish_reason: step.finish_reason,
+        })
     }
 }
@@ -151,7 +163,16 @@ fn executor_status_looper(
                     let _ = in_flights.remove(&step.request_id);
                 }
             } else {
-                warn!("Untracked request {}", step.request_id,);
+                match step.finish_reason {
+                    FinishReason::Cancelled => {
+                        // The client has canceled the request, so this should not generate a
+                        // warning.
+                        debug!("Cancelled request {}", step.request_id);
+                    }
+                    _ => {
+                        warn!("Untracked request {}", step.request_id);
+                    }
+                }
             }
         }
     }

From 27d03309c98dd9b4079a6c8a5addd56b9b6e345b Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 18 May 2025 02:37:19 +0800
Subject: [PATCH 4/9] feat(trtllm): add stop sequence support

Support per-request stop sequences.
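
As a rough sketch of the buffering idea, assuming a standalone helper
(the real logic below is inlined in post_process_decoded_token, bounds
the window by the buffer capacity, and trims on UTF-8 char boundaries):

    /// Append `text` to a bounded scan window and report whether any
    /// stop sequence now occurs in it. A match that was not already
    /// present must end inside `text`, so it spans at most
    /// stop.len() - 1 older bytes; keeping max(stop.len()) bytes of
    /// history plus `text` is therefore enough.
    fn push_and_check(buf: &mut String, text: &str, stops: &[String]) -> bool {
        buf.push_str(text);
        let keep = stops.iter().map(|s| s.len()).max().unwrap_or(0) + text.len();
        if buf.len() > keep {
            let mut cut = buf.len() - keep;
            while !buf.is_char_boundary(cut) {
                cut += 1; // never split a multi-byte character
            }
            buf.drain(..cut);
        }
        stops.iter().any(|s| buf.contains(s.as_str()))
    }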
---
 backends/trtllm/src/looper.rs | 41 ++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index fd0bc967..6d7f30c3 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -35,6 +35,9 @@ struct GenerationContext {
     tokens: Vec<u32>,
     start: Option<Instant>,
     queued: Instant,
+
+    /// output_buffer stores the output for detecting stop sequences
+    output_buffer: Option<String>,
 }
 
 #[derive(Debug, Copy, Clone)]
@@ -191,11 +194,39 @@ fn post_process_decoded_token(
     tokenizer: &Tokenizer,
     ctx: &mut GenerationContext,
-    decoded_token: DecodedToken,
+    mut decoded_token: DecodedToken,
 ) -> InferResult<Token> {
     match tokenizer.decode(&[decoded_token.id], false) {
         Ok(text) => {
             let is_special = tokenizer.get_added_vocabulary().is_special_token(&text);
+
+            if let Some(buf) = ctx.output_buffer.as_mut() {
+                if buf.len() + text.len() > buf.capacity() {
+                    let mut start = buf.len() + text.len() - buf.capacity();
+                    while start <= buf.len() && !buf.is_char_boundary(start) {
+                        start += 1;
+                    }
+                    buf.drain(..start);
+                }
+                buf.push_str(&text);
+
+                for stop_seq in &ctx.request.stopping_parameters.stop_sequences {
+                    let start = if 1 + buf.len() > text.len() + stop_seq.len() {
+                        let mut start = 1 + buf.len() - text.len() - stop_seq.len();
+                        while start > 0 && !buf.is_char_boundary(start) {
+                            start -= 1;
+                        }
+                        start
+                    } else {
+                        0
+                    };
+                    if buf[start..].contains(stop_seq) {
+                        decoded_token.is_final = true;
+                        decoded_token.finish_reason = FinishReason::StopWords;
+                    }
+                }
+            }
+
             let token = Token {
                 id: decoded_token.id,
                 text,
@@ -344,12 +375,20 @@ impl Backend for TensorRtLlmBackendV2 {
         // Send the context to the executor for scheduling
         let queued = Instant::now();
+        let output_buffer = request
+            .stopping_parameters
+            .stop_sequences
+            .iter()
+            .map(|x| x.len())
+            .max()
+            .map(|m| String::with_capacity(m + 32)); // TODO: is this number enough?
         match self.0.send(GenerationContext {
             request,
             streamer,
             tokens: Vec::with_capacity(256),
            start: None,
             queued,
+            output_buffer,
         }) {
             Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
             Err(_) => Err(GenerationError(

From 987337bf318597865d9a3a0f305eac0f409478a9 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 18 May 2025 02:49:35 +0800
Subject: [PATCH 5/9] feat(trtllm): catch broader exception

The trycatch only uses the `what()` method, which means we can catch
the broader `std::exception` instead. This is beneficial because
nlohmann/json also throws exceptions.
---
 backends/trtllm/csrc/ffi.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index 90d1b9d1..d31743ee 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,6 +1,7 @@
 #ifndef TGI_BACKEND_TRTLLM_FFI
 #define TGI_BACKEND_TRTLLM_FFI
 
+#include <exception>
 #include
 #include
@@ -8,7 +9,7 @@ namespace rust::behavior {
     template <typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
         func();
-    } catch (tensorrt_llm::common::TllmException &e) {
+    } catch (const std::exception &e) {
         fail(e.what());
     }
 }

From 56dd0a09e6229bd4ce73f1f450886f8cde6af0d4 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 18 May 2025 03:25:13 +0800
Subject: [PATCH 6/9] feat(trtllm): check existence of config files

When the required config files are not present, nlohmann/json throws a
parsing error, which does not help much in identifying what went wrong.

Check the existence of these files early and return specific error
messages.
---
 backends/trtllm/src/errors.rs |  4 ++++
 backends/trtllm/src/looper.rs | 22 +++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs
index 812fd6e3..3e6bd743 100644
--- a/backends/trtllm/src/errors.rs
+++ b/backends/trtllm/src/errors.rs
@@ -19,4 +19,8 @@ pub enum TensorRtLlmBackendError {
     WebServer(#[from] server::WebServerError),
     #[error("Tokio runtime failed to start: {0}")]
     Tokio(#[from] std::io::Error),
+    #[error("config.json doesn't exist in engine folder {0}")]
+    ConfigNotFound(PathBuf),
+    #[error("generation_config.json doesn't exist in engine folder {0}")]
+    GenerationConfigNotFound(PathBuf),
 }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index 6d7f30c3..17030b21 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -3,7 +3,7 @@ use cxx::UniquePtr;
 use hashbrown::HashMap;
 use std::hint;
 use std::ops::Deref;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
 use tokio::sync::TryAcquireError;
@@ -283,6 +283,26 @@ fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
         return Err(err);
     }
 
+    let mut config_path = PathBuf::from(engine_folder);
+    config_path.push("config.json");
+
+    if !config_path.exists() {
+        let err = TensorRtLlmBackendError::ConfigNotFound(engine_folder.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
+    let mut generation_config_path = PathBuf::from(engine_folder);
+    generation_config_path.push("generation_config.json");
+
+    if !generation_config_path.exists() {
+        let err = TensorRtLlmBackendError::GenerationConfigNotFound(engine_folder.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
     // Ensure executor worker binary exists
     if !executor_worker_path.exists() {
         let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf());

From 41819d70f7b8306d20f4128e33d3408df48ee728 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 18 May 2025 18:22:02 +0800
Subject: [PATCH 7/9] fix(trtllm): fix do_sample being ignored

Currently, the do_sample option is ignored and the executor will always
sample. Set top_k to 1 if do_sample is false.
---
 backends/trtllm/src/looper.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index 17030b21..a4b70ea9 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -98,12 +98,17 @@ fn executor_status_looper(
             let generation_params = &request.parameters;
             let stopping_params = &request.stopping_parameters;
             let input_ids = request.input_ids.as_deref();
+            let top_k = if generation_params.do_sample {
+                generation_params.top_k
+            } else {
+                1
+            };
 
             // Submit to the TensorRT-LLM executor for scheduling
             match backend.pin_mut().submit(
                 &input_ids.unwrap(), // This is checked beforehand in validate()
                 stopping_params.max_new_tokens,
-                generation_params.top_k,
+                top_k,
                 generation_params.top_p,
                 generation_params.temperature,
                 generation_params.repetition_penalty,

From f7bd82a90ef02048b319c87e5bcc46e22f5260f5 Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 25 May 2025 17:40:45 +0800
Subject: [PATCH 8/9] feat(trtllm): get more accurate start time

Get a more accurate inference start time from the trtllm response.

Because `Instant` does not expose an absolute value, create reference
points on both sides and return the duration relative to the reference
point instead.
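
As a standalone sketch of the rebase trick (hypothetical helper name;
the patch below threads created_time through the FFI constructor and
applies checked_add/checked_sub in executor_status_looper):

    use std::time::{Duration, Instant};

    // Both sides record "now" when the backend is created. C++ later
    // reports firstScheduledTime as signed nanoseconds relative to its
    // anchor; Rust re-bases that offset onto its own anchor.
    fn rebase(created: Instant, offset_ns: i64) -> Option<Instant> {
        if offset_ns >= 0 {
            created.checked_add(Duration::from_nanos(offset_ns as u64))
        } else {
            created.checked_sub(Duration::from_nanos(offset_ns.unsigned_abs()))
        }
    }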
Because `Instant` does not expose absolute value, create reference points on both sides and return duration relative to the reference point instead. --- backends/trtllm/csrc/backend.cpp | 9 +++++++- backends/trtllm/csrc/ffi.hpp | 36 ++++++++++++++++++++++++++------ backends/trtllm/src/lib.rs | 4 ++++ backends/trtllm/src/looper.rs | 35 ++++++++++++++++++++++++------- 4 files changed, 70 insertions(+), 14 deletions(-) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index 2151466b..4a131e31 100644 --- a/backends/trtllm/csrc/backend.cpp +++ b/backends/trtllm/csrc/backend.cpp @@ -59,7 +59,14 @@ namespace huggingface::tgi::backends::trtllm { static_cast(g_params.max_new_tokens), true, (tle::SamplingConfig) s_params, - tle::OutputConfig{ /* returnLogProbs= */ true}, + tle::OutputConfig{ + /* returnLogProbs= */ true, + false, + false, + false, + false, + /* returnPerfMetrics=*/ true, + }, std::nullopt, std::nullopt, std::nullopt, diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index d31743ee..624259cf 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -1,6 +1,7 @@ #ifndef TGI_BACKEND_TRTLLM_FFI #define TGI_BACKEND_TRTLLM_FFI +#include #include #include #include @@ -52,7 +53,7 @@ namespace huggingface::tgi::backends::trtllm { } } - static auto as_generation_step = [](const tle::Response &r) { + static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point created) { const auto reqId = r.getRequestId(); if (!r.hasError()) [[likely]] { const auto result = r.getResult(); @@ -66,14 +67,23 @@ namespace huggingface::tgi::backends::trtllm { log_prob = result.logProbs.value()[0].back(); } + std::optional first_scheduled_time_ns = std::nullopt; + if (result.requestPerfMetrics) { + const auto &t = result.requestPerfMetrics->timingMetrics; + const auto ns = std::chrono::duration_cast(t.firstScheduledTime - created).count(); + first_scheduled_time_ns = static_cast(ns); + } + return generation_step_t{ reqId, token_id.value_or(0), log_prob.value_or(0.0), + first_scheduled_time_ns.value_or(0), result.isFinal, as_finish_reason_t(result.finishReasons[0]), token_id.has_value(), log_prob.has_value(), + first_scheduled_time_ns.has_value(), false, std::string() }; @@ -82,10 +92,12 @@ namespace huggingface::tgi::backends::trtllm { reqId, 0, 0.0, + 0, true, finish_reason_t::kNOT_FINISHED, false, false, + false, true, std::move(r.getErrorMsg()) }; @@ -97,9 +109,16 @@ namespace huggingface::tgi::backends::trtllm { private: backend_t inner_; + // m_created_time is a reference point to convert time from c++ time_point + // to rust Instant. 
+        std::chrono::time_point<std::chrono::steady_clock> m_created_time;
+
+
     public:
-        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
-            : inner_(engine_folder, executor_worker_path) {}
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time)
+            : inner_(engine_folder, executor_worker_path),
+              m_created_time {created_time}
+        {}
 
         size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
@@ -139,13 +158,16 @@ namespace huggingface::tgi::backends::trtllm {
                 SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
 
+                auto f = [this](const tle::Response &r){
+                    return as_generation_step(r, m_created_time);
+                };
                 // Transform tle::Response to generation_step_t
 #ifdef __cpp_lib_ranges_to_container
-                auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
+                auto steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
 #else
                 auto steps = std::vector<generation_step_t>();
                 steps.reserve(responses.size());
-                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
+                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
 #endif
                 return std::make_unique<std::vector<generation_step_t>>(steps);
@@ -197,12 +219,14 @@ namespace huggingface::tgi::backends::trtllm {
     std::unique_ptr<tensorrt_llm_backend_t>
     create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
+        const auto created_time = std::chrono::steady_clock::now();
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
                 std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
                                       std::filesystem::path::format::auto_format),
                 std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
-                                      std::filesystem::path::format::auto_format)
+                                      std::filesystem::path::format::auto_format),
+                created_time
         );
     }
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b2a9274d..3a245151 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -42,10 +42,14 @@ mod ffi {
         request_id: u64,
         token_id: u32,
         log_prob: f32,
+
+        /// The time of first schedule since the creation of the backend
+        first_scheduled_time_ns: i64,
         is_final: bool,
         finish_reason: FinishReason,
         token_id_valid: bool,
         log_prob_valid: bool,
+        first_scheduled_time_ns_valid: bool,
         has_error: bool,
         error_msg: String,
     }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index a4b70ea9..34fcf34e 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -8,7 +8,7 @@ use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
 use tokio::sync::TryAcquireError;
 use tokio::task::spawn_blocking;
-use tokio::time::Instant;
+use tokio::time::{Duration, Instant};
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, error, warn};
@@ -82,6 +82,7 @@ fn executor_status_looper(
     tokenizer: Tokenizer,
     mut backend: UniquePtr<TensorRtLlmBackendImpl>,
     mut backlog: UnboundedReceiver<GenerationContext>,
+    created_time: Instant,
 ) {
     // Track the tuple (request_id, stream) for each request
     let mut in_flights =
@@ -144,12 +145,22 @@ fn executor_status_looper(
             for step in responses.deref() {
                 if let Some(ctx) = in_flights.get_mut(&step.request_id) {
                     // Update the starting timestamp if not set
-                    // This value might not be the actual real starting time of the request
-                    // on the executor side - Need to expose more info from the executor to
-                    // retrieve this value
-                    // TODO : Expose actual real starting time for a request on FFI layer
                     if ctx.start.is_none() {
-                        ctx.start = Some(Instant::now());
+                        if step.first_scheduled_time_ns_valid {
+                            if step.first_scheduled_time_ns >= 0 {
+                                ctx.start = created_time.checked_add(Duration::from_nanos(
+                                    step.first_scheduled_time_ns as u64,
+                                ));
+                            } else {
+                                ctx.start = created_time.checked_sub(Duration::from_nanos(
+                                    -step.first_scheduled_time_ns as u64,
+                                ));
+                            }
+                        }
+
+                        if ctx.start.is_none() {
+                            ctx.start = Some(Instant::now());
+                        }
                     }
 
                     // Try to map the generation step to a DecodedToken
@@ -348,13 +359,23 @@ impl TensorRtLlmBackendV2 {
         // Allocate the IPC layer to communicate with the backend
         let (executor_sender, executor_receiver) = unbounded_channel();
 
+        // This is a reference point to convert time from c++ time_point
+        // to rust Instant.
+        let created_time = Instant::now();
+
         // Create the FFI backend
         let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval
         spawn_blocking(move || {
-            executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
+            executor_status_looper(
+                max_inflight_requests,
+                tokenizer,
+                backend,
+                executor_receiver,
+                created_time,
+            )
         });
 
         Ok(TensorRtLlmBackendV2(executor_sender))

From fab395b41fbe9b9929869cbfc8c84816997738fb Mon Sep 17 00:00:00 2001
From: Tzu-Yu Lee
Date: Sun, 25 May 2025 22:07:54 +0800
Subject: [PATCH 9/9] perf(trtllm): reduce futile loop iterations

The executor_status_looper runs a spin loop even if there are no active
requests, which makes the service constantly waste a CPU core. Make the
loop block on receiving requests when none are running, to reduce CPU
usage when idle.
---
 backends/trtllm/src/looper.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
index 34fcf34e..267f9fa9 100644
--- a/backends/trtllm/src/looper.rs
+++ b/backends/trtllm/src/looper.rs
@@ -90,7 +90,12 @@ fn executor_status_looper(
     'scheduler: loop {
         // Is there any request pending to be scheduled?
-        let awaiting_requests = backlog.len();
+        let mut awaiting_requests = backlog.len();
+        if awaiting_requests == 0 && in_flights.is_empty() {
+            // Wait for 1 request if we are not waiting for any response,
+            // so that the loop blocks at receive from backlog.
+            awaiting_requests += 1;
+        }
         for _ in 0..awaiting_requests {
             // Retrieve all the requests
             if let Some(ctx) = backlog.blocking_recv() {
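
For illustration, the idle-blocking shape of the loop as a
self-contained sketch (hypothetical skeleton; u64 ids stand in for
GenerationContext, and the executor polling is elided):

    use tokio::sync::mpsc::UnboundedReceiver;

    // When nothing is pending and nothing is in flight, park the OS
    // thread on blocking_recv() instead of spinning; the channel wakes
    // the looper when the next request arrives.
    fn looper_skeleton(mut backlog: UnboundedReceiver<u64>) {
        let mut in_flight: Vec<u64> = Vec::new();
        loop {
            let mut pending = backlog.len();
            if pending == 0 && in_flight.is_empty() {
                pending = 1; // nothing to poll: force one blocking receive
            }
            for _ in 0..pending {
                match backlog.blocking_recv() {
                    Some(id) => in_flight.push(id),
                    None => return, // all senders dropped: shut down
                }
            }
            // ... submit new requests, poll the executor, and retire
            // finished ids from in_flight here ...
            in_flight.clear();
        }
    }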