mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-07-27 10:20:17 +00:00
feat(trtllm): get more accurate start time
Get a more accurate inference start time from the trtllm response. Because `Instant` does not expose an absolute value, create reference points on both sides (C++ and Rust) and return the duration relative to the reference point instead.
This commit is contained in:
parent
41819d70f7
commit
f7bd82a90e
@ -59,7 +59,14 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
static_cast<tle::SizeType32>(g_params.max_new_tokens),
|
||||
true,
|
||||
(tle::SamplingConfig) s_params,
|
||||
tle::OutputConfig{ /* returnLogProbs= */ true},
|
||||
tle::OutputConfig{
|
||||
/* returnLogProbs= */ true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
/* returnPerfMetrics=*/ true,
|
||||
},
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
|
@ -1,6 +1,7 @@
|
||||
#ifndef TGI_BACKEND_TRTLLM_FFI
|
||||
#define TGI_BACKEND_TRTLLM_FFI
|
||||
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
@ -52,7 +53,7 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
}
|
||||
}
|
||||
|
||||
static auto as_generation_step = [](const tle::Response &r) {
|
||||
static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point<std::chrono::steady_clock> created) {
|
||||
const auto reqId = r.getRequestId();
|
||||
if (!r.hasError()) [[likely]] {
|
||||
const auto result = r.getResult();
|
||||
@ -66,14 +67,23 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
log_prob = result.logProbs.value()[0].back();
|
||||
}
|
||||
|
||||
std::optional<int64_t> first_scheduled_time_ns = std::nullopt;
|
||||
if (result.requestPerfMetrics) {
|
||||
const auto &t = result.requestPerfMetrics->timingMetrics;
|
||||
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t.firstScheduledTime - created).count();
|
||||
first_scheduled_time_ns = static_cast<int64_t>(ns);
|
||||
}
|
||||
|
||||
return generation_step_t{
|
||||
reqId,
|
||||
token_id.value_or(0),
|
||||
log_prob.value_or(0.0),
|
||||
first_scheduled_time_ns.value_or(0),
|
||||
result.isFinal,
|
||||
as_finish_reason_t(result.finishReasons[0]),
|
||||
token_id.has_value(),
|
||||
log_prob.has_value(),
|
||||
first_scheduled_time_ns.has_value(),
|
||||
false,
|
||||
std::string()
|
||||
};
|
||||
@ -82,10 +92,12 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
reqId,
|
||||
0,
|
||||
0.0,
|
||||
0,
|
||||
true,
|
||||
finish_reason_t::kNOT_FINISHED,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
std::move(r.getErrorMsg())
|
||||
};
|
||||
@ -97,9 +109,16 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
private:
|
||||
backend_t inner_;
|
||||
|
||||
// m_created_time is a reference point to convert time from a C++ time_point
|
||||
// to a Rust Instant.
|
||||
std::chrono::time_point<std::chrono::steady_clock> m_created_time;
|
||||
|
||||
|
||||
public:
|
||||
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
|
||||
: inner_(engine_folder, executor_worker_path) {}
|
||||
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time)
|
||||
: inner_(engine_folder, executor_worker_path),
|
||||
m_created_time {created_time}
|
||||
{}
|
||||
|
||||
size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
|
||||
|
||||
@ -139,13 +158,16 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
|
||||
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
|
||||
|
||||
auto f = [this](const tle::Response &r){
|
||||
return as_generation_step(r, m_created_time);
|
||||
};
|
||||
// Transform tle::Response to generation_step_t
|
||||
#ifdef __cpp_lib_ranges_to_container
|
||||
auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
|
||||
auto steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
|
||||
#else
|
||||
auto steps = std::vector<generation_step_t>();
|
||||
steps.reserve(responses.size());
|
||||
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
|
||||
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
|
||||
#endif
|
||||
return std::make_unique<std::vector<generation_step_t>>(steps);
|
||||
|
||||
@ -197,12 +219,14 @@ namespace huggingface::tgi::backends::trtllm {
|
||||
|
||||
std::unique_ptr<tensorrt_llm_backend_t>
|
||||
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
|
||||
const auto created_time = std::chrono::steady_clock::now();
|
||||
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
|
||||
return std::make_unique<tensorrt_llm_backend_t>(
|
||||
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
|
||||
std::filesystem::path::format::auto_format),
|
||||
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
|
||||
std::filesystem::path::format::auto_format)
|
||||
std::filesystem::path::format::auto_format),
|
||||
created_time
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -42,10 +42,14 @@ mod ffi {
|
||||
request_id: u64,
|
||||
token_id: u32,
|
||||
log_prob: f32,
|
||||
|
||||
/// The time of first scheduling, in nanoseconds, since the creation of the backend
|
||||
first_scheduled_time_ns: i64,
|
||||
is_final: bool,
|
||||
finish_reason: FinishReason,
|
||||
token_id_valid: bool,
|
||||
log_prob_valid: bool,
|
||||
first_scheduled_time_ns_valid: bool,
|
||||
has_error: bool,
|
||||
error_msg: String,
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ use tokenizers::Tokenizer;
|
||||
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
|
||||
use tokio::sync::TryAcquireError;
|
||||
use tokio::task::spawn_blocking;
|
||||
use tokio::time::Instant;
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||
use tracing::{debug, error, warn};
|
||||
|
||||
@ -82,6 +82,7 @@ fn executor_status_looper(
|
||||
tokenizer: Tokenizer,
|
||||
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
|
||||
mut backlog: UnboundedReceiver<GenerationContext>,
|
||||
created_time: Instant,
|
||||
) {
|
||||
// Track the tuple (request_id, stream) for each request
|
||||
let mut in_flights =
|
||||
@ -144,12 +145,22 @@ fn executor_status_looper(
|
||||
for step in responses.deref() {
|
||||
if let Some(ctx) = in_flights.get_mut(&step.request_id) {
|
||||
// Update the starting timestamp if not set
|
||||
// This value might not be the actual real starting time of the request
|
||||
// on the executor side - Need to expose more info from the executor to
|
||||
// retrieve this value
|
||||
// TODO : Expose actual real starting time for a request on FFI layer
|
||||
if ctx.start.is_none() {
|
||||
ctx.start = Some(Instant::now());
|
||||
if step.first_scheduled_time_ns_valid {
|
||||
if step.first_scheduled_time_ns >= 0 {
|
||||
ctx.start = created_time.checked_add(Duration::from_nanos(
|
||||
step.first_scheduled_time_ns as u64,
|
||||
));
|
||||
} else {
|
||||
ctx.start = created_time.checked_sub(Duration::from_nanos(
|
||||
-step.first_scheduled_time_ns as u64,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if ctx.start.is_none() {
|
||||
ctx.start = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
// Try to map the generation step to a DecodedToken
|
||||
@ -348,13 +359,23 @@ impl TensorRtLlmBackendV2 {
|
||||
// Allocate the IPC layer to communicate with the backend
|
||||
let (executor_sender, executor_receiver) = unbounded_channel();
|
||||
|
||||
// This is a reference point to convert time from a C++ time_point
|
||||
// to a Rust Instant.
|
||||
let created_time = Instant::now();
|
||||
|
||||
// Create the FFI backend
|
||||
let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
|
||||
.map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
|
||||
|
||||
// Executor looper is responsible for scheduling and pulling requests state at regular interval
|
||||
spawn_blocking(move || {
|
||||
executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
|
||||
executor_status_looper(
|
||||
max_inflight_requests,
|
||||
tokenizer,
|
||||
backend,
|
||||
executor_receiver,
|
||||
created_time,
|
||||
)
|
||||
});
|
||||
|
||||
Ok(TensorRtLlmBackendV2(executor_sender))
|
||||
|
Loading…
Reference in New Issue
Block a user