From f5b9ee368af1ae37833ac7b02c6cb7bff5e7e33d Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 21 Oct 2024 17:03:35 +0200
Subject: [PATCH] Revert "chore(trtllm): remove unused method"

This reverts commit 31747163
---
 backends/trtllm/include/backend.h |  6 ++++++
 backends/trtllm/lib/backend.cpp   | 14 ++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 7e6b8ab9..5b2963a8 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -79,6 +79,12 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
+        /**
+         * Query the executor for the number of token available for pulling
+         * @return
+         */
+        [[nodiscard]] size_t NumResponsesReady() const;
+
         /**
          * Submit a new generation task to the executor
          * @param tokens
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 269b381f..72a75e2a 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -106,6 +106,17 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
     maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get();
 }
 
+[[nodiscard("Returned number of requests needs to be consumed")]]
+size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
+    const auto numResponses = executor.getNumResponsesReady();
+
+#ifndef NDEBUG
+    if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
+#endif
+
+    return numResponses;
+}
+
 [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
     const std::vector &tokens,
@@ -122,9 +133,8 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
     {
         const auto &iterations = executor.getLatestIterationStats();
         const auto &lastIteration = iterations.front();
+        SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
-
-        SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
         SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
     }
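
Note on what this revert restores: NumResponsesReady() simply forwards executor.getNumResponsesReady(), so a caller can check whether the executor has responses waiting to be pulled before blocking on them. Below is a minimal usage sketch, not part of the patch; the backend.h include path, the PollOnce name and the 1 ms back-off are assumptions for illustration only.

    // Illustrative sketch only (not part of this patch): polling the restored
    // NumResponsesReady() to avoid blocking on an idle executor.
    #include <chrono>
    #include <thread>

    #include "backend.h"  // assumed include path for the header touched above

    void PollOnce(huggingface::tgi::backends::TensorRtLlmBackend &backend) {
        if (backend.NumResponsesReady() == 0) {
            // Nothing to pull yet; back off briefly instead of blocking.
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
            return;
        }
        // At least one response is ready to be pulled from the executor;
        // the actual pulling call is backend-specific and omitted here.
    }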