// NOTE(review): this file had been collapsed onto a single line and every
// angle-bracket span (system include targets, template argument lists) was lost.
// The headers below are inferred from the names this translation unit uses, and
// the template arguments restored further down are marked individually.
// Confirm all of them against backend.hpp before merging.
#include <cstddef>
#include <expected>
#include <optional>
#include <span>
#include <vector>

#include <spdlog/spdlog.h>

#include "backend.hpp"

namespace huggingface::tgi::backends::trtllm {

    /**
     * Report how many responses the executor currently has ready.
     *
     * @return The value of `executor_.getNumResponsesReady()`.
     */
    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }

    /**
     * Enqueue a new generation request on the executor.
     *
     * @param token_ids         Prompt tokens; they are copied into the request.
     * @param generation_params Supplies `max_new_tokens` for the request.
     * @param sampling_params   Converted to a `tle::SamplingConfig` for the request.
     * @return Whatever `executor_.enqueueRequest` returns, wrapped in the expected type.
     *
     * NOTE(review): the template arguments of `std::expected` and `std::span` were
     * missing; `<request_id_t, backend_error_t>` and `<const token_id_t>` are
     * assumptions — they must match the declaration in backend.hpp.
     * NOTE(review): no error value is ever produced here, and the function is
     * `noexcept`; if `enqueueRequest` can throw, that would terminate the process.
     * Confirm this is intended.
     */
    std::expected<request_id_t, backend_error_t>
    backend_t::submit(std::span<const token_id_t> token_ids,
                      generation_params_t generation_params,
                      sampling_params_t sampling_params) noexcept {
        SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);

        // NOTE(review): the positional arguments follow the tle::Request constructor;
        // the literal `true` and the four std::nullopt slots are presumably
        // streaming, endId, padId, positionIds and badWords — verify against the
        // TensorRT-LLM version in use.
        return executor_.enqueueRequest(tle::Request{
                {token_ids.begin(), token_ids.end()},  // the span is only a view: copy the tokens into the request
                // NOTE(review): target type of this cast was lost; tle::SizeType32 is assumed.
                static_cast<tle::SizeType32>(generation_params.max_new_tokens),
                true,
                static_cast<tle::SamplingConfig>(sampling_params),
                tle::OutputConfig{ /* returnLogProbs= */ true},
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                stop_words_
        });
    }

    /**
     * Fetch responses from the executor.
     *
     * @return The result of `executor_.awaitResponses()`.
     *
     * NOTE(review): the element type of the returned vector was missing;
     * `tle::Response` is assumed — confirm against backend.hpp.
     */
    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }

    /**
     * Ask the executor to cancel a previously submitted request.
     *
     * @param request_id Identifier forwarded to `executor_.cancelRequest`.
     */
    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }

}