// text-generation-inference/backends/trtllm/csrc/backend.cpp
// (C++ source, 38 lines, 1.5 KiB)

#include <ranges>
#include "backend.hpp"
#include <spdlog/spdlog.h>
namespace huggingface::tgi::backends::trtllm {
/**
 * Report how many responses the executor says are ready.
 *
 * @return The value of executor_.getNumResponsesReady(), implicitly converted to size_t.
 *
 * NOTE(review): declared noexcept; if the executor call can throw, the exception
 * would terminate the process -- confirm against the executor API.
 */
size_t backend_t::num_tokens_ready() const noexcept {
    const auto ready_count = executor_.getNumResponsesReady();
    return ready_count;
}
/**
 * Build a tle::Request from the prompt tokens and parameters, then enqueue it on the executor.
 *
 * @param token_ids         Prompt token ids. They are copied into the request, so the span
 *                          only has to remain valid for the duration of this call.
 * @param generation_params Generation settings; only max_new_tokens is read here.
 * @param sampling_params   Sampling settings, converted to a tle::SamplingConfig.
 * @return The identifier returned by executor_.enqueueRequest().
 *
 * NOTE(review): nothing in this function produces the error alternative of the returned
 * std::expected. Because the function is noexcept, an exception escaping enqueueRequest()
 * would terminate the process -- confirm whether it should be caught and mapped to
 * backend_exception_t instead.
 */
std::expected<request_id_t, backend_exception_t>
backend_t::submit(std::span<tle::TokenIdType> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept {
    SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);

    // Arguments are positional; the labels on unnamed arguments below follow the
    // tle::Request constructor order -- TODO confirm against the TensorRT-LLM version in use.
    return executor_.enqueueRequest(tle::Request {
        {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
        static_cast<tle::SizeType32>(generation_params.max_new_tokens),
        true,  // presumably the streaming flag
        static_cast<tle::SamplingConfig>(sampling_params),
        tle::OutputConfig { /* returnLogProbs= */ true },
        std::nullopt,
        std::nullopt,
        std::nullopt,
        std::nullopt,
        stop_words_
    });
}
std::vector<tle::Response> backend_t::pull_tokens() noexcept {
2024-11-30 22:04:57 +00:00
SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
return executor_.awaitResponses();
}
/**
 * Ask the executor to cancel a previously submitted request.
 *
 * @param request_id Identifier of the request to cancel; forwarded unchanged to
 *                   executor_.cancelRequest().
 *
 * NOTE(review): declared noexcept; if cancelRequest() can throw, the exception
 * would terminate the process -- confirm against the executor API.
 */
void backend_t::cancel(request_id_t request_id) noexcept {
    SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
    executor_.cancelRequest(request_id);
}
}