#include &lt;fmt/format.h&gt;
#include &lt;spdlog/spdlog.h&gt;

#include "backend.h"

huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
}

tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        std::vector&lt;tle::TokenIdType&gt; &tokens,
        const int32_t maxNewTokens,
        const float_t topK,
        const float_t topP,
        const float_t temperature,
        const int32_t minLength,
        const std::optional&lt;float_t&gt; repetitionPenalty,
        const std::optional&lt;float_t&gt; frequencyPenalty,
        const std::optional&lt;uint64_t&gt; seed,
        const std::optional&lt;uint32_t&gt; nTopTokens
) {
    if (IsReady()) {
        spdlog::debug(
                "Submitting inference over {:d} tokens to the executor {:d}",
                tokens.size(),
                executor.getLatestIterationStats().back().numActiveRequests
        );

        // Positional arguments follow tle::SamplingConfig's constructor order:
        // beamWidth, topK, topP, topPMin, topPResetIds, topPDecay, randomSeed,
        // temperature, minLength, beamSearchDiversityRate, repetitionPenalty,
        // presencePenalty, frequencyPenalty.
        const auto sampling = tle::SamplingConfig{
                1,
                topK,
                topP,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                seed,
                temperature,
                minLength,
                std::nullopt,
                repetitionPenalty.value_or(1.0),  // 1.0 leaves the repetition penalty disabled
                std::nullopt,
                frequencyPenalty.value_or(0.0),   // 0.0 leaves the frequency penalty disabled
        };

        // Only request the extra per-token output when the caller asks for more
        // than one top token.
        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};

        return executor.enqueueRequest(request);
    }
    return 0;
}
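
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only): how a caller might drive this backend,
// assuming backend.h exposes the constructor and Submit declaration defined
// above. The engine path, token ids, and sampling values below are
// hypothetical placeholders, not values taken from the code base.
//
//     huggingface::tgi::backends::TensorRtLlmBackend backend{"/data/engines/my-model"};
//
//     std::vector<tle::TokenIdType> prompt{1, 15043, 3186};  // already-tokenized prompt
//     const auto requestId = backend.Submit(
//             prompt,
//             /* maxNewTokens = */ 128,
//             /* topK = */ 10,
//             /* topP = */ 0.95,
//             /* temperature = */ 0.7,
//             /* minLength = */ 1,
//             /* repetitionPenalty = */ std::nullopt,
//             /* frequencyPenalty = */ std::nullopt,
//             /* seed = */ 42,
//             /* nTopTokens = */ std::nullopt);
//
//     // requestId can then be polled against the executor for generated tokens.
// ---------------------------------------------------------------------------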