text-generation-inference/backends/trtllm/lib/backend.cpp

#include <spdlog/spdlog.h>
#include <fmt/std.h>

#include "backend.h"

/**
 * Construct the backend by creating a decoder-only executor over the given engine folder.
 *
 * The `executor` member is built in the member-initializer list with a default-constructed
 * executor configuration, so the log line in the body is emitted only after that
 * construction has returned.
 *
 * @param engineFolder Path handed to the executor constructor and echoed in the log message.
 */
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &engineFolder
) : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig()) {
    SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
}

/**
 * Build an inference request from the prompt tokens and sampling parameters and enqueue it
 * on the executor.
 *
 * @param tokens Prompt token ids. NOTE: the vector is moved into the request, so the caller's
 *               vector is left in a valid-but-unspecified state once the request is built.
 * @param maxNewTokens Forwarded to the request as its second constructor argument.
 * @param topK Sampling top-k (forwarded to the sampling configuration as-is).
 * @param topP Sampling top-p (forwarded to the sampling configuration as-is).
 * @param temperature Sampling temperature.
 * @param minLength Minimum length forwarded to the sampling configuration.
 * @param repetitionPenalty Optional repetition penalty; falls back to the neutral value 1.0
 *                          (multiplicative penalty) when not provided.
 * @param frequencePenalty Optional frequency penalty; falls back to the neutral value 0.0
 *                         (additive penalty) when not provided.
 * @param seed Optional random seed, forwarded unchanged.
 * @param nTopTokens Optional number of top tokens; any value greater than 1 switches on the third
 *                   flag of the output configuration.
 * @return The identifier returned by `executor.enqueueRequest`, or 0 when `IsReady()` is false
 *         (nothing is enqueued in that case).
 */
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        std::vector<tle::TokenIdType> &tokens,
        const int32_t maxNewTokens,
        const float_t topK,
        const float_t topP,
        const float_t temperature,
        const int32_t minLength,
        const std::optional<float_t> repetitionPenalty,
        const std::optional<float_t> frequencePenalty,
        const std::optional<uint32_t> seed,
        const std::optional<uint32_t> nTopTokens
) {
    // Nothing can be scheduled until the backend reports ready; 0 is the "not submitted" sentinel.
    if (!IsReady()) {
        return 0;
    }

    // Keep the stats container alive in a local and check it before calling back():
    // back() on an empty container is undefined behaviour, and the container is presumably
    // empty when no iteration has completed since the previous query.
    const auto iterationStats = executor.getLatestIterationStats();
    spdlog::debug(
            "Submitting inference over {:d} tokens to the executor {:d}",
            tokens.size(),
            iterationStats.empty() ? 0 : iterationStats.back().numActiveRequests
    );

    // Positional arguments follow the documented tle::SamplingConfig constructor order.
    const auto sampling = tle::SamplingConfig{
            1,                                  // beam width
            topK,
            topP,
            std::nullopt,                       // topPMin
            std::nullopt,                       // topPResetIds
            std::nullopt,                       // topPDecay
            seed,
            temperature,
            minLength,
            std::nullopt,                       // beam search diversity rate
            repetitionPenalty.value_or(1.0),    // 1.0 == no repetition penalty
            std::nullopt,                       // presence penalty
            frequencePenalty.value_or(0.0),     // 0.0 == no frequency penalty
    };

    const bool wantsTopTokens = nTopTokens.value_or(1) > 1;
    const auto output = tle::OutputConfig{false, false, wantsTopTokens};
    const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};

    return executor.enqueueRequest(request);
}
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`#include <spdlog/spdlog.h>`
			`#include <fmt/std.h>`

			`#include "backend.h"`

Enable end to end CMake build 2024-07-03 08:27:53 +00:00			`huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)`
			`: executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);`
			`}`
Enable end to end CMake build 2024-07-03 08:27:53 +00:00
			`tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(`
			`std::vector<tle::TokenIdType> &tokens,`
			`const int32_t maxNewTokens,`
			`const float_t topK,`
			`const float_t topP,`
			`const float_t temperature,`
			`const int32_t minLength,`
			`const std::optional<float_t> repetitionPenalty,`
			`const std::optional<float_t> frequencePenalty,`
			`const std::optional<uint32_t> seed,`
			`const std::optional<uint32_t> nTopTokens`
			`) {`
			`if (IsReady()) {`
			`spdlog::debug(`
			`"Submitting inference over {:d} tokens to the executor {:d}",`
			`tokens.size(),`
			`executor.getLatestIterationStats().back().numActiveRequests`
			`);`

			`const auto sampling = tle::SamplingConfig{`
			`1,`
			`topK,`
			`topP,`
			`std::nullopt,`
			`std::nullopt,`
			`std::nullopt,`
			`seed,`
			`temperature,`
			`minLength,`
			`std::nullopt,`
			`repetitionPenalty.value_or(0.0),`
			`std::nullopt,`
			`frequencePenalty.value_or(1.0),`
			`};`
			`const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};`
			`const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};`

			`return executor.enqueueRequest(request);`
			`}`
			`return 0;`
			`}`