text-generation-inference/backends/trtllm/lib/backend.cpp

#include <fstream>

#include <nvml.h>
#include <fmt/ranges.h>
#include <spdlog/spdlog.h>

#include "backend.h"

void huggingface::tgi::backends::InitializeBackend() {
    SPDLOG_INFO("Initializing Backend...");
    nvmlInit_v2();
    initTrtLlmPlugins();
}

[[nodiscard]]
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(1);

    // Get the compute capabilities of the current hardware
    nvmlDevice_t device;
    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
        SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
        }
    }

    // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
    if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
        SPDLOG_INFO("Detected single engine deployment, using leader mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kLEADER,
                std::nullopt,
                std::nullopt,
                std::nullopt
        ));
    } else { // Multiple engines -> using orchestrator mode (MPI involved)
        SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
        execConfig.setParallelConfig(tle::ParallelConfig(
                tle::CommunicationType::kMPI,
                tle::CommunicationMode::kORCHESTRATOR,
                std::nullopt,
                std::nullopt,
                tle::OrchestratorConfig(true, workerPath)
        ));
    }

    // Define some configuration variables
    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
    return execConfig;
}

tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
        uint32_t topK,
        float_t topP,
        float_t temperature,
        uint64_t seed,
        std::optional<int32_t> beamWidth = std::nullopt) {
    return tle::SamplingConfig(
            beamWidth.value_or(1),
            topK,
            topP,
            std::nullopt,
            std::nullopt,
            std::nullopt,
            seed,
            std::nullopt,
            temperature,
            std::nullopt
    );
}

huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
) :
        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
        executor(
                enginesFolder,
                tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                GetExecutorConfig(config, executorWorker.string()
                )) {
    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
}

bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
    return executor.canEnqueueRequests();
}

size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
    return executor.getNumResponsesReady();
}

[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::vector<tle::TokenIdType> &tokens,
        const int32_t topK,
        const float_t topP,
        const float_t temperature,
        const uint64_t seed
) {
#ifdef NDEBUG
    SPDLOG_DEBUG(
            FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
            tokens.size(),
            executor.getLatestIterationStats().back().numActiveRequests
    );
#else
    SPDLOG_DEBUG(
            FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
            fmt::join(tokens, ", "),
            executor.getLatestIterationStats().front().numActiveRequests
    );
#endif

    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
    const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));

    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
    const auto output = tle::OutputConfig(false, false, false, true, false);
    return executor.enqueueRequest(
            tle::Request{tokens, maxNewTokens, true, sampling, output});
}

[[nodiscard("Generated tokens result must be used")]]
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
    return executor.awaitResponses(requestId);
}


void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() {
    SPDLOG_INFO("Shutting down executor");
    executor.shutdown();
}
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`#include <fstream>`

bind to CUDA::nvml to retrieve compute capabilities at runtime 2024-07-08 22:32:41 +00:00			`#include <nvml.h>`
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`#include <fmt/ranges.h>`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`#include <spdlog/spdlog.h>`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00
			`#include "backend.h"`

make leader executor mode working 2024-07-08 22:08:49 +00:00			`void huggingface::tgi::backends::InitializeBackend() {`
			`SPDLOG_INFO("Initializing Backend...");`
bind to CUDA::nvml to retrieve compute capabilities at runtime 2024-07-08 22:32:41 +00:00			`nvmlInit_v2();`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`initTrtLlmPlugins();`
			`}`
First version loading engines and making it ready for inference 2024-07-03 21:12:24 +00:00
implement the Stream method to send new tokens through a callback 2024-07-09 13:46:48 +00:00			`[[nodiscard]]`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {`
			`tle::ExecutorConfig execConfig(1);`
Remembering to check how we can detect support for chunked context 2024-07-03 21:38:17 +00:00
updated logic and comment to detect cuda compute capabilities 2024-07-09 12:15:41 +00:00			`// Get the compute capabilities of the current hardware`
bind to CUDA::nvml to retrieve compute capabilities at runtime 2024-07-08 22:32:41 +00:00			`nvmlDevice_t device;`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {`
updated logic and comment to detect cuda compute capabilities 2024-07-09 12:15:41 +00:00			`SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {`
			`SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);`
bind to CUDA::nvml to retrieve compute capabilities at runtime 2024-07-08 22:32:41 +00:00			`}`
			`}`
make leader executor mode working 2024-07-08 22:08:49 +00:00
updated logic and comment to detect cuda compute capabilities 2024-07-09 12:15:41 +00:00			`// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`SPDLOG_INFO("Detected single engine deployment, using leader mode");`
			`execConfig.setParallelConfig(tle::ParallelConfig(`
			`tle::CommunicationType::kMPI,`
			`tle::CommunicationMode::kLEADER,`
			`std::nullopt,`
			`std::nullopt,`
			`std::nullopt`
			`));`
updated logic and comment to detect cuda compute capabilities 2024-07-09 12:15:41 +00:00			`} else { // Multiple engines -> using orchestrator mode (MPI involved)`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");`
			`execConfig.setParallelConfig(tle::ParallelConfig(`
			`tle::CommunicationType::kMPI,`
			`tle::CommunicationMode::kORCHESTRATOR,`
			`std::nullopt,`
			`std::nullopt,`
			`tle::OrchestratorConfig(true, workerPath)`
			`));`
			`}`
updated logic and comment to detect cuda compute capabilities 2024-07-09 12:15:41 +00:00
			`// Define some configuration variables`
			`execConfig.setKvCacheConfig(tle::KvCacheConfig(true));`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);`
First version loading engines and making it ready for inference 2024-07-03 21:12:24 +00:00			`return execConfig;`
			`}`

compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(`
			`uint32_t topK,`
			`float_t topP,`
			`float_t temperature,`
			`uint64_t seed,`
			`std::optional<int32_t> beamWidth = std::nullopt) {`
			`return tle::SamplingConfig(`
			`beamWidth.value_or(1),`
			`topK,`
			`topP,`
			`std::nullopt,`
			`std::nullopt,`
			`std::nullopt,`
			`seed,`
			`std::nullopt,`
			`temperature,`
			`std::nullopt`
			`);`
			`}`

First version loading engines and making it ready for inference 2024-07-03 21:12:24 +00:00			`huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`const std::filesystem::path &enginesFolder,`
First version loading engines and making it ready for inference 2024-07-03 21:12:24 +00:00			`const std::filesystem::path &executorWorker`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`) :`
			`config(json::parse(std::ifstream(enginesFolder / "config.json"))),`
			`executor(`
			`enginesFolder,`
			`tensorrt_llm::executor::ModelType::kDECODER_ONLY,`
			`GetExecutorConfig(config, executorWorker.string()`
			`)) {`
			`SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());`
			`}`

			`bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {`
			`return executor.canEnqueueRequests();`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`}`
Enable end to end CMake build 2024-07-03 08:27:53 +00:00
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {`
			`return executor.getNumResponsesReady();`
			`}`

implement the Stream method to send new tokens through a callback 2024-07-09 13:46:48 +00:00			`[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]`
Enable end to end CMake build 2024-07-03 08:27:53 +00:00			`tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`const std::vector<tle::TokenIdType> &tokens,`
			`const int32_t topK,`
Enable end to end CMake build 2024-07-03 08:27:53 +00:00			`const float_t topP,`
			`const float_t temperature,`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`const uint64_t seed`
Enable end to end CMake build 2024-07-03 08:27:53 +00:00			`) {`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`#ifdef NDEBUG`
			`SPDLOG_DEBUG(`
implement the Stream method to send new tokens through a callback 2024-07-09 13:46:48 +00:00			`FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),`
make leader executor mode working 2024-07-08 22:08:49 +00:00			`tokens.size(),`
			`executor.getLatestIterationStats().back().numActiveRequests`
			`);`
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`#else`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`SPDLOG_DEBUG(`
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),`
			`fmt::join(tokens, ", "),`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`executor.getLatestIterationStats().front().numActiveRequests`
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`);`
			`#endif`
make leader executor mode working 2024-07-08 22:08:49 +00:00
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();`
			`const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));`

			`const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);`
			`const auto output = tle::OutputConfig(false, false, false, true, false);`
oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`return executor.enqueueRequest(`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`tle::Request{tokens, maxNewTokens, true, sampling, output});`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`}`

oops missing c++ backend definitions 2024-07-16 20:11:59 +00:00			`[[nodiscard("Generated tokens result must be used")]]`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {`
compute the number of maximum new tokens for each request independently 2024-07-17 13:55:29 +00:00			`SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`return executor.awaitResponses(requestId);`
			`}`
expose shutdown function at ffi layer 2024-07-15 07:36:01 +00:00

			`void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() {`
			`SPDLOG_INFO("Shutting down executor");`
			`executor.shutdown();`
			`}`