//
// text-generation-inference/backends/trtllm/src/ffi.cpp
//
// Created by mfuntowicz on 6/30/24.
//

#include <cmath>
#include <vector>

#include <spdlog/spdlog.h>

#include "backends/trtllm/include/ffi.h"
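
// Thin wrapper exposed to Rust through cxx: forwards the engine folder and
// executor worker paths to the underlying TensorRtLlmBackend constructor.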
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
        const std::string_view &engineFolder,
        const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
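
// Reports whether the underlying TensorRT-LLM executor is ready to accept
// new requests.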
bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const {
    return TensorRtLlmBackend::IsReady();
}
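
// Copies the prompt tokens out of the Rust-owned slice and enqueues a new
// generation request, returning the executor-assigned request id.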
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
        rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
    // The slice elements are const, so this is a plain copy of the tokens
    // (move iterators would silently degrade to copies anyway); it also
    // converts the uint32_t token ids to the int32_t the backend expects.
    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
    return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
}
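
// Polls every response currently available for the given request, invokes the
// Rust-provided callback once per decoded token (or once with a sentinel on
// error), and returns the number of tokens successfully streamed.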
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
        const uint64_t requestId,
        huggingface::tgi::backends::GenerationContext *ctx,
        rust::Fn<void(huggingface::tgi::backends::GenerationContext *, uint32_t, float_t, bool)> callback) {

    size_t numTokens = 0;
    for (const auto &item: Poll(requestId)) {
        if (!item.hasError()) {
            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
            const auto decoded = item.getResult();

            const auto token = decoded.outputTokenIds[0][0];
            const auto isFinal = decoded.isFinal;
            const auto logProb = decoded.logProbs.value()[0][0];
            ++numTokens;

            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
            callback(ctx, token, logProb, isFinal);
            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
        } else {
            // TODO: Return rust::Result with the error
            SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
            callback(ctx, 0, 0.0, true);
        }
    }

    return numTokens;
}
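
// Factory exposed to Rust through the cxx bridge: initializes the TensorRT-LLM
// runtime (plugin discovery) before constructing the backend implementation.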
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
    // Unconditionally call this to initialize and discover TRTLLM plugins
    InitializeBackend();

    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
    return std::make_unique<TensorRtLlmBackendImpl>(enginePath, executorPath);
}
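
// ---------------------------------------------------------------------------
// Minimal usage sketch (illustrative only; the paths and token ids below are
// made up). StreamTokens is omitted because its rust::Fn callback can only be
// produced by the Rust side of the cxx bridge.
//
//     auto backend = huggingface::tgi::backends::CreateTensorRtLlmBackend(
//             "/data/engines/llama", "/opt/tgi/executorWorker");
//     const std::vector<uint32_t> prompt = {1, 15043, 3186};
//     const auto requestId = backend->Submit(
//             rust::Slice<const uint32_t>(prompt.data(), prompt.size()),
//             /* topK = */ 50, /* topP = */ 0.95f,
//             /* temperature = */ 1.0f, /* seed = */ 2024);
// ---------------------------------------------------------------------------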