text-generation-inference/backends/trtllm/src/ffi.cpp

//
// Created by mfuntowicz on 6/30/24.
//
#pragma once

#include <cmath>
#include <filesystem>
#include <vector>

#include "rust/cxx.h"
#include "backends/trtllm/include/backend.h"

namespace huggingface::tgi::backends {
    /**
     * FFI-facing adapter around TensorRtLlmBackend.
     *
     * Re-exposes a subset of the backend through signatures written with cxx
     * bridge types (e.g. rust::Slice) and converts the arguments before
     * delegating to the base class. Inheritance is private (the `class`
     * default), so only the members declared below are reachable through
     * this type.
     */
    class TensorRtLlmBackendImpl : TensorRtLlmBackend {
    public:
        /***
         * Construct the adapter by forwarding both arguments to the TensorRtLlmBackend constructor.
         *
         * @param engineFolder Non-owning view, handed as-is to the base constructor
         * @param executorWorker Non-owning view, handed as-is to the base constructor
         */
        TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker) :
                // The parameters are const references: std::move would be a no-op cast here,
                // so they are passed through directly.
                TensorRtLlmBackend(engineFolder, executorWorker) {}

        /***
         * Forward to TensorRtLlmBackend::IsReady().
         *
         * @return Whatever the underlying backend reports
         */
        bool IsReady() const { return TensorRtLlmBackend::IsReady(); }

        /***
         * Copy the tokens out of the provided slice and submit them, along with the
         * remaining arguments, through TensorRtLlmBackend::Submit.
         *
         * @param tokens Slice of token ids; each element is converted from uint32_t to int32_t while copying
         * @param maxNewTokens Forwarded unchanged to the backend
         * @param topK Forwarded unchanged to the backend
         * @param topP Forwarded unchanged to the backend
         * @param temperature Forwarded unchanged to the backend
         * @param seed Forwarded unchanged to the backend
         * @return Request id returned by TensorRtLlmBackend::Submit
         */
        [[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
        RequestId Submit(rust::Slice<const uint32_t> tokens,
                         int32_t maxNewTokens,
                         int32_t topK,
                         float_t topP,
                         float_t temperature,
                         uint64_t seed) {
            // This will copy all the items from the initial slice.
            // Building the vector straight from the iterator range avoids
            // zero-filling `tokens.size()` elements only to overwrite them.
            std::vector<int32_t> tokens_(tokens.begin(), tokens.end());

            return TensorRtLlmBackend::Submit(std::move(tokens_), maxNewTokens, topK, topP, temperature, seed);
        }

        /***
         * Draft of a streaming entry point, currently disabled (kept commented out).
         *
         * @param requestId
         * @param handler
         * @return
         */
//        uint32_t
//        Stream(RequestId requestId, rust::Box <GenerationContext>, rust::Fn<void(uint32_t, uint32_t, bool)> handler) {
//            bool isDone = false;
//            uint32_t numGeneratedTokens = 0;
//
//            do {
//                const auto responses = Poll(requestId);
//                for (const auto &response: responses) {
//                    if (response.hasError()) {
//                        isDone = true;
//                        // TODO : bubble up the error to rust
//                    } else {
//                        const auto generation = response.getResult();
//                        const auto token = generation.outputTokenIds[0][0];
//                        isDone = generation.isFinal;
//
//                        // Propagate through the handler
//                        handler(token, numGeneratedTokens, isDone);
//                    }
//                }
//            } while (!isDone);
//
//            return numGeneratedTokens;
//        }
    };

    /***
     * Initialize the backend and create a TensorRtLlmBackendImpl from the two
     * strings received over the FFI boundary.
     *
     * @param engineFolder String forwarded (as a view) to the TensorRtLlmBackendImpl constructor
     * @param executorWorker String forwarded (as a view) to the TensorRtLlmBackendImpl constructor
     * @return Owning pointer to the newly created TensorRtLlmBackendImpl
     */
    std::unique_ptr<TensorRtLlmBackendImpl> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {
        // Unconditionally call this to initialize and discover TRTLLM plugins
        InitializeBackend();

        // Non-owning views over the incoming string data; nothing is copied at this point.
        const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
        const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());

        // The views are const, so std::move would only be a no-op cast: pass them directly.
        return std::make_unique<TensorRtLlmBackendImpl>(enginePath, executorPath);
    }
}
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`//`
			`// Created by mfuntowicz on 6/30/24.`
			`//`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`#pragma once`

			`#include <cmath>`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`#include <filesystem>`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`#include <vector>`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`#include "rust/cxx.h"`
Working FFI call for TGI and TRTLLM backend 2024-07-01 13:53:23 +00:00			`#include "backends/trtllm/include/backend.h"`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00
Working FFI call for TGI and TRTLLM backend 2024-07-01 13:53:23 +00:00			`namespace huggingface::tgi::backends {`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`class TensorRtLlmBackendImpl : TensorRtLlmBackend {`
			`public:`
			`/***`
			`*`
			`* @param engineFolder`
			`* @param executorWorker`
			`*/`
			`TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker) :`
			`TensorRtLlmBackend(std::move(engineFolder), std::move(executorWorker)) {}`

			`/***`
			`*`
			`* @return`
			`*/`
			`bool IsReady() const { return TensorRtLlmBackend::IsReady(); }`

			`/***`
			`*`
			`* @param tokens`
			`* @param maxNewTokens`
			`* @param topK`
			`* @param topP`
			`* @param temperature`
			`* @param seed`
			`* @return`
			`*/`
			`[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]`
			`RequestId Submit(rust::Slice<const uint32_t> tokens,`
			`int32_t maxNewTokens,`
			`int32_t topK,`
			`float_t topP,`
			`float_t temperature,`
			`uint64_t seed) {`
			`// This will copy all the items from the initial slice`
			`std::vector<int32_t> tokens_(tokens.size());`
			`tokens_.assign(tokens.begin(), tokens.end());`

			`return TensorRtLlmBackend::Submit(std::move(tokens_), maxNewTokens, topK, topP, temperature, seed);`
			`}`

			`/***`
			`*`
			`* @param requestId`
			`* @param handler`
			`* @return`
			`*/`
			`// uint32_t`
			`// Stream(RequestId requestId, rust::Box <GenerationContext>, rust::Fn<void(uint32_t, uint32_t, bool)> handler) {`
			`// bool isDone = false;`
			`// uint32_t numGeneratedTokens = 0;`
			`//`
			`// do {`
			`// const auto responses = Poll(requestId);`
			`// for (const auto &response: responses) {`
			`// if (response.hasError()) {`
			`// isDone = true;`
			`// // TODO : bubble up the error to rust`
			`// } else {`
			`// const auto generation = response.getResult();`
			`// const auto token = generation.outputTokenIds[0][0];`
			`// isDone = generation.isFinal;`
			`//`
			`// // Propagate through the handler`
			`// handler(token, numGeneratedTokens, isDone);`
			`// }`
			`// }`
			`// } while (!isDone);`
			`//`
			`// return numGeneratedTokens;`
			`// }`
			`};`

Working FFI call for TGI and TRTLLM backend 2024-07-01 13:53:23 +00:00			`/***`
			`*`
			`* @param engineFolder`
			`* @return`
			`*/`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`std::unique_ptr<TensorRtLlmBackendImpl> create_trtllm_backend(rust::Str engineFolder, rust::Str executorWorker) {`
unconditionally call InitializeBackend on the FFI layer 2024-07-08 22:09:09 +00:00			`// Unconditionally call this to initialize and discover TRTLLM plugins`
			`InitializeBackend();`

Working FFI call for TGI and TRTLLM backend 2024-07-01 13:53:23 +00:00			`const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());`
First version loading engines and making it ready for inference 2024-07-03 21:12:24 +00:00			`const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());`
working setup of the ffi layer 2024-07-11 21:24:32 +00:00			`return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));`
Working FFI call for TGI and TRTLLM backend 2024-07-01 13:53:23 +00:00			`}`
Initial setup for CXX binding to TRTLLM 2024-06-30 21:37:20 +00:00			`}`