text-generation-inference/backends/trtllm/csrc/backend.hpp

#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <expected>
#include <filesystem>
#include <fstream>
#include <list>
#include <optional>
#include <span>
#include <utility>
#include <vector>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>

#include <tensorrt_llm/executor/executor.h>

namespace huggingface::tgi::backends::trtllm {
    // Short alias for the TensorRT-LLM executor namespace, used throughout this header
    namespace tle = tensorrt_llm::executor;
    // JSON document type (nlohmann) used for the files read from the engines folder
    using json = nlohmann::json;
    // Identifier of a submitted request: returned by backend_t::submit, accepted by backend_t::cancel
    using request_id_t = uint64_t;
    // Token identifier type, re-exported from the executor's own definition
    using token_id_t = tle::TokenIdType;

    /**
     * Generation-level settings attached to a single request.
     * Passed by value to `backend_t::submit` alongside the sampling parameters.
     */
    struct generation_params_t {
        // Number of new tokens requested (presumably an upper bound -- enforced outside this header)
        std::uint32_t max_new_tokens;
    };

    /**
     * Represent the parameters used to sample tokens from the logit distribution.
     *
     * The explicit conversion operator below builds a `tle::SamplingConfig` by passing the fields
     * positionally; every slot this struct has no field for is passed as `std::nullopt`.
     * NOTE(review): the mapping relies on the argument order of the `tle::SamplingConfig`
     * constructor of the pinned TensorRT-LLM version -- re-check it whenever the dependency is bumped.
     */
    struct sampling_params_t {
        uint32_t top_k;
        float_t top_p;
        float_t repetition_penalty;
        float_t frequency_penalty;
        float_t temperature;
        uint64_t seed;

        /**
         * Convert to the executor's sampling configuration.
         * Explicit on purpose: callers must spell out the cast.
         * @return `tle::SamplingConfig` populated from this struct's fields
         */
        constexpr explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig{
                    1,                   // fixed value (presumably the beam width -- confirm against the executor API)
                    top_k,
                    top_p,
                    std::nullopt,        // slots 4-6: not exposed by this struct
                    std::nullopt,
                    std::nullopt,
                    seed,
                    temperature,
                    std::nullopt,        // slots 9-10: not exposed by this struct
                    std::nullopt,
                    repetition_penalty,
                    std::nullopt,        // slot 12: not exposed by this struct
                    frequency_penalty,
                    std::nullopt         // slot 14: not exposed by this struct
            };
        }
    };

    /**
     * Represent possible values from transformers generation `generation_config.json`.
     * It usually stores default sampling parameters to use, such as top_p, temperature, etc.
     */
    struct generation_config_t {
        float_t top_p;
        float_t temperature;
        std::list<std::vector<int32_t>> stop_words;

        constexpr explicit generation_config_t(const json &config) :
                top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
            if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
                const auto &eos_token_id = config["/eos_token_id"_json_pointer];
                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
                    stop_words.emplace_back(1, token_id.template get<int32_t>());
                });

                SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
            }
        }
    };

    /**
     * Helper class representing various items which are stored within the TensorRT-LLM engines folder and
     * can be retrieved at runtime.
     *
     * Construction reads and parses `config.json` and `generation_config.json` from the engines folder.
     */
    class backend_workspace_t {
    private:
        // Open `path` and parse its content as JSON.
        // NOTE(review): there is no explicit check that the file could be opened; error reporting is
        // left entirely to json::parse -- confirm this is the intended failure mode.
        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
            std::ifstream config_f(path);
            return json::parse(config_f);
        };

        // Declaration order matters: the constructors read `engines_folder_` while initializing
        // the two JSON-backed members declared after it.
        std::filesystem::path engines_folder_;
        std::filesystem::path executor_worker_path_;
        json config_;
        generation_config_t generation_config_;

    public:
        /**
         * Build a workspace by copying the provided paths.
         * @param engines_folder Folder containing `config.json` and `generation_config.json`
         * @param executor_worker_path Path stored as the executor worker location
         */
        backend_workspace_t(const std::filesystem::path &engines_folder,
                            const std::filesystem::path &executor_worker_path) :
                engines_folder_(engines_folder),
                executor_worker_path_(executor_worker_path),
                config_(as_json(engines_folder_ / "config.json")),
                generation_config_(as_json(engines_folder_ / "generation_config.json")) {}

        /**
         * Build a workspace by taking ownership of the provided paths.
         * The paths are moved into the members, so the JSON files are located through
         * `engines_folder_` rather than through the (now moved-from) parameter.
         * @param engines_folder Folder containing `config.json` and `generation_config.json`
         * @param executor_worker_path Path stored as the executor worker location
         */
        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :
                engines_folder_(std::move(engines_folder)),
                executor_worker_path_(std::move(executor_worker_path)),
                config_(as_json(engines_folder_ / "config.json")),
                generation_config_(as_json(engines_folder_ / "generation_config.json")) {}

        /**
         * Path to the folder containing the TensorRT-LLM engines
         * @return local filesystem path to the folder (returned by value)
         */
        [[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }

        /**
         * Hugging Face transformers' generated `generation_config_t` mapping information stored in the
         * `generation_config.json` holding default generation parameters.
         * @return reference to the `generation_config_t` owned by this workspace
         */
        [[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }

        /**
         * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
         * @return `tensorrt_llm::executor::ParallelConfig` instance
         */
        [[nodiscard]] tle::ParallelConfig parallel_config() const;

        /**
         * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor`
         * @return `tensorrt_llm::executor::ExecutorConfig` instance
         */
        [[nodiscard]] tle::ExecutorConfig executor_config() const;
    };

    /**
     * Failure codes reported by the backend; used as the error alternative of the
     * `std::expected` returned by `backend_t::submit`.
     */
    enum backend_error_t {
        // Numeric values are explicit; keep them stable in case they are consumed outside this header
        EXECUTOR_NOT_READY = 3,
        EXECUTOR_SCHEDULING_FAILED = 4
    };


    /**
     * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
     * - schedule new request
     * - pull status of submitted request(s)
     * - cancel submitted request(s)
     *
     * Only the rvalue constructor is defined in this header; every other member is declared here
     * and defined in the accompanying implementation file.
     */
    class backend_t {
    private:
        // Declared before `executor_`, hence initialized first
        backend_workspace_t workspace;
        tle::Executor executor_;

    public:
        /**
         * Create a backend from the engines folder and the executor worker path
         * @param engines_folder Path to the folder containing the TensorRT-LLM engines
         * @param executor_worker_path Path to the executor worker
         */
        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);

        /**
         * Convenience overload accepting temporaries.
         * Inside the body the named parameters are lvalues, so this delegates to the overload above.
         */
        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
                : backend_t(engines_folder, executor_worker_path) {};

        /**
         * Submit a new request to the executor
         * @param token_ids View over the token ids making up the request (not owned)
         * @param generation_params Generation parameters for this request
         * @param sampling_params Sampling parameters for this request
         * @return Either newly submitted request's id or the error why it failed to submit
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
               sampling_params_t sampling_params) noexcept;

        /**
         * Query the number of tokens available across all in-flight generations
         * @return Count of tokens currently available
         */
        [[nodiscard("Pulling out the number of tokens")]]
        size_t num_tokens_ready() const noexcept;

        /**
         * Pull out newly generated tokens from the executor
         * @return The executor responses, as a vector of `tle::Response`
         */
        [[nodiscard("")]]
        std::vector<tle::Response> pull_tokens() noexcept;

        /**
         * Cancel the specified request on the executor
         * @param request_id Identifier of the request to cancel (the parameter is unnamed in this declaration)
         */
        void cancel(request_id_t) noexcept;
    };

    /**
     * Factory building a `tle::Executor` for the engines described by a workspace.
     * The executor is created from the workspace's engines folder and executor configuration,
     * always with the decoder-only model type.
     */
    const auto executor_factory_initializer = [](const backend_workspace_t &ws) -> tle::Executor {
        return {
                ws.engines_folder(),
                tle::ModelType::kDECODER_ONLY,
                ws.executor_config()
        };
    };
}

/**
 * fmt integration for the backend parameter types.
 * This specialization renders a `generation_params_t` as `generation_params_t{ max_new_tokens=N }`.
 */
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {
    format_context::iterator
    format(const huggingface::tgi::backends::trtllm::generation_params_t &params, format_context &ctx) const {
        // Doubled braces in the format string are escapes producing literal '{' and '}'
        return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", params.max_new_tokens);
    }
};

// Renders a `sampling_params_t` with every field; floating point fields use three decimals.
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
    format_context::iterator
    format(const huggingface::tgi::backends::trtllm::sampling_params_t &params, format_context &ctx) const {
        return fmt::format_to(
                ctx.out(),
                "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
                params.top_k,
                params.top_p,
                params.repetition_penalty,
                params.frequency_penalty,
                params.temperature,
                params.seed
        );
    }
};

#endif
TensorRT-LLM backend bump to latest version + misc fixes (#2791) * misc(cmake) update dependencies * feat(hardware) enable new hardware.hpp and unittests * test(ctest) enable address sanitizer * feat(backend): initial rewrite of the backend for simplicity * feat(backend): remove all the logs from hardware.hpp * feat(backend): added some logging * feat(backend): enable compiler warning if support for RVO not applying * feat(backend): missing return statement * feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder * feat(backend): delete previous backend impl * feat(backend): more impl * feat(backend): use latest trtllm main version to have g++ >= 13 compatibility * feat(backend): allow overriding which Python to use * feat(backend): fix backend_exception_t -> backend_error_t naming * feat(backend): impl missing generation_step_t as return value of pull_tokens * feat(backend): make backend_workspace_t::engines_folder constexpr * feat(backend): fix main.rs retrieving the tokenizer * feat(backend): add guard to multiple header definitions * test(backend): add more unittest * feat(backend): remove constexpr from par * feat(backend): remove constexpig * test(backend): more test coverage * chore(trtllm): update dependency towards 0.15.0 * effectively cancel the request on the executor * feat(backend) fix moving backend when pulling * feat(backend): make sure we can easily cancel request on the executor * feat(backend): fix missing "0" field access * misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut * chore: Add doc and CI for TRTLLM (#2799) * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * doc: Formatting * misc(backend): indent --------- Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co> 2024-12-13 14:50:59 +00:00			`#ifndef 
TGI_BACKEND_TRTLLM`
			`#define TGI_BACKEND_TRTLLM`

			`#include <cmath>`
			`#include <cstdint>`
			`#include <expected>`
			`#include <fstream>`
			`#include <list>`
			`#include <span>`

			`#include <nlohmann/json.hpp>`
			`#include <spdlog/spdlog.h>`
			`#include <spdlog/fmt/fmt.h>`

			`#include <tensorrt_llm/executor/executor.h>`

			`namespace huggingface::tgi::backends::trtllm {`
			`namespace tle = tensorrt_llm::executor;`
			`using json = nlohmann::json;`
			`using request_id_t = uint64_t;`
			`using token_id_t = tle::TokenIdType;`

			`/**`
			`* Represent the parameters used for generation`
			`*/`
			`struct generation_params_t {`
			`uint32_t max_new_tokens;`
			`};`

			`/**`
			`* Represent the parameters used to sample tokens from the logit distribution`
			`*/`
			`struct sampling_params_t {`
			`uint32_t top_k;`
			`float_t top_p;`
			`float_t repetition_penalty;`
			`float_t frequency_penalty;`
			`float_t temperature;`
			`uint64_t seed;`

			`constexpr explicit operator tle::SamplingConfig() const {`
			`return tle::SamplingConfig{`
			`1,`
			`top_k,`
			`top_p,`
			`std::nullopt,`
			`std::nullopt,`
			`std::nullopt,`
			`seed,`
			`temperature,`
			`std::nullopt,`
			`std::nullopt,`
			`repetition_penalty,`
			`std::nullopt,`
			`frequency_penalty,`
			`std::nullopt`
			`};`
			`}`
			`};`

			`/**`
			* Represent possible values from transformers generation `generation_config.json`.
			`* It usually stores default sampling parameters to use, such as top_p, temperature, etc.`
			`*/`
			`struct generation_config_t {`
			`float_t top_p;`
			`float_t temperature;`
			`std::list<std::vector<int32_t>> stop_words;`

			`constexpr explicit generation_config_t(const json &config) :`
			`top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {`
			`if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {`
			`const auto &eos_token_id = config["/eos_token_id"_json_pointer];`
			`std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {`
			`stop_words.emplace_back(1, token_id.template get<int32_t>());`
			`});`

			`SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());`
			`}`
			`}`
			`};`

			`/**`
			`* Helper class representing various items which are stored within the TensorRT-LLM engines folder and`
			`* can be retrieved at runtime`
			`*/`
			`class backend_workspace_t {`
			`private:`
			`constexpr static auto as_json = [](const std::filesystem::path &path) -> json {`
			`std::ifstream config_f(path);`
			`return json::parse(config_f);`
			`};`

			`std::filesystem::path engines_folder_;`
			`std::filesystem::path executor_worker_path_;`
			`json config_;`
			`generation_config_t generation_config_;`

			`public:`
			`backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) :`
			`engines_folder_(engines_folder),`
			`executor_worker_path_(executor_worker_path),`
			`config_(as_json(engines_folder / "config.json")),`
			`generation_config_(as_json(engines_folder / "generation_config.json")) {};`

			`backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :`
			`engines_folder_(engines_folder),`
			`executor_worker_path_(executor_worker_path),`
			`config_(as_json(engines_folder / "config.json")),`
			`generation_config_(as_json(engines_folder / "generation_config.json")) {};`

			`/**`
			`* Path to the folder containing the TensorRT-LLM engines`
			`* @return local filesystem path to the folder`
			`*/`
			`[[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }`

			`/**`
			* Hugging Face transformers' generated `generation_config_t` mapping information stored in the
			* `generation_config.json` holding default generation parameters.
			* @return `generation_config_t`
			`*/`
			`[[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }`

			`/**`
			* Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
			* to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
			* @return `tensorrt_llm::executor::ParallelConfig` instance
			`*/`
			`[[nodiscard]] tle::ParallelConfig parallel_config() const;`

			`/**`
			* Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
			* to initialize `tensorrt_llm::executor::Executor`
			* @return `tensorrt_llm::executor::ExecutorConfig` instance
			`*/`
			`[[nodiscard]] tle::ExecutorConfig executor_config() const;`
			`};`

			`/**`
			`* Error raised by the underlying backend implementation`
			`*/`
			`enum backend_error_t {`
			`EXECUTOR_NOT_READY = 3,`
			`EXECUTOR_SCHEDULING_FAILED = 4,`
			`};`


			`/**`
			`* Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to`
			`* - schedule new request`
			`* - pull status of submitted request(s)`
			`* - cancel submitted request(s)`
			`*/`
			`class backend_t {`
			`private:`
			`backend_workspace_t workspace;`
			`tle::Executor executor_;`

			`public:`
			`backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);`

			`backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)`
			`: backend_t(engines_folder, executor_worker_path) {};`

			`/**`
			`* Submit a new request to the executor`
			`* @param token_ids`
			`* @param generation_params`
			`* @param sampling_params`
			`* @return Either newly submitted request's id or the error why it failed to submit`
			`*/`
			`[[nodiscard("Discarded executor request_id needs to be assigned")]]`
			`std::expected<request_id_t, backend_error_t>`
			`submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,`
			`sampling_params_t sampling_params) noexcept;`

			`/**`
			`* Query the number of tokens available across all in-flight generations`
			`* @return`
			`*/`
			`[[nodiscard("Pulling out the number of tokens")]]`
			`size_t num_tokens_ready() const noexcept;`

			`/**`
			`* Pull out newly generated tokens from the executor`
			`* @return`
			`*/`
			`[[nodiscard("")]]`
			`std::vector<tle::Response> pull_tokens() noexcept;`

			`/**`
			`* Cancel the specified request on the executor' set`
			`* @param request_id Request's Identifier to remove from the in-flight executor`
			`*/`
			`void cancel(request_id_t) noexcept;`
			`};`

			`/**`
			`* Create a TensorRT-LLM executor from a workspace`
			`*/`
			`const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {`
			`return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,`
			`workspace.executor_config()};`
			`};`
			`}`

			`/**`
			`* Helper structures to define formatting strategies for various types in the backend`
			`*/`
			`template<>`
			`struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {`
			`auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c,`
			`format_context &ctx) const -> format_context::iterator {`
			`return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);`
			`}`
			`};`

			`template<>`
			`struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {`
			`auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c,`
			`format_context &ctx) const -> format_context::iterator {`
			`return fmt::format_to(`
			`ctx.out(),`
			`"sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",`
			`c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed`
			`);`
			`}`
			`};`

fix: lint backend and doc files (#2850) 2024-12-16 21:12:34 +00:00			`#endif`