From 25c6bbe142f668ac86e05c0c00db3135c7c7a1bb Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Sun, 1 Dec 2024 00:35:04 +0100
Subject: [PATCH] feat(backend): introduce backend_workspace_t to store
 precomputed information from the engine folder

---
 backends/trtllm/csrc/backend.cpp | 40 +++++++++++++-
 backends/trtllm/csrc/backend.hpp | 94 +++++++++++++++++++++++++++++++-
 backends/trtllm/csrc/ffi.hpp     | 10 +++-
 3 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp
index 2b2e0239..bc3e33de 100644
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@@ -1,9 +1,47 @@
 #include

 #include "backend.hpp"
+#include
 #include

 namespace huggingface::tgi::backends::trtllm {
+    constexpr tle::ParallelConfig backend_workspace_t::parallel_config() const {
+        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
+        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
+
+        auto mode = tle::CommunicationMode::kLEADER;
+        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+        if (world_size > 1) {
+            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+            mode = tle::CommunicationMode::kORCHESTRATOR;
+            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+        } else {
+            SPDLOG_INFO("Detected single engine deployment, using leader mode");
+        }
+
+        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+    }
+
+    constexpr tle::ExecutorConfig backend_workspace_t::executor_config() const {
+        // Retrieve the compute capabilities to enable some options at runtime
+        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
+
+        // Allocate the config
+        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);
+
+        // Set the parallel config as inferred
+        executor_config.setParallelConfig(parallel_config());
+
+        // Define some configuration variables
+        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
+        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
+        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
+        return executor_config;
+    }
+
+    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
+        : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

     size_t backend_t::num_tokens_ready() const noexcept {
         return executor_.getNumResponsesReady();
@@ -22,7 +60,7 @@
                 std::nullopt,
                 std::nullopt,
                 std::nullopt,
-                stop_words_
+                workspace.generation_config().stop_words
         });
     }

diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp
index d17a344c..69724187 100644
--- a/backends/trtllm/csrc/backend.hpp
+++ b/backends/trtllm/csrc/backend.hpp
@@ -2,15 +2,19 @@
 #include
 #include
 #include
+#include

 #include
 #include
+#include

 #include
 #include
+#include
+
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
-
+    using json = nlohmann::json;

     using request_id_t = uint32_t;
     using token_id_t = tle::TokenIdType;
@@ -33,7 +37,7 @@ namespace huggingface::tgi::backends::trtllm {
         float_t temperature;
         uint64_t seed;

-        explicit operator tle::SamplingConfig() const {
+        constexpr explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig {
                1,
                top_k,
@@ -53,6 +57,79 @@ namespace huggingface::tgi::backends::trtllm {
         }
     };

+    /**
+     * Generation parameters sourced from the model's generation_config.json (top_p, temperature, stop words)
+     */
+    struct generation_config_t {
+        float_t top_p;
+        float_t temperature;
+        std::list<std::vector<int32_t>> stop_words;
+
+        explicit generation_config_t(const json &config):
+            top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
+            if(config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
+                const auto& eos_token_id = config["/eos_token_id"_json_pointer];
+                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) {
+                    stop_words.push_back({token_id});
+                });
+            }
+        }
+    };
+
+    /**
+     * Materializes the content of a TensorRT-LLM engine folder: engine location, executor worker path
+     * and the parsed engine config / generation config
+     */
+    class backend_workspace_t {
+    private:
+        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
+            std::ifstream config_f(path);
+            return json::parse(config_f);
+        };
+
+        std::filesystem::path engines_folder_;
+        std::filesystem::path executor_worker_path_;
+        json config_;
+        generation_config_t generation_config_;
+
+    public:
+        backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        /**
+         * Path to the folder containing the TensorRT-LLM engines
+         * @return local filesystem path to the folder
+         */
+        [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; }
+
+        /**
+         * Generation parameters (top_p, temperature, stop words) parsed from generation_config.json
+         * @return const reference to the parsed generation_config_t
+         */
+        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }
+
+        /**
+         * Parallelism settings (leader vs orchestrator mode) inferred from the engine's world size
+         * @return tle::ParallelConfig to forward to the executor
+         */
+        [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const;
+
+        /**
+         * Executor settings (KV cache, chunked context, scheduler policy) derived from the workspace
+         * @return tle::ExecutorConfig used to instantiate the executor
+         */
+        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
+    };
+

     /**
      *
      */
@@ -63,10 +140,14 @@ namespace huggingface::tgi::backends::trtllm {
      */
     class backend_t {
     private:
+        backend_workspace_t workspace;
         tle::Executor executor_;
-        std::list<std::vector<int32_t>> stop_words_;

     public:
+        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
+        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
+            : backend_t(engines_folder, executor_worker_path) {};
+
         /**
          * Submit a new request to the executor
          * @param token_ids
@@ -98,6 +179,13 @@ namespace huggingface::tgi::backends::trtllm {
          */
         void cancel(request_id_t) noexcept;
     };
+
+    /**
+     * Create a TensorRT-LLM executor from a workspace
+     */
+    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
+        return { workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY, workspace.executor_config() };
+    };
 }

 template <> struct fmt::formatter: formatter {
diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp
index 311a7981..b964a064 100644
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@@ -1,4 +1,4 @@
-
+#include
 #include

 namespace rust::behavior {
@@ -13,6 +13,7 @@ namespace rust::behavior {
 #include
 #include
 #include
+#include

 namespace huggingface::tgi::backends::trtllm {
@@ -21,7 +22,8 @@ namespace huggingface::tgi::backends::trtllm {
         backend_t inner_;

     public:
-        tensorrt_llm_backend_t(std::filesystem::path &engine_folder): inner_(engine_folder) {}
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
+            : inner_(engine_folder, executor_worker_path) {}

         size_t num_tokens_ready() const noexcept {
             return inner_.num_tokens_ready();
@@ -61,4 +63,8 @@ namespace huggingface::tgi::backends::trtllm {
             inner_.cancel(requestId);
         }
     };
+
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+        return std::make_unique<tensorrt_llm_backend_t>(std::filesystem::path(std::string(engines_folder)), std::filesystem::path(std::string(executor_worker_path)));
+    }
 }
\ No newline at end of file
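
For context, a minimal usage sketch of the construction path this patch introduces (not part of the patch itself): backend_workspace_t reads config.json and generation_config.json from the engine folder, and executor_factory_initializer turns that workspace into a tle::Executor. Both filesystem paths below are purely illustrative.

// Sketch only: exercising the backend_t(engines_folder, executor_worker_path)
// constructor added by this patch. The paths are hypothetical examples.
#include <filesystem>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    // The workspace precomputes the engine config and generation config,
    // then the executor is created through executor_factory_initializer.
    backend_t backend(
            std::filesystem::path("/repository/engines/llama-3-8b-instruct"),
            std::filesystem::path("/usr/local/tgi/bin/executorWorker")
    );

    return backend.num_tokens_ready() == 0 ? 0 : 1;
}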
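Similarly, a small self-contained sketch (again not part of the patch, assuming a top-level eos_token_id array in the document) of what generation_config_t extracts from a minimal generation_config.json: each entry of the eos_token_id array becomes a single-token stop word that submit() forwards to the executor. The inline JSON document is a made-up example.

// Sketch only: expected behaviour of generation_config_t on a hand-written config.
#include <cassert>
#include <nlohmann/json.hpp>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    const auto config = nlohmann::json::parse(R"({
        "top_p": 0.9,
        "temperature": 0.6,
        "eos_token_id": [128001, 128009]
    })");

    const generation_config_t generation_config(config);
    assert(generation_config.stop_words.size() == 2);         // one stop word per eos token id
    assert(generation_config.stop_words.front().size() == 1); // each stop word is a single token
    return 0;
}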