Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 16:32:12 +00:00)
feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder
This commit is contained in:
  parent 702dc9cd05
  commit 25c6bbe142
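In short, backend_workspace_t parses config.json and generation_config.json from the engines folder once, at construction time, and derives the executor configuration from them. A minimal usage sketch of the type introduced below; the engine folder and worker paths are hypothetical:

#include <filesystem>
#include <iostream>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    // Hypothetical paths: an engines folder produced by trtllm-build and the
    // executorWorker binary shipped with TensorRT-LLM.
    backend_workspace_t workspace(
            std::filesystem::path("/data/engines/llama-3-8b"),
            std::filesystem::path("/usr/local/tensorrt_llm/bin/executorWorker"));

    // The workspace exposes the precomputed pieces needed to spawn an executor.
    const std::filesystem::path folder = workspace.engines_folder();
    const generation_config_t &generation = workspace.generation_config();
    std::cout << folder.string() << ": " << generation.stop_words.size()
              << " stop words\n";
    return 0;
}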
@@ -1,9 +1,47 @@
 #include <ranges>
 
 #include "backend.hpp"
 
+#include <nlohmann/json.hpp>
 #include <spdlog/spdlog.h>
 
 namespace huggingface::tgi::backends::trtllm {
+    constexpr tle::ParallelConfig backend_workspace_t::parallel_config() const {
+        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
+        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
+
+        auto mode = tle::CommunicationMode::kLEADER;
+        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+        if (world_size > 1) {
+            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+            mode = tle::CommunicationMode::kORCHESTRATOR;
+            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
+        } else {
+            SPDLOG_INFO("Detected single engine deployment, using leader mode");
+        }
+
+        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+    }
+
+    constexpr tle::ExecutorConfig backend_workspace_t::executor_config() const {
+        // Retrieve the compute capabilities to enable some options at runtime
+        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
+
+        // Allocate the config
+        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);
+
+        // Set the parallel config as inferred
+        executor_config.setParallelConfig(parallel_config());
+
+        // Define some configuration variables
+        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
+        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
+        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
+        return executor_config;
+    }
+
+    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
+        : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
+
     size_t backend_t::num_tokens_ready() const noexcept {
         return executor_.getNumResponsesReady();
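For reference, the world-size lookup above uses nlohmann::json's JSON-pointer access. A self-contained sketch of the same lookup on an invented, trimmed-down config.json (assumes a recent nlohmann::json where the _json_pointer literal lives in nlohmann::literals):

#include <cstddef>
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;
    using namespace nlohmann::literals;

    // Invented stand-in for an engine folder's config.json.
    const json config = json::parse(R"({
        "pretrained_config": { "mapping": { "world_size": 2 } }
    })");

    // Same JSON-pointer lookup parallel_config() performs above.
    const auto world_size =
            config["/pretrained_config/mapping/world_size"_json_pointer].get<std::size_t>();

    // world_size > 1 -> sharded engine -> orchestrator mode; otherwise leader mode.
    std::cout << (world_size > 1 ? "orchestrator" : "leader") << '\n';
    return 0;
}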
@@ -22,7 +60,7 @@ namespace huggingface::tgi::backends::trtllm {
             std::nullopt,
             std::nullopt,
             std::nullopt,
-            stop_words_
+            workspace.generation_config().stop_words
         });
     }
 
@@ -2,15 +2,19 @@
 #include <cstdint>
 #include <exception>
 #include <expected>
+#include <fstream>
 #include <list>
 #include <span>
 
+#include <nlohmann/json.hpp>
 #include <spdlog/fmt/fmt.h>
 #include <tensorrt_llm/executor/executor.h>
 
+#include <hardware.hpp>
+
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
+    using json = nlohmann::json;
     using request_id_t = uint32_t;
     using token_id_t = tle::TokenIdType;
 
@@ -33,7 +37,7 @@ namespace huggingface::tgi::backends::trtllm {
         float_t temperature;
         uint64_t seed;
 
-        explicit operator tle::SamplingConfig() const {
+        constexpr explicit operator tle::SamplingConfig() const {
             return tle::SamplingConfig {
                 1,
                 top_k,
@@ -53,6 +57,79 @@ namespace huggingface::tgi::backends::trtllm {
         }
     };
 
+    /**
+     * Generation defaults parsed once from the model's generation_config.json
+     */
+    struct generation_config_t {
+        float_t top_p;
+        float_t temperature;
+        std::list<std::vector<int32_t>> stop_words;
+
+        explicit generation_config_t(const json &config):
+            top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
+            if (config.contains("eos_token_id") && config["eos_token_id"].is_array()) {
+                const auto &eos_token_id = config["eos_token_id"];
+                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) {
+                    stop_words.push_back({token_id});
+                });
+            }
+        }
+    };
+
+    /**
+     * Precomputed information from the engines folder (config.json,
+     * generation_config.json) required to create the executor
+     */
+    class backend_workspace_t {
+    private:
+        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
+            std::ifstream config_f(path);
+            return json::parse(config_f);
+        };
+
+        std::filesystem::path engines_folder_;
+        std::filesystem::path executor_worker_path_;
+        json config_;
+        generation_config_t generation_config_;
+
+    public:
+        backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path):
+            engines_folder_(engines_folder),
+            executor_worker_path_(executor_worker_path),
+            config_(as_json(engines_folder / "config.json")),
+            generation_config_(as_json(engines_folder / "generation_config.json")) {};
+
+        /**
+         * Path to the folder containing the TensorRT-LLM engines
+         * @return local filesystem path to the folder
+         */
+        [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; }
+
+        /**
+         * Generation defaults parsed from generation_config.json
+         * @return reference to the parsed generation_config_t
+         */
+        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }
+
+        /**
+         * Parallel configuration inferred from config.json (leader or orchestrator mode)
+         * @return tle::ParallelConfig to set on the executor
+         */
+        [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const;
+
+        /**
+         * Executor configuration derived from the workspace and the local hardware
+         * @return tle::ExecutorConfig to create the executor with
+         */
+        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
+    };
+
     /**
      *
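generation_config_t above folds every eos_token_id from generation_config.json into a single-token stop word. A standalone sketch of that transformation on an invented config (token ids made up, shaped like Llama-3's):

#include <cstdint>
#include <iostream>
#include <list>
#include <vector>
#include <nlohmann/json.hpp>

int main() {
    using json = nlohmann::json;

    // Invented generation_config.json content.
    const json config = json::parse(R"({
        "temperature": 0.6,
        "top_p": 0.9,
        "eos_token_id": [128001, 128009]
    })");

    // Same defaulted reads the generation_config_t constructor performs.
    const auto top_p = config.value("top_p", 1.0f);
    const auto temperature = config.value("temperature", 1.0f);

    // Each eos_token_id becomes its own single-token stop word.
    std::list<std::vector<int32_t>> stop_words;
    if (config.contains("eos_token_id") && config["eos_token_id"].is_array())
        for (const auto &token_id : config["eos_token_id"])
            stop_words.push_back({token_id.get<int32_t>()});

    std::cout << top_p << " " << temperature << " "
              << stop_words.size() << " stop words\n"; // 0.9 0.6 2 stop words
    return 0;
}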
@@ -63,10 +140,14 @@ namespace huggingface::tgi::backends::trtllm {
      */
     class backend_t {
     private:
+        backend_workspace_t workspace;
         tle::Executor executor_;
-        std::list<std::vector<int32_t>> stop_words_;
 
     public:
+        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
+        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
+            : backend_t(engines_folder, executor_worker_path) {};
+
         /**
          * Submit a new request to the executor
          * @param token_ids
@@ -98,6 +179,13 @@ namespace huggingface::tgi::backends::trtllm {
          */
         void cancel(request_id_t) noexcept;
     };
 
+    /**
+     * Create a TensorRT-LLM executor from a workspace
+     */
+    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
+        return { workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY, workspace.executor_config() };
+    };
 }
 
 template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t>: formatter<string_view> {
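With the workspace member in place, constructing a backend_t is the whole setup: the factory lambda turns the workspace into a live executor. A short usage sketch (paths hypothetical):

#include <filesystem>
#include <iostream>
#include "backend.hpp"

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    // The rvalue overload delegates to the lvalue constructor, which builds
    // the workspace first and then the executor from it. Paths are made up.
    backend_t backend(
            std::filesystem::path("/data/engines/llama-3-8b"),
            std::filesystem::path("/usr/local/tensorrt_llm/bin/executorWorker"));

    // No requests submitted yet, so nothing should be ready.
    std::cout << backend.num_tokens_ready() << '\n';
    return 0;
}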
@@ -1,4 +1,4 @@
+#include <memory>
 #include <tensorrt_llm/common/tllmException.h>
 
 namespace rust::behavior {
@@ -13,6 +13,7 @@ namespace rust::behavior {
 #include <spdlog/spdlog.h>
+#include <spdlog/pattern_formatter.h>
 #include <spdlog/fmt/fmt.h>
 
 #include <backend.hpp>
 
 namespace huggingface::tgi::backends::trtllm {
@@ -21,7 +22,8 @@ namespace huggingface::tgi::backends::trtllm {
         backend_t inner_;
 
     public:
-        tensorrt_llm_backend_t(std::filesystem::path &engine_folder): inner_(engine_folder) {}
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
+            : inner_(std::move(engine_folder), std::move(executor_worker_path)) {}
 
         size_t num_tokens_ready() const noexcept {
             return inner_.num_tokens_ready();
@@ -61,4 +63,8 @@ namespace huggingface::tgi::backends::trtllm {
             inner_.cancel(requestId);
         }
     };
 
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+        return std::make_unique<tensorrt_llm_backend_t>(std::filesystem::path(std::string(engines_folder)), std::filesystem::path(std::string(executor_worker_path)));
+    }
 }
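From the C++ side of the cxx bridge, the new factory can be exercised directly, since rust::Str is constructible from a string literal. A sketch, assuming a hypothetical ffi.hpp header exposing the function, with made-up paths:

#include "ffi.hpp" // hypothetical header declaring create_backend_from_engine_folder

int main() {
    using namespace huggingface::tgi::backends::trtllm;

    // rust::Str converts from string literals; both paths are invented.
    auto backend = create_backend_from_engine_folder(
            "/data/engines/llama-3-8b",
            "/usr/local/tensorrt_llm/bin/executorWorker");

    return backend->num_tokens_ready() == 0 ? 0 : 1;
}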