Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 23:12:07 +00:00)
* misc(cmake) update dependencies
* feat(hardware) enable new hardware.hpp and unittests
* test(ctest) enable address sanitizer
* feat(backend): initial rewrite of the backend for simplicity
* feat(backend): remove all the logs from hardware.hpp
* feat(backend): added some logging
* feat(backend): enable compiler warning if support for RVO not applying
* feat(backend): missing return statement
* feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder
* feat(backend): delete previous backend impl
* feat(backend): more impl
* feat(backend): use latest trtllm main version to have g++ >= 13 compatibility
* feat(backend): allow overriding which Python to use
* feat(backend): fix backend_exception_t -> backend_error_t naming
* feat(backend): impl missing generation_step_t as return value of pull_tokens
* feat(backend): make backend_workspace_t::engines_folder constexpr
* feat(backend): fix main.rs retrieving the tokenizer
* feat(backend): add guard to multiple header definitions
* test(backend): add more unittest
* feat(backend): remove constexpr from par
* feat(backend): remove constexpig
* test(backend): more test coverage
* chore(trtllm): update dependency towards 0.15.0
* effectively cancel the request on the executor
* feat(backend) fix moving backend when pulling
* feat(backend): make sure we can easily cancel request on the executor
* feat(backend): fix missing "0" field access
* misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut
* chore: Add doc and CI for TRTLLM (#2799)
* chore: Add doc and CI for TRTLLM
* chore: Add doc and CI for TRTLLM
* chore: Add doc and CI for TRTLLM
* chore: Add doc and CI for TRTLLM
* doc: Formatting
* misc(backend): indent

---------

Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>
80 lines
3.5 KiB
C++
#include <ranges>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>

#include "backend.hpp"
#include "hardware.hpp"

namespace huggingface::tgi::backends::trtllm {
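    // Build the TensorRT-LLM parallel configuration from the engine's config.json:
    // a single-shard engine (world_size == 1) stays in leader mode without MPI,
    // while a sharded engine switches to orchestrator mode and spawns the executor
    // worker binary to drive the individual ranks.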
    tle::ParallelConfig backend_workspace_t::parallel_config() const {
        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();

        auto mode = tle::CommunicationMode::kLEADER;
        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;

        if (world_size > 1) {
            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
            mode = tle::CommunicationMode::kORCHESTRATOR;
            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
        } else {
            SPDLOG_INFO("Detected single engine deployment, using leader mode");
        }

        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
    }

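    // Assemble the tle::ExecutorConfig used to instantiate the executor: single-beam
    // decoding, the parallel configuration inferred above, chunked context when the
    // GPU is Ampere or newer, and kMAX_UTILIZATION scheduling.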
    tle::ExecutorConfig backend_workspace_t::executor_config() const {
        // Retrieve the compute capabilities to enable some options at runtime
        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

        // Allocate the config
        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);

        // Set the parallel config as inferred
        executor_config.setParallelConfig(parallel_config());

        // Define some configuration variables
        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
        return executor_config;
    }

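    // The backend owns a workspace (engines folder and the generation configuration
    // derived from it) plus the tle::Executor created by executor_factory_initializer.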
    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
            : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

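    // Number of responses the executor currently has ready to be pulled.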
    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }

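    // Enqueue a single streaming generation request on the executor; the input tokens
    // are copied into the tle::Request and the returned request id can later be used
    // to cancel the request.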
    std::expected<request_id_t, backend_error_t>
    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
        SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
        return executor_.enqueueRequest(tle::Request {
                {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
                static_cast<tle::SizeType32>(generation_params.max_new_tokens),
                true,
                (tle::SamplingConfig) sampling_params,
                tle::OutputConfig { /* returnLogProbs= */ true },
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                workspace.generation_config().stop_words
        });
    }

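    // Wait for the executor to produce output and drain every response currently available.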
    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }

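    // Ask the executor to stop an in-flight request, e.g. when the client disconnects.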
    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
}
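
To make the call flow above concrete, here is a minimal, hypothetical usage sketch of backend_t as declared in backend.hpp. It is not part of the upstream file: the engine and executor-worker paths, the generation_params_t / sampling_params_t initializers, and the example() wrapper are illustrative assumptions, not the actual TGI launcher wiring (which lives on the Rust side in main.rs).

// Hypothetical caller, for illustration only; field layouts for generation_params_t
// and sampling_params_t are assumed from backend.hpp and may differ.
#include <filesystem>
#include <vector>

#include "backend.hpp"

using namespace huggingface::tgi::backends::trtllm;

int example() {
    // Assumed locations for the compiled engines and the TensorRT-LLM executor worker.
    std::filesystem::path engines_folder{"/repository/engines"};
    std::filesystem::path executor_worker{"/usr/local/tgi/executorWorker"};
    backend_t backend{engines_folder, executor_worker};

    // Pre-tokenized prompt; in TGI the ids come from the Rust-side tokenizer.
    const std::vector<token_id_t> prompt{1, 2, 3, 4};
    const generation_params_t generation{/* max_new_tokens = */ 128};  // assumed field layout
    const sampling_params_t sampling{};  // placeholder: the real sampling fields live in backend.hpp

    if (const auto request_id = backend.submit(prompt, generation, sampling); request_id.has_value()) {
        // Poll until the executor has produced at least one response, then drain them.
        while (backend.num_tokens_ready() == 0) { /* wait / yield */ }
        for (const auto &response: backend.pull_tokens()) {
            (void) response;  // stream decoded tokens back to the client here
        }
        backend.cancel(*request_id);  // cancellation path, e.g. on client disconnect
    }
    return 0;
}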