text-generation-inference/backends/trtllm/csrc/ffi.hpp

#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <filesystem>
#include <memory>
#include <mutex>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
#include <nvml.h>
#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
#include <spdlog/spdlog.h>
#include <backend.hpp>
#include <hardware.hpp>
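// cxx lets a bridge override how C++ exceptions cross the FFI boundary by specializing
// rust::behavior::trycatch: any TllmException thrown by the backend is caught here and
// surfaced to the Rust caller as an Err(...) carrying the exception message.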
namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
}
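// Forward declaration so the cxx-generated bridge header included below can reference the
// opaque C++ type before its full definition.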
namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t;
}
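// Header generated by cxx from backends/trtllm/src/lib.rs, declaring the shared types
// (such as generation_step_t) exchanged with the Rust side.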
#include "backends/trtllm/src/lib.rs.h"
namespace huggingface::tgi::backends::trtllm {
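    // Ensures the process-wide initialization (logging, NVML, TensorRT-LLM plugins) runs only
    // once, even if several backends are created.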
    std::once_flag backend_initialized_flag;
    class tensorrt_llm_backend_t {
    private:
        backend_t inner_;

    public:
        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
                : inner_(engine_folder, executor_worker_path) {}

        size_t num_tokens_ready() const noexcept {
            return inner_.num_tokens_ready();
        }
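        /**
         * Submit a new generation request to the executor.
         * Returns the executor request id used to track the request, or the backend error value
         * if the submission was rejected.
         */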
        request_id_t submit(
                rust::Slice<const uint32_t> tokens,
                uint32_t max_new_tokens,
                uint32_t top_k,
                float_t top_p,
                float_t temperature,
                float_t repetition_penalty,
                float_t frequency_penalty,
                uint64_t seed
        ) {
            // This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
            SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"), tokens.size());

            // Submit the request to the executor and get back a potential request_id used to track request status
            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
            const auto maybe_request_id = inner_.submit(
                    signed_tokens,
                    {max_new_tokens},
                    {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
            );

            // If we do have a value, let's return the request_id
            if (maybe_request_id.has_value()) [[likely]] {
                return *maybe_request_id;
            } else {
                SPDLOG_WARN("[FFI] Failed to submit request to the executor");
                return maybe_request_id.error();
            }
        }
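        /**
         * Drain every response currently available from the executor and convert each
         * tle::Response into an FFI-friendly generation_step_t. Errors reported by the executor
         * are mapped to a final step carrying the error message.
         */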
        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
            if (num_tokens_ready() > 0) [[likely]] {
                const auto responses = inner_.pull_tokens();
                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());

                // Transform tle::Response to GenerationStep
                auto steps = std::make_unique<std::vector<generation_step_t>>();
                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
                    const auto reqId = r.getRequestId();
                    if (!r.hasError()) [[likely]] {
                        const auto result = r.getResult();
                        return generation_step_t{
                                reqId,
                                static_cast<uint32_t>(result.outputTokenIds[0][0]),
                                result.logProbs.value()[0][0],
                                result.isFinal,
                                false,
                                std::string()
                        };
                    } else {
                        return generation_step_t{
                                reqId,
                                0,
                                0.0,
                                true,
                                true,
                                std::move(r.getErrorMsg())
                        };
                    }
                });
                return steps;
            } else {
                return std::make_unique<std::vector<generation_step_t>>();
            }
        }
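        /**
         * Ask the executor to stop generating tokens for the given in-flight request.
         */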
        void cancel(request_id_t requestId) noexcept {
            SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId);
            inner_.cancel(requestId);
        }
    };
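    // In release builds the log level is driven by the TRTLLM_LOG_LEVEL environment variable
    // ("debug" enables debug logging, anything else falls back to info); debug builds always
    // log at debug level.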
    void initialize_logging() {
#ifndef TGI_TRTLLM_BACKEND_DEBUG
        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
                return std::tolower(c);
            });

            if (log_level == "debug")
                spdlog::set_level(spdlog::level::debug);
            else
                spdlog::set_level(spdlog::level::info);
        }
#else
        spdlog::set_level(spdlog::level::debug);
#endif
    }
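    // One-time global initialization: logging, NVML and the TensorRT-LLM plugins, plus a sanity
    // check that at least one Nvidia GPU is visible on the system.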
    void initialize_tensorrt_llm_backend() {
        SPDLOG_INFO("Initializing TGI - TensorRT-LLM Backend (v{})", tle::version());

        // Initialize everything
        initialize_logging();
        nvmlInit_v2();
        initTrtLlmPlugins();

        const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
        if (numGpus.has_value()) {
            SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus);
        } else {
            SPDLOG_WARN("[FFI] Failed to detect Nvidia GPU(s) on the system");
            // todo: throw
        }
    }
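    // FFI entry point called from Rust: performs the one-time global initialization and builds a
    // backend bound to the given engine folder and executor-worker binary.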
    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
        std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
        return std::make_unique<tensorrt_llm_backend_t>(
                std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
                std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
        );
    }
}
#endif