2024-11-30 23:35:04 +00:00
|
|
|
#include <exception>
#include <memory>
#include <thread>

#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
|
2024-11-18 23:17:35 +00:00
|
|
|
|
|
|
|
namespace rust::behavior {
|
|
|
|
template<typename Try, typename Fail>
|
|
|
|
static void trycatch(Try &&func, Fail &&fail) noexcept try {
|
|
|
|
func();
|
|
|
|
} catch (tensorrt_llm::common::TllmException &e) {
|
|
|
|
fail(e.what());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-30 22:04:57 +00:00
|
|
|
#include <spdlog/spdlog.h>
|
|
|
|
#include <spdlog/pattern_formatter.h>
|
|
|
|
#include <spdlog/fmt/fmt.h>
|
2024-11-30 23:35:04 +00:00
|
|
|
|
2024-11-18 23:17:35 +00:00
|
|
|
#include <backend.hpp>
|
|
|
|
|
|
|
|
namespace huggingface::tgi::backends::trtllm {
|
2024-12-01 23:05:59 +00:00
|
|
|
// Guards the one-time, process-wide backend initialization performed via
// std::call_once in create_backend_from_engine_folder.
std::once_flag backend_initialized_flag;
|
|
|
|
|
2024-11-18 23:17:35 +00:00
|
|
|
class tensorrt_llm_backend_t {
|
|
|
|
private:
|
|
|
|
backend_t inner_;
|
|
|
|
|
|
|
|
public:
|
2024-11-30 23:35:04 +00:00
|
|
|
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
|
2024-12-01 23:05:59 +00:00
|
|
|
: inner_(engine_folder, executor_worker_path) {}
|
2024-11-18 23:17:35 +00:00
|
|
|
|
|
|
|
size_t num_tokens_ready() const noexcept {
|
|
|
|
return inner_.num_tokens_ready();
|
|
|
|
}
|
|
|
|
|
|
|
|
request_id_t submit(
|
|
|
|
rust::Slice<const uint32_t> tokens,
|
|
|
|
uint32_t max_new_tokens,
|
|
|
|
uint32_t top_k,
|
|
|
|
float_t top_p,
|
|
|
|
float_t temperature,
|
|
|
|
float_t repetition_penalty,
|
|
|
|
float_t frequency_penalty,
|
|
|
|
uint64_t seed
|
|
|
|
) {
|
2024-11-30 22:04:57 +00:00
|
|
|
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
|
|
|
|
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
|
|
|
|
|
2024-11-18 23:17:35 +00:00
|
|
|
// Submit the request to the executor and get back a potential request_id used to track request status
|
|
|
|
const auto maybe_request_id = inner_.submit(
|
2024-11-30 22:04:57 +00:00
|
|
|
{tokens.data(), tokens.size()},
|
2024-11-18 23:17:35 +00:00
|
|
|
{max_new_tokens},
|
|
|
|
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
|
|
|
|
);
|
|
|
|
|
|
|
|
// If we do have a value, let's return the request_id
|
|
|
|
if(maybe_request_id.has_value()) [[likely]] {
|
|
|
|
return *maybe_request_id;
|
|
|
|
} else {
|
2024-11-30 22:04:57 +00:00
|
|
|
SPDLOG_WARN("[FFI] Failed to submit request to the executor");
|
2024-11-30 22:16:46 +00:00
|
|
|
return maybe_request_id.error();
|
2024-11-18 23:17:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void cancel(request_id_t requestId) noexcept {
|
2024-11-30 22:04:57 +00:00
|
|
|
SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
|
2024-11-18 23:17:35 +00:00
|
|
|
inner_.cancel(requestId);
|
|
|
|
}
|
|
|
|
};
|
2024-11-30 23:35:04 +00:00
|
|
|
|
2024-12-01 23:05:59 +00:00
|
|
|
void initialize_logging() {
|
|
|
|
#ifndef TGI_TRTLLM_BACKEND_DEBUG
|
|
|
|
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
|
|
|
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
|
|
|
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
|
|
|
return std::tolower(c);
|
|
|
|
});
|
|
|
|
|
|
|
|
if (log_level == "debug")
|
|
|
|
spdlog::set_level(spdlog::level::debug);
|
|
|
|
else
|
|
|
|
spdlog::set_level(spdlog::level::info);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
spdlog::set_level(spdlog::level::debug);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void initialize_tensorrt_llm_backend() {
|
|
|
|
SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version());
|
|
|
|
|
|
|
|
// Initialize everyone
|
|
|
|
initialize_logging();
|
|
|
|
nvmlInit_v2();
|
|
|
|
initTrtLlmPlugins();
|
|
|
|
|
|
|
|
const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
|
|
|
|
if (numGpus.has_value()) {
|
|
|
|
SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", numGpus.value());
|
|
|
|
} else {
|
|
|
|
SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system");
|
|
|
|
// todo: throw
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-30 23:35:04 +00:00
|
|
|
/**
 * FFI entry point: build a heap-allocated backend from the engine folder and
 * the executor worker path handed over from Rust. Performs the process-wide
 * backend initialization exactly once before constructing anything.
 */
std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
    std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);

    // Convert a borrowed Rust &str into an owned filesystem path.
    const auto as_path = [](rust::Str raw) {
        return std::filesystem::path(
                std::string_view(raw.begin(), raw.end()),
                std::filesystem::path::format::auto_format);
    };

    return std::make_unique<tensorrt_llm_backend_t>(as_path(engines_folder), as_path(executor_worker_path));
}
|
2024-12-01 23:05:59 +00:00
|
|
|
}
|