feat(backend): impl missing generation_step_t as return value of pull_tokens

Morgan Funtowicz 2024-12-02 23:28:25 +01:00
parent a7bad25c41
commit 2f8634ec01
7 changed files with 75 additions and 120 deletions
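
For orientation, here is a sketch of the shared decoding-step type as it is exposed on the Rust side of the cxx bridge after this change. Only `request_id` is visible in the hunks below, so the remaining field names and types are inferred from the C++ initializer order in `pull_tokens` and should be treated as assumptions, not as verbatim source:

#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
mod ffi {
    // Shared struct between Rust and C++; C++ sees it as `generation_step_t`.
    #[cxx_name = "generation_step_t"]
    #[derive(Debug, Clone)]
    pub struct GenerationStep {
        request_id: u64,   // executor request id (request_id_t is now u64)
        token_id: u32,     // assumed: token decoded during this step
        log_prob: f32,     // assumed: log-probability of that token
        is_final: bool,    // assumed: true once the request has finished
        has_error: bool,   // assumed: true when the executor reported an error
        error_msg: String, // assumed: error message forwarded from C++
    }
}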

View File

@@ -4,7 +4,7 @@ use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};
 
-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");

View File

@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }
 
     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<tle::TokenIdType> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
                 {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens

View File

@@ -1,3 +1,4 @@
+#pragma once
 #include <cmath>
 #include <cstdint>
 #include <exception>
@@ -17,7 +18,7 @@
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;
 
     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;
@@ -54,7 +54,7 @@ namespace huggingface::tgi::backends::trtllm {
                     repetition_penalty,
                     std::nullopt,
                     frequency_penalty,
-                    length_penalty
+                    std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@ namespace huggingface::tgi::backends::trtllm {
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
-       submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
 
        /**
+       submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
        * Query the number of tokens available across all in-flight generations
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_p
    auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
        return fmt::format_to(
                ctx.out(),
-               "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-               c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+               "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+               c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

View File

@@ -1,9 +1,18 @@
+#ifndef TGI_BACKEND_TRTLLM_FFI
+#define TGI_BACKEND_TRTLLM_FFI
+
 #include <memory>
 #include <thread>
 
 #include <tensorrt_llm/common/tllmException.h>
 #include <tensorrt_llm/plugins/api/tllmPlugin.h>
 
+#include <spdlog/spdlog.h>
+#include <spdlog/pattern_formatter.h>
+#include <spdlog/fmt/fmt.h>
+
+#include <backend.hpp>
+
 namespace rust::behavior {
     template<typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
@@ -13,11 +22,11 @@ namespace rust::behavior {
     }
 }
 
-#include <spdlog/spdlog.h>
-#include <spdlog/pattern_formatter.h>
-#include <spdlog/fmt/fmt.h>
-
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
+
+#include "backends/trtllm/src/lib.rs.h"
 
 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
 
             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                    {tokens.data(), tokens.size()},
+                    signed_tokens,
                     {max_new_tokens},
                     {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@ namespace huggingface::tgi::backends::trtllm {
             }
         }
+
+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                                reqId,
+                                static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                                result.logProbs.value()[0][0],
+                                result.isFinal,
+                                false,
+                                std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                                reqId,
+                                0,
+                                0.0,
+                                true,
+                                true,
+                                std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
 
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
@@ -104,7 +151,7 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }
 
-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
                 std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@ namespace huggingface::tgi::backends::trtllm {
         );
     }
 }
+#endif
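
On the Rust side, the new `pull_tokens` surfaces as `Result<UniquePtr<CxxVector<GenerationStep>>>`. A rough sketch of draining it follows; this is an illustration only, not code from the commit: `backend` is assumed to be a `UniquePtr<TensorRtLlmBackendImpl>` as declared in the bridge further down, the `GenerationStep` field names are inferred from the C++ initializer order above, and the logging assumes the crate's existing tracing dependency.

// Illustration only; field names on GenerationStep are assumptions inferred
// from the C++ initializer order of generation_step_t.
if backend.num_tokens_ready() > 0 {
    match backend.pin_mut().pull_tokens() {
        Ok(steps) => {
            for step in steps.iter() {
                if step.has_error {
                    // Error branch: the C++ side zeroes token_id/log_prob, so
                    // only request_id and error_msg carry information here.
                    tracing::warn!("request {} failed: {}", step.request_id, step.error_msg);
                } else {
                    // Success branch: forward (request_id, token_id, log_prob, is_final)
                    // to whatever consumes decoded tokens.
                }
            }
        }
        Err(e) => tracing::error!("pull_tokens() failed: {}", e.what()),
    }
}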

View File

@ -1,89 +0,0 @@
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once
#include <algorithm>
#include <exception>
#include <filesystem>
#include <functional>
#include <limits>
#include <iterator>
#include <ranges>
#include <vector>
#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
const std::string_view &engineFolder,
const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens,
uint32_t maxNewTokens,
int32_t topK,
float_t topP,
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
return TensorRtLlmBackend::Submit(
std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
const auto responses = TensorRtLlmBackend::PullNewTokens();
auto steps = std::make_unique<std::vector<GenerationStep>>();
steps->reserve(responses.size());
#ifndef NDEBUG
SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
#endif
// Transform tle::Response to GenerationStep
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
const auto reqId = r.getRequestId();
if (!r.hasError()) {
const auto result = r.getResult();
return GenerationStep{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
result.logProbs.value()[0][0],
result.isFinal,
false,
std::string()
};
} else {
return GenerationStep{
reqId,
0,
0.0,
true,
true,
std::move(r.getErrorMsg())
};
}
});
return steps;
}
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
SPDLOG_INFO("Creating TensorRT-LLM Backend");
// Unconditionally call this to initialize and discover TRTLLM plugins
InitializeBackend();
const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}

View File

@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;
 
-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }
 
     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");
 
         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;
 
         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
         ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
 
-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
 
-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@ mod ffi {
             seed: u64,
         ) -> Result<u64>;
 
-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
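
With the `#[rust_name]` shims gone, the bridge exposes a plain snake_case API. A minimal sketch of driving it from the same crate follows; this is not one of the commit's call sites: the engine paths, prompt tokens and sampling values are placeholders, and the parameter between repetition_penalty and seed is assumed to be frequency_penalty, following the old Submit signature.

// Sketch only: placeholder paths and values; argument order after top_p is an
// assumption based on the removed Submit() signature.
fn drive_backend() -> Result<u64, cxx::Exception> {
    let mut backend = ffi::create_backend_from_engine_folder("/path/to/engines", "/path/to/executorWorker")?;

    // submit() now takes top_k as u32, so the `as i32` cast at the call site goes away.
    let request_id = backend.pin_mut().submit(
        &[1u32, 15043, 3186], // prompt token ids
        128,                  // max_new_tokens
        10,                   // top_k
        0.95,                 // top_p
        1.0,                  // temperature
        1.0,                  // repetition_penalty
        0.0,                  // frequency_penalty (assumed position)
        42,                   // seed
    )?;
    Ok(request_id)
}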

View File

@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};
 
 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;
 
 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@ fn executor_status_looper(
                 match backend.pin_mut().submit(
                     &input_ids.unwrap(), // This is checked beforehand in validate()
                     stopping_params.max_new_tokens,
-                    generation_params.top_k as i32,
+                    generation_params.top_k,
                     generation_params.top_p,
                     generation_params.temperature,
                     generation_params.repetition_penalty,
@@ -120,7 +120,7 @@ fn executor_status_looper(
             }
         }
 
-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
             match backend.pin_mut().pull_tokens() {
                 Ok(responses) => {
                     // Iterate through all the decoded token
@@ -298,7 +298,7 @@ impl TensorRtLlmBackendV2 {
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();
 
         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
             .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
 
         // Executor looper is responsible for scheduling and pulling requests state at regular interval