mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 14:52:20 +00:00

feat(backend): impl missing generation_step_t as return value of pull_tokens

This commit is contained in:
parent a7bad25c41
commit 2f8634ec01
backends/trtllm/build.rs
@@ -4,7 +4,7 @@ use std::env;
 use std::env::consts::ARCH;
 use std::path::{absolute, PathBuf};

-const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
+const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
 const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
@@ -98,7 +98,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
         .std("c++23")
-        .include(deps_folder.join("fmt-src").join("include"))
         .include(deps_folder.join("spdlog-src").join("include"))
         .include(deps_folder.join("json-src").join("include"))
         .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
@@ -112,7 +111,6 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
     println!("cargo:rerun-if-changed=CMakeLists.txt");
     println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
     println!("cargo:rerun-if-changed=cmake/json.cmake");
-    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
     println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=csrc/backend.hpp");
     println!("cargo:rerun-if-changed=csrc/backend.cpp");
backends/trtllm/csrc/backend.cpp
@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
     }

     std::expected<request_id_t, backend_error_t>
-    backend_t::submit(const std::span<tle::TokenIdType> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
+    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
         SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
         return executor_.enqueueRequest(tle::Request {
             {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
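Since submit returns std::expected<request_id_t, backend_error_t> (C++23), callers branch on the carried value instead of catching exceptions. A minimal self-contained sketch of that calling pattern — request_id_t matches backend.hpp, but submit_stub and the backend_error_t values are invented stand-ins, not the real backend:

#include <cstdint>
#include <cstdio>
#include <expected>

using request_id_t = uint64_t;
enum class backend_error_t { executor_not_ready, unknown };

// Stand-in for backend_t::submit(...)
static std::expected<request_id_t, backend_error_t> submit_stub(bool ok) {
    if (ok) return request_id_t{42};                              // success: executor-assigned id
    return std::unexpected(backend_error_t::executor_not_ready);  // failure: typed error, no throw
}

int main() {
    if (const auto maybe_id = submit_stub(true); maybe_id.has_value()) {
        std::printf("scheduled request %llu\n", static_cast<unsigned long long>(*maybe_id));
    } else {
        std::printf("submit failed: error %d\n", static_cast<int>(maybe_id.error()));
    }
}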
backends/trtllm/csrc/backend.hpp
@@ -1,3 +1,4 @@
 #pragma once
+#include <cmath>
 #include <cstdint>
 #include <exception>
@@ -17,7 +18,7 @@
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
-    using request_id_t = uint32_t;
+    using request_id_t = uint64_t;
     using token_id_t = tle::TokenIdType;

     /**
@@ -35,7 +36,6 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t repetition_penalty;
         float_t frequency_penalty;
-        float_t length_penalty;
         float_t temperature;
         uint64_t seed;

@@ -54,7 +54,7 @@ namespace huggingface::tgi::backends::trtllm {
                 repetition_penalty,
                 std::nullopt,
                 frequency_penalty,
-                length_penalty
+                std::nullopt
             };
         }
     };
@@ -172,7 +172,7 @@ namespace huggingface::tgi::backends::trtllm {
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
-        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
+        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;

        /**
         * Query the number of tokens available across all in-flight generations
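A note on the signature change: std::span<const token_id_t> lets one declaration accept both mutable and immutable token buffers while promising not to modify them. A small self-contained sketch, assuming token_id_t is a 32-bit signed integer standing in for tle::TokenIdType:

#include <cstddef>
#include <cstdint>
#include <span>
#include <vector>

using token_id_t = int32_t;  // assumption, stand-in for tle::TokenIdType

// Read-only view: binds to const and non-const contiguous storage alike
static std::size_t count_tokens(std::span<const token_id_t> tokens) {
    return tokens.size();
}

int main() {
    std::vector<token_id_t> mutable_tokens{1, 2, 3};
    const std::vector<token_id_t> const_tokens{4, 5};
    return static_cast<int>(count_tokens(mutable_tokens) + count_tokens(const_tokens));
}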
@@ -216,8 +216,8 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> {
     auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
         return fmt::format_to(
             ctx.out(),
-            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
-            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
+            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
+            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
         );
     }
 };
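To see the formatter change in isolation, here is a self-contained sketch of the same fmt::formatter pattern with length_penalty dropped; the struct below is a simplified stand-in for the real sampling_params_t (field types are assumptions):

#include <cstdint>
#include <fmt/format.h>

struct sampling_params_t {   // simplified stand-in, not the real header
    uint32_t top_k;
    float top_p;
    float repetition_penalty;
    float frequency_penalty;
    float temperature;
    uint64_t seed;
};

// Inherit parse() from a base formatter; only format() needs providing
template <> struct fmt::formatter<sampling_params_t> : fmt::formatter<fmt::string_view> {
    auto format(sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
        return fmt::format_to(
            ctx.out(),
            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

int main() {
    fmt::print("{}\n", sampling_params_t{10, 0.95f, 1.1f, 0.0f, 0.8f, 42});
}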
backends/trtllm/csrc/ffi.hpp
@@ -1,9 +1,18 @@
 #ifndef TGI_BACKEND_TRTLLM_FFI
 #define TGI_BACKEND_TRTLLM_FFI

 #include <memory>
 #include <thread>

+#include <tensorrt_llm/common/tllmException.h>
+#include <tensorrt_llm/plugins/api/tllmPlugin.h>
+
+#include <spdlog/spdlog.h>
+#include <spdlog/pattern_formatter.h>
+#include <spdlog/fmt/fmt.h>
+
+#include <backend.hpp>
+
 namespace rust::behavior {
     template<typename Try, typename Fail>
     static void trycatch(Try &&func, Fail &&fail) noexcept try {
@@ -13,11 +22,11 @@ namespace rust::behavior {
         }
     }

-#include <spdlog/spdlog.h>
-#include <spdlog/pattern_formatter.h>
-#include <spdlog/fmt/fmt.h>
-#include <backend.hpp>
+namespace huggingface::tgi::backends::trtllm {
+    class tensorrt_llm_backend_t;
+}
+
 #include "backends/trtllm/src/lib.rs.h"

 namespace huggingface::tgi::backends::trtllm {
     std::once_flag backend_initialized_flag;
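The rust::behavior::trycatch template above is the hook cxx uses to translate C++ exceptions into Result::Err on the Rust side; the diff elides its body, which conventionally follows this shape (a sketch of the standard cxx pattern, not the exact elided code):

#include <exception>

namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();             // run the bridged call
    } catch (const std::exception &e) {
        fail(e.what());     // surfaces as Err(cxx::Exception) in Rust
    }
}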
@@ -48,8 +57,9 @@ namespace huggingface::tgi::backends::trtllm {
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

             // Submit the request to the executor and get back a potential request_id used to track request status
+            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
             const auto maybe_request_id = inner_.submit(
-                {tokens.data(), tokens.size()},
+                signed_tokens,
                 {max_new_tokens},
                 {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );
@@ -63,6 +73,43 @@ namespace huggingface::tgi::backends::trtllm {
             }
         }

+        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
+            if(num_tokens_ready() > 0) [[likely]] {
+                const auto responses = inner_.pull_tokens();
+
+                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+                // Transform tle::Response to GenerationStep
+                auto steps = std::make_unique<std::vector<generation_step_t>>();
+                std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
+                    const auto reqId = r.getRequestId();
+                    if (!r.hasError()) [[likely]] {
+                        const auto result = r.getResult();
+                        return generation_step_t{
+                            reqId,
+                            static_cast<uint32_t>(result.outputTokenIds[0][0]),
+                            result.logProbs.value()[0][0],
+                            result.isFinal,
+                            false,
+                            std::string()
+                        };
+                    } else {
+                        return generation_step_t{
+                            reqId,
+                            0,
+                            0.0,
+                            true,
+                            true,
+                            std::move(r.getErrorMsg())
+                        };
+                    }
+                });
+                return steps;
+
+            } else {
+                return std::make_unique<std::vector<generation_step_t>>();
+            }
+        }
+
         void cancel(request_id_t requestId) noexcept {
             SPDLOG_DEBUG(FMT_STRING("[FFI] cancelling request {:d}"), requestId);
             inner_.cancel(requestId);
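For orientation, the positional brace-initializers in the transform above line up with the fields of the shared generation_step_t struct declared in src/lib.rs. A hypothetical C++ mirror of that layout (the real type is generated by cxx, and its string field is rust::String rather than std::string):

#include <cstdint>
#include <string>

// Hypothetical mirror of the cxx-generated shared struct; field order matches
// the brace-initializers in pull_tokens() above.
struct generation_step_t {
    uint64_t request_id;    // executor request this step belongs to
    uint32_t token_id;      // sampled token (0 on the error path)
    float log_prob;         // log-probability of the sampled token
    bool is_final;          // true once the request finished generating
    bool has_error;         // mirrors the else-branch above
    std::string error_msg;  // empty unless has_error (rust::String in generated code)
};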
@@ -104,7 +151,7 @@ namespace huggingface::tgi::backends::trtllm {
         }
     }

-    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+    std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
         return std::make_unique<tensorrt_llm_backend_t>(
             std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
@@ -112,3 +159,4 @@ namespace huggingface::tgi::backends::trtllm {
         );
     }
 }
+#endif
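The std::call_once above guarantees the one-time TensorRT-LLM initialization runs exactly once no matter how many backends are created. A minimal self-contained sketch of the pattern (initialize_backend_stub stands in for initialize_tensorrt_llm_backend):

#include <cstdio>
#include <mutex>

static std::once_flag backend_initialized_flag;

static void initialize_backend_stub() {
    std::puts("initialized exactly once");  // stands in for plugin/logger setup
}

int main() {
    std::call_once(backend_initialized_flag, initialize_backend_stub);
    std::call_once(backend_initialized_flag, initialize_backend_stub);  // second call is a no-op
}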
backends/trtllm/src/ffi.cpp (deleted)
@@ -1,89 +0,0 @@
-//
-// Created by mfuntowicz on 6/30/24.
-//
-#pragma once
-
-#include <algorithm>
-#include <exception>
-#include <filesystem>
-#include <functional>
-#include <limits>
-#include <iterator>
-#include <ranges>
-#include <vector>
-
-#include <spdlog/spdlog.h>
-#include "backends/trtllm/include/ffi.h"
-
-
-huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
-        const std::string_view &engineFolder,
-        const std::string_view &executorWorker
-) : TensorRtLlmBackend(engineFolder, executorWorker) {}
-
-
-uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
-        rust::Slice<const uint32_t> tokens,
-        uint32_t maxNewTokens,
-        int32_t topK,
-        float_t topP,
-        float_t temperature,
-        float_t repetition_penalty,
-        float_t frequency_penalty,
-        uint64_t seed) {
-
-    // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
-    return TensorRtLlmBackend::Submit(
-            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
-}
-
-std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
-huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
-    const auto responses = TensorRtLlmBackend::PullNewTokens();
-
-    auto steps = std::make_unique<std::vector<GenerationStep>>();
-    steps->reserve(responses.size());
-
-#ifndef NDEBUG
-    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size());
-#endif
-
-    // Transform tle::Response to GenerationStep
-    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
-        const auto reqId = r.getRequestId();
-        if (!r.hasError()) {
-            const auto result = r.getResult();
-            return GenerationStep{
-                    reqId,
-                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
-                    result.logProbs.value()[0][0],
-                    result.isFinal,
-                    false,
-                    std::string()
-            };
-        } else {
-            return GenerationStep{
-                    reqId,
-                    0,
-                    0.0,
-                    true,
-                    true,
-                    std::move(r.getErrorMsg())
-            };
-        }
-    });
-
-    return steps;
-}
-
-std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
-huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
-    SPDLOG_INFO("Creating TensorRT-LLM Backend");
-    // Unconditionally call this to initialize and discover TRTLLM plugins
-    InitializeBackend();
-
-    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
-    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
-}
backends/trtllm/src/lib.rs
@@ -4,10 +4,11 @@ pub mod errors;
 mod looper;
 mod utils;

-#[cxx::bridge(namespace = "huggingface::tgi::backends")]
+#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
 mod ffi {
     /// Struct used as shared type between rust and C++ to represent the result
     /// of a single decoding iteration
+    #[cxx_name = "generation_step_t"]
     #[derive(Debug, Clone)]
     pub struct GenerationStep {
         request_id: u64,
@@ -19,9 +20,10 @@ mod ffi {
     }

     unsafe extern "C++" {
-        include!("backends/trtllm/src/ffi.cpp");
+        include!("backends/trtllm/csrc/ffi.hpp");

         /// Represent an instance of the underlying TensorRT-LLM backend
+        #[cxx_name = "tensorrt_llm_backend_t"]
         type TensorRtLlmBackendImpl;

         /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
@@ -38,21 +40,18 @@ mod ffi {
         /// ```
         ///
         /// ```
-        #[rust_name = "create_tensorrt_llm_backend"]
-        fn CreateTensorRtLlmBackend(
+        fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;

-        #[rust_name = "num_responses_ready"]
-        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
+        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;

-        #[rust_name = "submit"]
-        fn Submit(
+        fn submit(
             self: Pin<&mut TensorRtLlmBackendImpl>,
             tokens: &[u32],
             max_new_tokens: u32,
-            top_k: i32,
+            top_k: u32,
             top_p: f32,
             temperature: f32,
             repetition_penalty: f32,
@@ -60,8 +59,7 @@ mod ffi {
             seed: u64,
         ) -> Result<u64>;

-        #[rust_name = "pull_tokens"]
-        fn PullTokens(
+        fn pull_tokens(
             self: Pin<&mut TensorRtLlmBackendImpl>,
         ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
backends/trtllm/src/looper.rs
@@ -22,7 +22,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
 use text_generation_router::{FinishReason, Token};

 use crate::errors::TensorRtLlmBackendError;
-use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
 use crate::utils::first_line;

 type InferResult<T> = Result<T, InferError>;
@@ -93,7 +93,7 @@ fn executor_status_looper(
             match backend.pin_mut().submit(
                 &input_ids.unwrap(), // This is checked beforehand in validate()
                 stopping_params.max_new_tokens,
-                generation_params.top_k as i32,
+                generation_params.top_k,
                 generation_params.top_p,
                 generation_params.temperature,
                 generation_params.repetition_penalty,
@@ -120,7 +120,7 @@ fn executor_status_looper(
             }
         }

-        if backend.num_responses_ready() > 0 {
+        if backend.num_tokens_ready() > 0 {
            match backend.pin_mut().pull_tokens() {
                Ok(responses) => {
                    // Iterate through all the decoded token
@@ -298,7 +298,7 @@ impl TensorRtLlmBackendV2 {
         let (post_processor_sender, post_processor_receiver) = unbounded_channel();

         // Create the FFI backend
-        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;

        // Executor looper is responsible for scheduling and pulling requests state at regular interval