text-generation-inference/backends/trtllm/src/ffi.cpp
Funtowicz Morgan ba5fc7d922
Add support for stop words in TRTLLM (#2678)
* feat(trtllm): rewrite health to not account for current state

* chore(looper): cleanup a bit more

* feat(post_processing): max_new_tokens is const evaluated now

* chore(ffi): formatting

* feat(trtllm): add stop words handling

# Conflicts:
#	backends/trtllm/lib/backend.cpp

* chore(trtllm): create specific parallelconfig factory and logging init methods

* chore(trtllm): define a macro for SizeType cast

* chore(trtllm): use GetParallelConfig

* chore(trtllm): minor refactoring

* chore(trtllm): validate there are enough GPUs on the system for the desired model

* chore(trtllm): ensure max throughput scheduling policy is selected

* chore(trtllm): minor fix

* chore(router): minor refactorings

* feat(docker): build with-slurm ompi

* feat(docker): add python3.10 dev to runtime deps

* chore(docker): add mpi to ld_library_path

* chore(docker): install transformers

* feat(trtllm): detect stop_words from generation_config.json
2024-10-25 10:58:34 +02:00

//
// Created by mfuntowicz on 6/30/24.
//
#pragma once
#include <algorithm>
#include <exception>
#include <filesystem>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <ranges>
#include <vector>
#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"
huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
const std::string_view &engineFolder,
const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}
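
// Bridge entry point used by the Rust side (via cxx): enqueue a generation request
// with its sampling parameters and return the executor request id, which the caller
// uses to correlate the tokens streamed back by PullTokens().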
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
        rust::Slice<const uint32_t> tokens,
        uint32_t maxNewTokens,
        int32_t topK,
        float_t topP,
        float_t temperature,
        float_t repetition_penalty,
        float_t frequency_penalty,
        uint64_t seed) {
    // This will copy all the items from the initial slice
    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
    return TensorRtLlmBackend::Submit(
            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}
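
// Drain every response currently available from the executor and convert each
// tle::Response into a GenerationStep (token id, log-prob, finish/error flags)
// that the Rust side can consume without depending on TensorRT-LLM types.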
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
    const auto responses = TensorRtLlmBackend::PullNewTokens();

    auto steps = std::make_unique<std::vector<GenerationStep>>();
    steps->reserve(responses.size());

#ifndef NDEBUG
    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses.size());
#endif

    // Transform tle::Response to GenerationStep
    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
        const auto reqId = r.getRequestId();
        if (!r.hasError()) {
            const auto result = r.getResult();
            return GenerationStep{
                    reqId,
                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
                    result.logProbs.value()[0][0],
                    result.isFinal,
                    false,
                    std::string()
            };
        } else {
            return GenerationStep{
                    reqId,
                    0,
                    0.0,
                    true,
                    true,
                    r.getErrorMsg()
            };
        }
    });

    return steps;
}
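
// Factory exposed to Rust: initializes the TRTLLM runtime/plugins once, then builds
// the backend implementation from the engine folder and executor worker paths passed
// across the FFI boundary as rust::Str.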
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
    SPDLOG_INFO("Creating TensorRT-LLM Backend");

    // Unconditionally call this to initialize and discover TRTLLM plugins
    InitializeBackend();

    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
    return std::make_unique<TensorRtLlmBackendImpl>(enginePath, executorPath);
}
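
// Illustrative usage sketch (not part of the original file): shows how the two FFI
// entry points above fit together when called from C++. The paths, token ids and
// sampling values below are placeholders, and `example_usage` is a hypothetical
// helper added only for demonstration.
int example_usage() {
    using namespace huggingface::tgi::backends;

    // Hypothetical locations for the compiled engines and the executor worker binary.
    auto backend = CreateTensorRtLlmBackend("/data/engines/llama", "/usr/local/bin/executorWorker");

    // Submit a small prompt; the returned id identifies this request in later pulls.
    const std::vector<uint32_t> prompt{1, 42, 7};
    const auto requestId = backend->Submit(
            rust::Slice<const uint32_t>(prompt.data(), prompt.size()),
            /* maxNewTokens = */ 16, /* topK = */ 50, /* topP = */ 0.9f,
            /* temperature = */ 0.7f, /* repetition_penalty = */ 1.0f,
            /* frequency_penalty = */ 0.0f, /* seed = */ 2024);

    // Each pull returns the steps produced since the last call, across all requests.
    const auto steps = backend->PullTokens();
    SPDLOG_INFO(FMT_STRING("Request {:d}: pulled {:d} generation step(s)"), requestId, steps->size());
    return 0;
}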