Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-22 23:42:06 +00:00)
* feat(trtllm): rewrite health to not account for current state
* chore(looper): cleanup a bit more
* feat(post_processing): max_new_tokens is const evaluated now
* chore(ffi): formatting
* feat(trtllm): add stop words handling
  # Conflicts:
  #	backends/trtllm/lib/backend.cpp
* chore(trtllm): create specific parallelconfig factory and logging init methods
* chore(trtllm): define a macro for SizeType cast
* chore(trtllm): use GetParallelConfig
* chore(trtllm): minor refactoring
* chore(trtllm): validate there are enough GPUs on the system for the desired model
* chore(trtllm): ensure max throughput scheduling policy is selected
* chore(trtllm): minor fix
* chore(router): minor refactorings
* feat(docker): build with-slurm ompi
* feat(docker): add python3.10 dev to runtime deps
* chore(docker): add mpi to ld_library_path
* chore(docker): install transformers
* feat(trtllm): detect stop_words from generation_config.json
90 lines
2.9 KiB
C++
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once

#include <algorithm>
#include <cmath>  // float_t
#include <exception>
#include <filesystem>
#include <functional>
#include <limits>
#include <iterator>
#include <ranges>
#include <vector>

#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"

huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
        const std::string_view &engineFolder,
        const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}

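// Submit() is the FFI entry point invoked from the Rust router through the cxx bridge: it
// copies the prompt tokens out of the borrowed rust::Slice into an owned std::vector,
// forwards the request and its sampling parameters to TensorRtLlmBackend::Submit, and
// returns the request id used to match streamed tokens back to this request.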
uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
        rust::Slice<const uint32_t> tokens,
        uint32_t maxNewTokens,
        int32_t topK,
        float_t topP,
        float_t temperature,
        float_t repetition_penalty,
        float_t frequency_penalty,
        uint64_t seed) {

    // This will copy all the items from the initial slice
    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
    return TensorRtLlmBackend::Submit(
            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}

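// PullTokens() drains the responses produced by the executor since the last poll and
// converts each tle::Response into the GenerationStep struct shared with the Rust side;
// error responses are surfaced as final steps carrying the error message instead of a token.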
std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
    const auto responses = TensorRtLlmBackend::PullNewTokens();

    auto steps = std::make_unique<std::vector<GenerationStep>>();
    steps->reserve(responses.size());

#ifndef NDEBUG
    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses.size());
#endif

    // Transform tle::Response to GenerationStep
    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
        const auto reqId = r.getRequestId();
        if (!r.hasError()) {
            const auto result = r.getResult();
            return GenerationStep{
                    reqId,
                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
                    result.logProbs.value()[0][0],
                    result.isFinal,
                    false,
                    std::string()
            };
        } else {
            return GenerationStep{
                    reqId,
                    0,
                    0.0,
                    true,
                    true,
                    std::move(r.getErrorMsg())
            };
        }
    });

    return steps;
}

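// Factory exposed to Rust: eagerly calls InitializeBackend() so TensorRT-LLM plugins are
// discovered and registered before the backend is constructed from the engine folder and
// executor worker paths passed across the FFI boundary as rust::Str.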
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
    SPDLOG_INFO("Creating TensorRT-LLM Backend");
    // Unconditionally call this to initialize and discover TRTLLM plugins
    InitializeBackend();

    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}