mirror of https://github.com/huggingface/text-generation-inference.git
compute the number of maximum new tokens for each request independently

commit 9220340ff7
parent a01cd030d4
@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
-    using TokenStreamingCallback = void(tle::TokenIdType);
 
     /**
      * Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Get the sampling configuration from the parameters provided by TGI
+     * @param topK
+     * @param topP
+     * @param temperature
+     * @param seed
+     * @param beamWidth
+     * @return
+     */
+    tle::SamplingConfig GetSamplingConfig(
+            uint32_t topK,
+            float_t topP,
+            float_t temperature,
+            uint64_t seed,
+            std::optional<int32_t> beamWidth
+    );
+
     /**
      *
      */
@@ -52,19 +68,19 @@
                 const std::filesystem::path &executorWorker
         );
 
-        /***
+        /**
         * Indicate if the backend is ready to accept incoming request
         * @return true if ready, false otherwise
         */
        [[nodiscard]] bool IsReady() const;
 
-        /***
+        /**
         * Query the executor for the number of token available for pulling
        * @return
        */
        [[nodiscard]] size_t NumResponsesReady() const;
 
-        /***
+        /**
        * Submit a new generation task to the executor
        * @param tokens
        * @param maxNewTokens
@@ -82,14 +98,14 @@
                 uint64_t seed
         );
 
-        /***
+        /**
        *
        * @param requestId The request id to poll the generation results
        * @return
        */
        std::vector<tle::Response> Poll(RequestId requestId);
 
-        /***
+        /**
        * Stop the underlying executor
        */
        void Shutdown();
@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Get the compute capabilities of the current hardware
     nvmlDevice_t device;
-    int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
+    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
     if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
         SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
-            NVML_SUCCESS) {
-            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
-                        cudaComputeCapabilitiesMinor);
+        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
+            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
         }
     }
 
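For context, here is a minimal standalone sketch of the same NVML query the hunk above relies on. It assumes linking against libnvidia-ml, uses only documented NVML calls, and reduces error handling to the bare minimum; it is not part of the commit.

// Query the CUDA compute capability of GPU 0 via NVML and derive the same
// sm_XY gate used by the backend (chunked context is enabled for sm_80+).
#include <cstdio>
#include <nvml.h>

int main() {
    int major = 0, minor = 0;
    if (nvmlInit_v2() != NVML_SUCCESS) return 1;

    nvmlDevice_t device;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
        nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
        std::printf("Detected sm_%d%d\n", major, minor);
        std::printf("Chunked context eligible (sm_80+): %s\n", major >= 8 ? "yes" : "no");
    }

    nvmlShutdown();
    return 0;
}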
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
     return execConfig;
 }
 
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+        uint32_t topK,
+        float_t topP,
+        float_t temperature,
+        uint64_t seed,
+        std::optional<int32_t> beamWidth = std::nullopt) {
+    return tle::SamplingConfig(
+            beamWidth.value_or(1),
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            std::nullopt,
+            temperature,
+            std::nullopt
+    );
+}
+
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
         const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
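The new GetSamplingConfig helper relies on std::optional::value_or to default the beam width to a single beam when callers only pass the knobs TGI exposes. A generic sketch of that defaulting pattern follows; the BeamSettings struct and MakeSettings helper are hypothetical stand-ins for tle::SamplingConfig and are not part of the commit.

// Defaulting an optional knob with value_or, as GetSamplingConfig does for beamWidth.
#include <cstdint>
#include <optional>

struct BeamSettings {     // hypothetical stand-in for tle::SamplingConfig
    int32_t beamWidth;
    uint32_t topK;
    float topP;
};

BeamSettings MakeSettings(uint32_t topK, float topP,
                          std::optional<int32_t> beamWidth = std::nullopt) {
    // A single beam (width 1) means no beam search; callers opt in explicitly.
    return BeamSettings{beamWidth.value_or(1), topK, topP};
}

int main() {
    const auto defaulted = MakeSettings(10, 0.9f);     // beamWidth falls back to 1
    const auto explicitW = MakeSettings(10, 0.9f, 4);  // caller-provided beam width
    return (defaulted.beamWidth == 1 && explicitW.beamWidth == 4) ? 0 : 1;
}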
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const float_t temperature,
         const uint64_t seed
 ) {
-#ifndef NDEBUG
-    SPDLOG_INFO(
+#ifdef NDEBUG
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
 #else
-    SPDLOG_INFO(
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
             fmt::join(tokens, ", "),
-            executor.getLatestIterationStats().back().numActiveRequests
+            executor.getLatestIterationStats().front().numActiveRequests
     );
 #endif
 
-    const auto sampling = tle::SamplingConfig{
-            1,
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            std::nullopt,
-            temperature,
-            std::nullopt,
-    };
-    const auto output = tle::OutputConfig{false, false, false};
+    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
+    const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
+
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
+    const auto output = tle::OutputConfig(false, false, false, true, false);
     return executor.enqueueRequest(
-            tle::Request{tokens, std::numeric_limits<tle::SizeType32>::max(), true, sampling, output});
+            tle::Request{tokens, maxNewTokens, true, sampling, output});
 }
 
 [[nodiscard("Generated tokens result must be used")]]
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
-    SPDLOG_INFO("Polling status for request {}", requestId);
+    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
     return executor.awaitResponses(requestId);
 }
 
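The core of the commit is the per-request token budget in Submit above: instead of asking the executor for std::numeric_limits<tle::SizeType32>::max() new tokens, each request now gets max(1, max_num_tokens - prompt length). A self-contained sketch of that arithmetic follows; the helper name and sample numbers are illustrative, and the sketch additionally clamps the unsigned subtraction, which the diff leaves to the caller.

// Per-request new-token budget: whatever room the prompt leaves in max_num_tokens,
// but never less than one token.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int32_t ComputeMaxNewTokens(size_t maxNumTokens, size_t promptLength) {
    // Clamp the prompt length first so the unsigned subtraction cannot wrap,
    // then keep at least one new token for the request.
    const size_t room = maxNumTokens - std::min(promptLength, maxNumTokens);
    return static_cast<int32_t>(std::max<size_t>(1, room));
}

int main() {
    std::printf("%d\n", ComputeMaxNewTokens(4096, 130));   // 3966 new tokens allowed
    std::printf("%d\n", ComputeMaxNewTokens(4096, 5000));  // prompt overflows: clamped to 1
    return 0;
}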
@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
                 );
 
                 info!("Releasing lock for submit");
-                return request_id;
+                request_id
             })
             .await;
 
@@ -4,7 +4,9 @@
 #pragma once
 
 #include <cmath>
+#include <exception>
 #include <filesystem>
+#include <iterator>
 #include <vector>
 
 #include <spdlog/spdlog.h>
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
         rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
 
     // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.size());
-    tokens_.assign(tokens.begin(), tokens.end());
-
+    std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
     return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
 }
 
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
-        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
-        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
+size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+        const uint64_t requestId,
+        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
+        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
 
-    SPDLOG_INFO("Entering StreamTokens");
+    size_t numTokens = 0;
     for (const auto &item: Poll(requestId)) {
         if (!item.hasError()) {
-            SPDLOG_INFO("\tStreamTokens -> Decoding token...");
+            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
             const auto decoded = item.getResult();
-            SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
 
             const auto token = decoded.outputTokenIds[0][0];
             const auto isFinal = decoded.isFinal;
-            // const auto logProb = decoded.logProbs.value()[0][0];
-            const auto logProb = 0.0;
+            const auto logProb = decoded.logProbs.value()[0][0];
 
-            SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+            ++numTokens;
+
+            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
             callback(std::move(ctx), token, logProb, isFinal);
-            SPDLOG_INFO("\tStreamTokens -> Post callback");
+            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
         } else {
             // TODO : Return rest::Result with error
             SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
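The Submit shim above only needs to turn the FFI's unsigned token ids into the std::vector<int32_t> the backend expects. Below is a small sketch of that bridge, with a plain pointer/length pair standing in for cxx's rust::Slice<const uint32_t>; names and values are illustrative, not part of the commit.

// Copy-and-convert a span of unsigned token ids into the signed vector the backend takes.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int32_t> CopyTokens(const uint32_t *data, size_t len) {
    // The iterator-range constructor copies and converts each element in one pass,
    // mirroring what the single-line vector construction in the diff does.
    return std::vector<int32_t>(data, data + len);
}

int main() {
    const uint32_t ids[] = {1, 15043, 3186, 2};
    const auto tokens = CopyTokens(ids, 4);
    std::printf("%zu tokens, first = %d\n", tokens.size(), tokens.front());
    return 0;
}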
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
         }
     }
 
-    SPDLOG_INFO("Exiting StreamTokens");
-    return 0;
+    return numTokens;
 }
 
 std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
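Taken together, the two StreamTokens hunks poll the executor, forward each decoded token to the Rust callback, and now return the number of tokens actually streamed instead of a hard-coded 0. The schematic, dependency-free sketch below shows that flow; Response, the callback signature, and the sample values are simplified stand-ins for the real tle::Response and rust::Fn types and are not part of the commit.

// Schematic StreamTokens: walk the polled responses, invoke the callback per token,
// count what was delivered, skip errored items.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

struct Response {       // hypothetical: one decoded step
    uint32_t token;
    float logProb;
    bool isFinal;
    bool hasError;
};

static size_t StreamTokens(const std::vector<Response> &responses,
                           const std::function<void(uint32_t, float, bool)> &callback) {
    size_t numTokens = 0;
    for (const auto &item : responses) {
        if (item.hasError) continue;   // the real code logs a warning and moves on
        ++numTokens;
        callback(item.token, item.logProb, item.isFinal);
    }
    return numTokens;                  // mirrors the diff's change from return 0 to return numTokens
}

int main() {
    const std::vector<Response> fake = {{42, -0.12f, false, false}, {7, -0.50f, true, false}};
    const auto n = StreamTokens(fake, [](uint32_t t, float lp, bool fin) {
        std::printf("token=%u logprob=%.2f final=%d\n", t, lp, fin);
    });
    std::printf("streamed %zu tokens\n", n);
    return 0;
}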