Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-06-19 15:52:08 +00:00

Commit 9220340ff7 (parent a01cd030d4): compute the maximum number of new tokens for each request independently
Backend header (C++ interface declarations):

@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
-    using TokenStreamingCallback = void(tle::TokenIdType);
 
     /**
      * Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
      */
     tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Get the sampling configuration from the parameters provided by TGI
+     * @param topK
+     * @param topP
+     * @param temperature
+     * @param seed
+     * @param beamWidth
+     * @return
+     */
+    tle::SamplingConfig GetSamplingConfig(
+            uint32_t topK,
+            float_t topP,
+            float_t temperature,
+            uint64_t seed,
+            std::optional<int32_t> beamWidth
+    );
+
     /**
      *
      */
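For illustration only, a call site for the helper declared above could look like the sketch below. The numeric values, the function name, and the "backend.h" include are assumptions, not part of the diff; beamWidth is passed as std::nullopt so the implementation's single-beam default applies.

    // Sketch of a caller building a sampling config from TGI request parameters.
    // "backend.h" is an assumed name for the header shown in this hunk.
    #include <cstdint>
    #include <optional>
    #include "backend.h"

    tle::SamplingConfig BuildSamplingForRequest(uint64_t seed) {
        // topK = 10, topP = 0.95, temperature = 0.8 are illustrative request values;
        // beamWidth is left unset, so the helper falls back to a single beam.
        return huggingface::tgi::backends::GetSamplingConfig(10, 0.95f, 0.8f, seed, std::nullopt);
    }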
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
-        /***
+        /**
          * Indicate if the backend is ready to accept incoming request
          * @return true if ready, false otherwise
          */
         [[nodiscard]] bool IsReady() const;
 
-        /***
+        /**
         * Query the executor for the number of token available for pulling
         * @return
         */
        [[nodiscard]] size_t NumResponsesReady() const;
 
-        /***
+        /**
         * Submit a new generation task to the executor
         * @param tokens
         * @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
             uint64_t seed
         );
 
-        /***
+        /**
         *
        * @param requestId The request id to poll the generation results
        * @return
        */
        std::vector<tle::Response> Poll(RequestId requestId);
 
-        /***
+        /**
        * Stop the underlying executor
        */
        void Shutdown();
Backend implementation (C++):

@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Get the compute capabilities of the current hardware
     nvmlDevice_t device;
-    int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
+    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
     if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
         SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
-            NVML_SUCCESS) {
-            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
-                        cudaComputeCapabilitiesMinor);
+        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
+            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
         }
     }
 
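As a standalone reference (not part of the diff), the same compute-capability probe can be exercised on its own with the NVML API. This minimal sketch initializes NVML itself and links against libnvidia-ml; the backend above relies on NVML being initialized elsewhere.

    // Minimal NVML sketch: query the CUDA compute capability of GPU 0,
    // mirroring the calls used in GetExecutorConfig above.
    // Build with: g++ probe.cc -lnvidia-ml   (file name is illustrative)
    #include <cstdio>
    #include <nvml.h>

    int main() {
        if (nvmlInit_v2() != NVML_SUCCESS) return 1;

        nvmlDevice_t device;
        int major = 0, minor = 0;
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
            nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
            std::printf("Detected sm_%d%d\n", major, minor);   // e.g. sm_90 on a Hopper GPU
        }

        nvmlShutdown();
        return 0;
    }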
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
     return execConfig;
 }
 
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+        uint32_t topK,
+        float_t topP,
+        float_t temperature,
+        uint64_t seed,
+        std::optional<int32_t> beamWidth = std::nullopt) {
+    return tle::SamplingConfig(
+            beamWidth.value_or(1),
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            std::nullopt,
+            temperature,
+            std::nullopt
+    );
+}
+
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
         const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
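The chunked-context switch above keys on the compute major version only: it is enabled for sm_80 and newer (Ampere-class and later) and disabled otherwise. A trivial standalone restatement of that gate, for reference (names are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Same condition as execConfig.setEnableChunkedContext(cudaComputeMajor >= 8):
    // chunked context is only enabled on compute capability 8.x or newer.
    constexpr bool ChunkedContextSupported(int32_t computeMajor) {
        return computeMajor >= 8;
    }

    int main() {
        std::printf("major 7 (e.g. sm_75): %s\n", ChunkedContextSupported(7) ? "enabled" : "disabled");
        std::printf("major 8 (e.g. sm_80): %s\n", ChunkedContextSupported(8) ? "enabled" : "disabled");
        std::printf("major 9 (e.g. sm_90): %s\n", ChunkedContextSupported(9) ? "enabled" : "disabled");
        return 0;
    }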
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const float_t temperature,
         const uint64_t seed
 ) {
-#ifndef NDEBUG
-    SPDLOG_INFO(
+#ifdef NDEBUG
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
 #else
-    SPDLOG_INFO(
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
             fmt::join(tokens, ", "),
-            executor.getLatestIterationStats().back().numActiveRequests
+            executor.getLatestIterationStats().front().numActiveRequests
     );
 #endif
 
-    const auto sampling = tle::SamplingConfig{
-            1,
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            std::nullopt,
-            temperature,
-            std::nullopt,
-    };
-    const auto output = tle::OutputConfig{false, false, false};
+    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
+    const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
+
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
+    const auto output = tle::OutputConfig(false, false, false, true, false);
     return executor.enqueueRequest(
-            tle::Request{tokens, std::numeric_limits<tle::SizeType32>::max(), true, sampling, output});
+            tle::Request{tokens, maxNewTokens, true, sampling, output});
 }
 
 [[nodiscard("Generated tokens result must be used")]]
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
-    SPDLOG_INFO("Polling status for request {}", requestId);
+    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
     return executor.awaitResponses(requestId);
 }
 
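This hunk is the heart of the commit: instead of asking the executor for an unbounded number of new tokens (std::numeric_limits<tle::SizeType32>::max()), each request now receives whatever budget remains once its own prompt is subtracted from the engine's max_num_tokens. A minimal, self-contained sketch of that arithmetic follows; the values 4096 and 1000 and the helper name are hypothetical, not from the diff.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors: maxNewTokens = max(1, maxNumTokens - tokens.size()).
    // Assumes the prompt is shorter than maxNumTokens, as in the diff
    // (the subtraction is performed on unsigned values).
    int32_t MaxNewTokensFor(size_t maxNumTokens, size_t promptLength) {
        return static_cast<int32_t>(std::max<size_t>(1, maxNumTokens - promptLength));
    }

    int main() {
        // Hypothetical engine limit of 4096 tokens and a 1000-token prompt:
        // the request may generate at most 4096 - 1000 = 3096 new tokens.
        std::printf("%d\n", MaxNewTokensFor(4096, 1000));
        return 0;
    }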
Rust backend:

@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
                 );
 
                 info!("Releasing lock for submit");
-                return request_id;
+                request_id
             })
             .await;
 
FFI layer between the Rust frontend and the C++ backend:

@@ -4,7 +4,9 @@
 #pragma once
 
 #include <cmath>
+#include <exception>
 #include <filesystem>
+#include <iterator>
 #include <vector>
 
 #include <spdlog/spdlog.h>
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
         rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
 
     // This will copy all the items from the initial slice
-    std::vector<int32_t> tokens_(tokens.size());
-    tokens_.assign(tokens.begin(), tokens.end());
+    std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
 
     return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
 }
 
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
-        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
-        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
+size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+        const uint64_t requestId,
+        rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
+        rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
 
-    SPDLOG_INFO("Entering StreamTokens");
+    size_t numTokens = 0;
     for (const auto &item: Poll(requestId)) {
         if (!item.hasError()) {
-            SPDLOG_INFO("\tStreamTokens -> Decoding token...");
+            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
             const auto decoded = item.getResult();
-            SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
 
             const auto token = decoded.outputTokenIds[0][0];
             const auto isFinal = decoded.isFinal;
-            // const auto logProb = decoded.logProbs.value()[0][0];
-            const auto logProb = 0.0;
+            const auto logProb = decoded.logProbs.value()[0][0];
 
-            SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+            ++numTokens;
 
+            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
             callback(std::move(ctx), token, logProb, isFinal);
-            SPDLOG_INFO("\tStreamTokens -> Post callback");
+            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
         } else {
             // TODO : Return rest::Result with error
             SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
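For reference, a self-contained sketch of the slice-to-vector conversion performed in Submit in the hunk above. cxx's rust::Slice is a non-owning view over the Rust &[u32] and exposes begin()/end(), so the conversion is an element-wise copy into the int32_t vector the backend expects (moving const uint32_t elements, as the diff does, degrades to the same copy). The function name is illustrative.

    #include <cstdint>
    #include <vector>
    #include "rust/cxx.h"   // provides rust::Slice (shipped with the cxx bridge)

    // Copy the u32 token ids handed over from Rust into the int32_t vector
    // expected by TensorRtLlmBackend::Submit.
    std::vector<int32_t> CopyTokens(rust::Slice<const uint32_t> tokens) {
        return std::vector<int32_t>(tokens.begin(), tokens.end());
    }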
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
         }
     }
 
-    SPDLOG_INFO("Exiting StreamTokens");
-    return 0;
+    return numTokens;
 }
 
 std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>