compute the maximum number of new tokens for each request independently

Morgan Funtowicz 2024-07-17 13:55:29 +00:00
parent a01cd030d4
commit 9220340ff7
4 changed files with 73 additions and 47 deletions


@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
namespace huggingface::tgi::backends {
using RequestId = tle::IdType;
using TokenId = tle::TokenIdType;
using TokenStreamingCallback = void(tle::TokenIdType);
/**
* Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
*/
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
/**
* Get the sampling configuration from the parameters provided by TGI
* @param topK
* @param topP
* @param temperature
* @param seed
* @param beamWidth
* @return
*/
tle::SamplingConfig GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
uint64_t seed,
std::optional<int32_t> beamWidth
);
/**
*
*/
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
const std::filesystem::path &executorWorker
);
/***
/**
* Indicate if the backend is ready to accept incoming request
* @return true if ready, false otherwise
*/
[[nodiscard]] bool IsReady() const;
/***
/**
* Query the executor for the number of token available for pulling
* @return
*/
[[nodiscard]] size_t NumResponsesReady() const;
/***
/**
* Submit a new generation task to the executor
* @param tokens
* @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
uint64_t seed
);
/***
/**
*
* @param requestId The request id to poll the generation results
* @return
*/
std::vector<tle::Response> Poll(RequestId requestId);
/***
/**
* Stop the underlying executor
*/
void Shutdown();
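
For orientation, the declarations in this header already sketch the intended calling pattern. Below is a minimal, hypothetical driver assuming only the constructor and methods shown above; the include path, token ids and sampling values are placeholders, not part of the commit.

#include <filesystem>
#include "backend.h"  // hypothetical include pulling in the declarations above

// Hypothetical driver; signatures are taken from the header hunks in this commit.
void RunSingleRequest(const std::filesystem::path &engines, const std::filesystem::path &worker) {
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, worker);
    if (!backend.IsReady()) return;

    // Submit(tokens, topK, topP, temperature, seed) returns the executor request id.
    const auto requestId = backend.Submit({1, 2, 3}, /*topK=*/50, /*topP=*/0.9f, /*temperature=*/0.7f, /*seed=*/42);
    while (backend.NumResponsesReady() == 0) { /* wait, yield or sleep */ }
    for (const auto &response : backend.Poll(requestId)) {
        // Each tle::Response carries the generated token ids and a final flag.
    }
    backend.Shutdown();
}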


@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
// Get the compute capabilities of the current hardware
nvmlDevice_t device;
int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
NVML_SUCCESS) {
SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
cudaComputeCapabilitiesMinor);
if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
}
}
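
For reference, the capability probe above assumes NVML has already been initialised elsewhere in the backend. A self-contained sketch of the same detection, hypothetical and not part of this commit, which also derives the chunked-context flag the way the next hunk does:

#include <nvml.h>

// Hypothetical standalone probe; the backend itself acquires the NVML handle as shown above.
static bool SupportsChunkedContext() {
    int major = 0, minor = 0;
    if (nvmlInit_v2() == NVML_SUCCESS) {
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
            // Fills e.g. 8/0 for A100 (sm_80) or 9/0 for H100 (sm_90).
            nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
        }
        nvmlShutdown();
    }
    return major >= 8;  // chunked context is only enabled on Ampere or newer
}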
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
// Define some configuration variables
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
return execConfig;
}
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
uint32_t topK,
float_t topP,
float_t temperature,
uint64_t seed,
std::optional<int32_t> beamWidth = std::nullopt) {
return tle::SamplingConfig(
beamWidth.value_or(1),
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
std::nullopt,
temperature,
std::nullopt
);
}
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
const std::filesystem::path &enginesFolder,
const std::filesystem::path &executorWorker
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
const float_t temperature,
const uint64_t seed
) {
#ifndef NDEBUG
SPDLOG_INFO(
#ifdef NDEBUG
SPDLOG_DEBUG(
FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
tokens.size(),
executor.getLatestIterationStats().back().numActiveRequests
);
#else
SPDLOG_INFO(
SPDLOG_DEBUG(
FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
fmt::join(tokens, ", "),
executor.getLatestIterationStats().back().numActiveRequests
executor.getLatestIterationStats().front().numActiveRequests
);
#endif
const auto sampling = tle::SamplingConfig{
1,
topK,
topP,
std::nullopt,
std::nullopt,
std::nullopt,
seed,
std::nullopt,
temperature,
std::nullopt,
};
const auto output = tle::OutputConfig{false, false, false};
const auto maxNumTokens = config["max_num_tokens"_json_pointer].get<size_t>();
const auto maxNewTokens = static_cast<int32_t>(std::max(1ul, maxNumTokens - tokens.size()));
const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
const auto output = tle::OutputConfig(false, false, false, true, false);
return executor.enqueueRequest(
tle::Request{tokens, std::numeric_limits<tle::SizeType32>::max(), true, sampling, output});
tle::Request{tokens, maxNewTokens, true, sampling, output});
}
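
The two added lines above are the point of the commit: rather than enqueueing every request with std::numeric_limits<tle::SizeType32>::max(), each request now receives max_num_tokens from the engine config minus its own prompt length, clamped to at least one token. A worked example of that arithmetic with hypothetical numbers:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Mirrors the added computation; max_num_tokens is read from config["max_num_tokens"] in the real code.
int32_t ComputeMaxNewTokens(size_t maxNumTokens, size_t promptLength) {
    // Never request fewer than one new token, exactly as the hunk above does.
    return static_cast<int32_t>(std::max(1ul, maxNumTokens - promptLength));
}
// ComputeMaxNewTokens(4096, 4000) == 96 and ComputeMaxNewTokens(4096, 4096) == 1.
// The subtraction is unsigned, so prompts longer than max_num_tokens are presumably rejected upstream.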
[[nodiscard("Generated tokens result must be used")]]
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
SPDLOG_INFO("Polling status for request {}", requestId);
SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
return executor.awaitResponses(requestId);
}


@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
);
info!("Releasing lock for submit");
return request_id;
request_id
})
.await;


@@ -4,7 +4,9 @@
#pragma once
#include <cmath>
#include <exception>
#include <filesystem>
#include <iterator>
#include <vector>
#include <spdlog/spdlog.h>
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
rust::Slice<const uint32_t> tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
// This will copy all the items from the initial slice
std::vector<int32_t> tokens_(tokens.size());
tokens_.assign(tokens.begin(), tokens.end());
std::vector<int32_t> tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
}
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
const uint64_t requestId,
rust::Box<huggingface::tgi::backends::GenerationContext> ctx,
rust::Fn<void(rust::Box<huggingface::tgi::backends::GenerationContext>, uint32_t, float_t, bool)> callback) {
SPDLOG_INFO("Entering StreamTokens");
size_t numTokens = 0;
for (const auto &item: Poll(requestId)) {
if (!item.hasError()) {
SPDLOG_INFO("\tStreamTokens -> Decoding token...");
SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
const auto decoded = item.getResult();
SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
const auto token = decoded.outputTokenIds[0][0];
const auto isFinal = decoded.isFinal;
// const auto logProb = decoded.logProbs.value()[0][0];
const auto logProb = 0.0;
const auto logProb = decoded.logProbs.value()[0][0];
SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
++numTokens;
SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
callback(std::move(ctx), token, logProb, isFinal);
SPDLOG_INFO("\tStreamTokens -> Post callback");
SPDLOG_DEBUG("\tStreamTokens -> Post callback");
} else {
// TODO : Return rest::Result with error
SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
}
}
SPDLOG_INFO("Exiting StreamTokens");
return 0;
return numTokens;
}
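
With this change StreamTokens reports how many tokens it actually forwarded instead of a hard-coded 0, and reads the real per-token log-probability now that the OutputConfig requests it. As a hedged illustration of the same consumption pattern without the cxx-specific rust::Box/rust::Fn types, a plain-C++ helper might look like the sketch below; the function name and the std::function callback are made up for the example and rely only on the tle::Response accessors used in this file.

#include <cstdint>
#include <functional>

// Hypothetical helper mirroring the loop above.
size_t DrainReadyTokens(huggingface::tgi::backends::TensorRtLlmBackend &backend,
                        uint64_t requestId,
                        const std::function<void(uint32_t token, float logProb, bool isFinal)> &onToken) {
    size_t numTokens = 0;
    for (const auto &item : backend.Poll(requestId)) {
        if (item.hasError()) continue;  // the real code logs item.getErrorMsg() instead
        const auto decoded = item.getResult();
        const auto token = decoded.outputTokenIds[0][0];
        const auto logProb = decoded.logProbs.value()[0][0];  // available because log-probs are requested
        onToken(token, logProb, decoded.isFinal);
        ++numTokens;
    }
    return numTokens;
}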
std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>