diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index b19b5d7e..d84bc253 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -22,7 +22,6 @@ namespace tle = tensorrt_llm::executor;
 namespace huggingface::tgi::backends {
     using RequestId = tle::IdType;
     using TokenId = tle::TokenIdType;
-    using TokenStreamingCallback = void(tle::TokenIdType);
 
     /**
      * Initialize all the components required by TRTLLM.
@@ -38,6 +37,23 @@ namespace huggingface::tgi::backends {
      */
    tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
 
+    /**
+     * Get the sampling configuration from the parameters provided by TGI
+     * @param topK
+     * @param topP
+     * @param temperature
+     * @param seed
+     * @param beamWidth
+     * @return
+     */
+    tle::SamplingConfig GetSamplingConfig(
+            uint32_t topK,
+            float_t topP,
+            float_t temperature,
+            uint64_t seed,
+            std::optional beamWidth
+    );
+
     /**
      *
      */
@@ -52,19 +68,19 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
-        /***
+        /**
          * Indicate if the backend is ready to accept incoming request
          * @return true if ready, false otherwise
          */
         [[nodiscard]] bool IsReady() const;
 
-        /***
+        /**
          * Query the executor for the number of token available for pulling
         * @return
          */
         [[nodiscard]] size_t NumResponsesReady() const;
 
-        /***
+        /**
          * Submit a new generation task to the executor
         * @param tokens
         * @param maxNewTokens
@@ -82,14 +98,14 @@ namespace huggingface::tgi::backends {
                 uint64_t seed
         );
 
-        /***
+        /**
         *
         * @param requestId The request id to poll the generation results
         * @return
         */
        std::vector<tle::Response> Poll(RequestId requestId);
 
-        /***
+        /**
         * Stop the underlying executor
         */
        void Shutdown();
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index aca718c4..161dea5a 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -18,13 +18,11 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Get the compute capabilities of the current hardware
     nvmlDevice_t device;
-    int32_t cudaComputeCapabilitiesMajor = 0, cudaComputeCapabilitiesMinor = 0;
+    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
     if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
         SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) ==
-            NVML_SUCCESS) {
-            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor,
-                        cudaComputeCapabilitiesMinor);
+        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
+            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
         }
     }
 
@@ -51,10 +49,30 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
     return execConfig;
 }
 
+tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
+        uint32_t topK,
+        float_t topP,
+        float_t temperature,
+        uint64_t seed,
+        std::optional beamWidth = std::nullopt) {
+    return tle::SamplingConfig(
+            beamWidth.value_or(1),
+            topK,
+            topP,
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            seed,
+            std::nullopt,
+            temperature,
+            std::nullopt
+    );
+}
+
 huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
         const std::filesystem::path &enginesFolder,
         const std::filesystem::path &executorWorker
@@ -84,40 +102,32 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const float_t temperature,
         const uint64_t seed
 ) {
-#ifndef NDEBUG
-    SPDLOG_INFO(
+#ifdef NDEBUG
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
 #else
-    SPDLOG_INFO(
+    SPDLOG_DEBUG(
             FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"),
             fmt::join(tokens, ", "),
-            executor.getLatestIterationStats().back().numActiveRequests
+            executor.getLatestIterationStats().front().numActiveRequests
     );
 #endif
 
-    const auto sampling = tle::SamplingConfig{
-            1,
-            topK,
-            topP,
-            std::nullopt,
-            std::nullopt,
-            std::nullopt,
-            seed,
-            std::nullopt,
-            temperature,
-            std::nullopt,
-    };
-    const auto output = tle::OutputConfig{false, false, false};
+    const auto maxNumTokens = config["max_num_tokens"_json_pointer].get();
+    const auto maxNewTokens = static_cast(std::max(1ul, maxNumTokens - tokens.size()));
+
+    const auto sampling = GetSamplingConfig(topK, topP, temperature, seed);
+    const auto output = tle::OutputConfig(false, false, false, true, false);
     return executor.enqueueRequest(
-            tle::Request{tokens, std::numeric_limits::max(), true, sampling, output});
+            tle::Request{tokens, maxNewTokens, true, sampling, output});
 }
 
 [[nodiscard("Generated tokens result must be used")]]
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) {
-    SPDLOG_INFO("Polling status for request {}", requestId);
+    SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId);
     return executor.awaitResponses(requestId);
 }
 
diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs
index b59e2006..d3f56ad9 100644
--- a/backends/trtllm/src/backend.rs
+++ b/backends/trtllm/src/backend.rs
@@ -156,7 +156,7 @@ impl Backend for TensorRtLlmBackend {
                 );
                 info!("Releasing lock for submit");
 
-                return request_id;
+                request_id
             })
             .await;
 
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 2920eda0..43d6c9f2 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -4,7 +4,9 @@
 #pragma once
 
 #include
+#include
 #include
+#include
 #include
 #include
 
@@ -25,31 +27,30 @@ uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
         rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, uint64_t seed) {
 
     // This will copy all the items from the initial slice
-    std::vector tokens_(tokens.size());
-    tokens_.assign(tokens.begin(), tokens.end());
-
+    std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end()));
     return TensorRtLlmBackend::Submit(std::move(tokens_), topK, topP, temperature, seed);
 }
 
-size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const uint64_t requestId,
-                                                                        rust::Box ctx,
-                                                                        rust::Fn, uint32_t, float_t, bool)> callback) {
+size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(
+        const uint64_t requestId,
+        rust::Box ctx,
+        rust::Fn, uint32_t, float_t, bool)> callback) {
 
-    SPDLOG_INFO("Entering StreamTokens");
+    size_t numTokens = 0;
     for (const auto &item: Poll(requestId)) {
         if (!item.hasError()) {
-            SPDLOG_INFO("\tStreamTokens -> Decoding token...");
+            SPDLOG_DEBUG("\tStreamTokens -> Decoding token...");
             const auto decoded = item.getResult();
-            SPDLOG_INFO("\tStreamTokens -> Successfully read decoded token ({})", decoded.outputTokenIds[0].size());
 
             const auto token = decoded.outputTokenIds[0][0];
             const auto isFinal = decoded.isFinal;
-//            const auto logProb = decoded.logProbs.value()[0][0];
-            const auto logProb = 0.0;
+            const auto logProb = decoded.logProbs.value()[0][0];
 
-            SPDLOG_INFO(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
+            ++numTokens;
+
+            SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal);
             callback(std::move(ctx), token, logProb, isFinal);
-            SPDLOG_INFO("\tStreamTokens -> Post callback");
+            SPDLOG_DEBUG("\tStreamTokens -> Post callback");
         } else {
             // TODO : Return rest::Result with error
             SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", item.getErrorMsg());
@@ -57,8 +58,7 @@ size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens(const ui
         }
     }
 
-    SPDLOG_INFO("Exiting StreamTokens");
-    return 0;
+    return numTokens;
 }
 
 std::unique_ptr