From bec188ff73f3d52793aa853402c5169a2543a2b5 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Jul 2024 22:32:41 +0000
Subject: [PATCH] bind to CUDA::nvml to retrieve compute capabilities at
 runtime

---
 backends/trtllm/CMakeLists.txt  |  4 +++-
 backends/trtllm/lib/backend.cpp | 17 +++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index ff0cb766..ab06175f 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -17,6 +17,8 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 
+find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
+
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
 )
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)
 
 
 #### Unit Tests ####
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index ed9c685a..8e2cec82 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -1,11 +1,12 @@
 #include <fstream>
+#include <nvml.h>
 
 #include <spdlog/spdlog.h>
 
 #include "backend.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
-
+    nvmlInit_v2();
     initTrtLlmPlugins();
 }
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
     tle::ExecutorConfig execConfig(1);
 
     // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
+    nvmlDevice_t device;
+    int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
+
+    if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+        if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
+            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
+            execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+        }
+    }
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
 
     if(config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>() == 1){
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         std::optional<uint32_t> nTopTokens
 ) {
     spdlog::debug(
-            "Submitting inference over {:d} tokens to the executor {:d}",
+            FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
@@ -92,7 +101,7 @@
 }
 
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
-    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
     const auto responses = executor.awaitResponses(reqId);
     return responses;
 }
\ No newline at end of file
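
For reviewers who want to exercise the NVML sequence outside the backend, below is a minimal standalone sketch of the same detection logic. The file name probe.cpp and the build line are assumptions for illustration, not part of the patch; it compiles against the CUDA Toolkit headers and links with -lnvidia-ml (the same library the CUDA::nvml imported target from CMake's FindCUDAToolkit module resolves to).

// probe.cpp -- hypothetical standalone probe, not part of this patch.
// Build (paths are an assumption): g++ probe.cpp -I/usr/local/cuda/include -lnvidia-ml
#include <cstdio>
#include <nvml.h>

int main() {
    // Mirrors InitializeBackend(): bring up NVML before any query.
    if (nvmlInit_v2() != NVML_SUCCESS) {
        std::fprintf(stderr, "failed to initialize NVML\n");
        return 1;
    }

    // Mirrors GetExecutorConfig(): query device 0 and derive its sm_XY level.
    nvmlDevice_t device;
    int major = 0, minor = 0;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
        nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
        // The patch enables chunked context only from sm_80 (Ampere) onwards.
        std::printf("Detected sm_%d%d, chunked context %s\n",
                    major, minor, major >= 8 ? "enabled" : "disabled");
    }

    // The backend keeps NVML initialized for the process lifetime; a one-shot
    // probe should shut it down explicitly.
    nvmlShutdown();
    return 0;
}

Note that the patch itself ignores the nvmlInit_v2() return value and never calls nvmlShutdown(); both are defensible for a long-lived server process, but worth keeping in mind if InitializeBackend() is ever called more than once.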