From bec188ff73f3d52793aa853402c5169a2543a2b5 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Jul 2024 22:32:41 +0000
Subject: [PATCH] bind to CUDA::nvml to retrieve compute capabilities at
 runtime

---
 backends/trtllm/CMakeLists.txt  |  4 +++-
 backends/trtllm/lib/backend.cpp | 17 +++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index ff0cb766..ab06175f 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -17,6 +17,8 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
 
+find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
+
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
 )
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)
 
 
 #### Unit Tests ####
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index ed9c685a..8e2cec82 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -1,11 +1,12 @@
 #include <fstream>
+#include <nvml.h>
 
 #include <spdlog/spdlog.h>
 
 #include "backend.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
-
+    nvmlInit_v2();
     initTrtLlmPlugins();
 }
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
     tle::ExecutorConfig execConfig(1);
 
     // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
+    nvmlDevice_t device;
+    int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
+
+    if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+        if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
+            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
+            execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+        }
+    }
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
 
     if(config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>() == 1){
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         std::optional<uint32_t> nTopTokens
 ) {
     spdlog::debug(
-            "Submitting inference over {:d} tokens to the executor {:d}",
+            FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
@@ -92,7 +101,7 @@
 }
 
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
-    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
     const auto responses = executor.awaitResponses(reqId);
     return responses;
 }
\ No newline at end of file
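
For reviewers who want to exercise the NVML sequence outside the backend, below is a minimal standalone sketch of the same detection logic. The file name probe.cpp and the build line are assumptions for illustration, not part of the patch; it compiles against the CUDA Toolkit headers and links with -lnvidia-ml (the same library the CUDA::nvml imported target from CMake's FindCUDAToolkit module resolves to).

// probe.cpp -- hypothetical standalone probe, not part of this patch.
// Build (paths are an assumption): g++ probe.cpp -I/usr/local/cuda/include -lnvidia-ml
#include <cstdio>
#include <nvml.h>

int main() {
    // Mirrors InitializeBackend(): bring up NVML before any query.
    if (nvmlInit_v2() != NVML_SUCCESS) {
        std::fprintf(stderr, "failed to initialize NVML\n");
        return 1;
    }

    // Mirrors GetExecutorConfig(): query device 0 and derive its sm_XY level.
    nvmlDevice_t device;
    int major = 0, minor = 0;
    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
        nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
        // The patch enables chunked context only from sm_80 (Ampere) onwards.
        std::printf("Detected sm_%d%d, chunked context %s\n",
                    major, minor, major >= 8 ? "enabled" : "disabled");
    }

    // The backend keeps NVML initialized for the process lifetime; a one-shot
    // probe should shut it down explicitly.
    nvmlShutdown();
    return 0;
}

Note that the patch itself ignores the nvmlInit_v2() return value and never calls nvmlShutdown(); both are defensible for a long-lived server process, but worth keeping in mind if InitializeBackend() is ever called more than once.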