From ef1876346cf73a04cc43bf0d68da5df2849ee31a Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Tue, 23 Jul 2024 22:12:42 +0000
Subject: [PATCH] refactor the compute capabilities detection along with num
 gpus

---
 backends/trtllm/CMakeLists.txt     |  2 +-
 backends/trtllm/include/hardware.h | 59 ++++++++++++++++++++++++++++++
 backends/trtllm/lib/backend.cpp    | 26 +++++++------
 3 files changed, 74 insertions(+), 13 deletions(-)
 create mode 100644 backends/trtllm/include/hardware.h

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 20e95c4d..fd58f9a4 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -29,7 +29,7 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
 
 # TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $
diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h
new file mode 100644
index 00000000..da0bf4f3
--- /dev/null
+++ b/backends/trtllm/include/hardware.h
@@ -0,0 +1,59 @@
+//
+// Created by mfuntowicz on 7/23/24.
+//
+
+#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
+#define TGI_TRTLLM_BACKEND_HARDWARE_H
+
+#include <cstdint>
+#include <limits>
+#include <nvml.h>
+#include <optional>
+#include <spdlog/spdlog.h>
+
+namespace huggingface::hardware::cuda {
+
+#define AMPERE_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
+
+    /**
+     * Store information about the version of the CUDA Compute Capabilities detected on the device
+     */
+    struct CudaComputeCapabilities {
+        int32_t major;
+        int32_t minor;
+
+        [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+
+        [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+    };
+
+    CudaComputeCapabilities GetCudaComputeCapabilities() {
+        // Get the compute capabilities of the current hardware
+        nvmlDevice_t device;
+        CudaComputeCapabilities capabilities{0, 0};
+        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
+            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
+                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
+            }
+        }
+
+        return capabilities;
+    }
+
+    /**
+     * Return the number of GPUs detected.
+     * If the device count cannot be queried, return std::nullopt
+     * @return
+     */
+    std::optional<size_t> GetNumDevices() {
+        uint32_t numGpus = 0;
+        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
+            return std::optional(numGpus);
+        } else {
+            return std::nullopt;
+        }
+    }
+}
+#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 26728241..09d27216 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -1,30 +1,31 @@
 #include <fstream>
-#include <nvml.h>
 
 #include <fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <nvml.h>
 
 #include "backend.h"
+#include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
+
+    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
+    if (numGpus.has_value()) {
+        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
+    } else {
+        SPDLOG_WARN("Failed to detect Nvidia GPU(s) on the system");
+    }
 }
 
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(1);
 
-    // Get the compute capabilities of the current hardware
-    nvmlDevice_t device;
-    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
-    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
-        SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
-            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
-        }
-    }
+    // Retrieve the compute capabilities to enable some options at runtime
+    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
 
     // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
     if (config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>() == 1) {
@@ -43,13 +44,13 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
                 tle::CommunicationMode::kORCHESTRATOR,
                 std::nullopt,
                 std::nullopt,
-                tle::OrchestratorConfig(true, workerPath)
+                tle::OrchestratorConfig(true, workerPath, nullptr, true)
         ));
     }
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
+    execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
     return execConfig;
 }
 
@@ -94,6 +95,7 @@ bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
     return executor.canEnqueueRequests();
 }
 
+[[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
     return executor.getNumResponsesReady();
 }
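
Note (not part of the patch): the sketch below shows how the new hardware.h
helpers introduced above can be consumed by a standalone probe. It assumes
NVML and spdlog headers are available at build time and that the binary is
linked against libnvidia-ml; the file name and build flags are illustrative
only, not taken from the repository.

// probe.cpp -- hypothetical consumer of backends/trtllm/include/hardware.h
// Build sketch: g++ -std=c++20 probe.cpp -Ibackends/trtllm/include -lnvidia-ml
#include <cstdio>
#include <nvml.h>

#include "hardware.h"

int main() {
    // NVML must be initialized before any query, mirroring InitializeBackend()
    if (nvmlInit_v2() != NVML_SUCCESS) {
        std::fprintf(stderr, "Failed to initialize NVML\n");
        return 1;
    }

    // GetNumDevices() yields std::nullopt when the device count query fails
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    std::printf("GPUs detected: %zu\n", numGpus.value_or(0));

    // Compute capabilities of device 0 gate runtime features; the backend
    // enables chunked context only on Ampere (sm_80) or newer
    const auto caps = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    std::printf("sm_%d%d, post-Ampere: %s\n",
                (int) caps.major, (int) caps.minor,
                caps.isPostAmpere() ? "yes" : "no");

    nvmlShutdown();
    return 0;
}

Reading CudaComputeCapabilities once at configuration time, as the patch does
in GetExecutorConfig, avoids repeating the NVML handshake on every request.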