refactor the compute capabilities detection along with num gpus

Morgan Funtowicz 2024-07-23 22:12:42 +00:00
parent 3c39ab5ac8
commit ef1876346c
3 changed files with 74 additions and 13 deletions

View File: CMakeLists.txt

@@ -29,7 +29,7 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
 
 # TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>

View File: include/hardware.h

@@ -0,0 +1,59 @@
+//
+// Created by mfuntowicz on 7/23/24.
+//
+#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
+#define TGI_TRTLLM_BACKEND_HARDWARE_H
+
+#include <cstdint>
+#include <optional>
+
+#include <fmt/base.h>
+#include <spdlog/spdlog.h>
+#include <nvml.h>
+
+namespace huggingface::hardware::cuda {
+
+#define AMPERE_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
+
+    /**
+     * Store information about the version of the CUDA Compute Capabilities detected on the device
+     */
+    struct CudaComputeCapabilities {
+        int32_t major;
+        int32_t minor;
+
+        [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+
+        [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+    };
+
+    CudaComputeCapabilities GetCudaComputeCapabilities() {
+        // Get the compute capabilities of the current hardware
+        nvmlDevice_t device;
+        CudaComputeCapabilities capabilities{0, 0};
+
+        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
+            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
+                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
+            }
+        }
+
+        return capabilities;
+    }
+
+    /**
+     * Return the number of GPUs detected, or std::nullopt if the NVML query fails
+     * @return
+     */
+    std::optional<size_t> GetNumDevices() {
+        uint32_t numGpus = 0;
+        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
+            return std::optional(numGpus);
+        } else {
+            return std::nullopt;
+        }
+    }
+}
+#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
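For context, the two helpers above are meant to be called after NVML has been initialized. Below is a minimal standalone sketch of how a caller might exercise them; the main wrapper and the explicit nvmlInit_v2/nvmlShutdown pairing are illustrative only (the backend does its initialization in InitializeBackend), and it assumes hardware.h is on the include path and the binary links against NVML and spdlog.

#include <cstdio>
#include <nvml.h>
#include "hardware.h"

int main() {
    // NVML must be initialized before any device query; the backend does
    // this in InitializeBackend(), here we do it explicitly.
    if (nvmlInit_v2() != NVML_SUCCESS) return 1;

    // GetNumDevices() returns std::nullopt when the NVML query fails
    if (const auto numGpus = huggingface::hardware::cuda::GetNumDevices()) {
        std::printf("GPUs detected: %zu\n", *numGpus);
    }

    // Zero-initialized {0, 0} is returned when no device handle is available
    const auto caps = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    std::printf("sm_%d%d, post-Ampere: %s\n",
                static_cast<int>(caps.major), static_cast<int>(caps.minor),
                caps.isPostAmpere() ? "yes" : "no");

    nvmlShutdown();
    return 0;
}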

View File: lib/backend.cpp

@@ -1,30 +1,31 @@
 #include <fstream>
-#include <nvml.h>
 #include <fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <nvml.h>
 
 #include "backend.h"
+#include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
+
+    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
+    if (numGpus.has_value()) {
+        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
+    } else {
+        SPDLOG_WARN("Failed to detect Nvidia GPU(s) on the system");
+    }
 }
 
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(1);
 
-    // Get the compute capabilities of the current hardware
-    nvmlDevice_t device;
-    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
-    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
-        SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
-            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
-        }
-    }
+    // Retrieve the compute capabilities to enable some options at runtime
+    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
 
     // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
     if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
@@ -43,13 +44,13 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
                 tle::CommunicationMode::kORCHESTRATOR,
                 std::nullopt,
                 std::nullopt,
-                tle::OrchestratorConfig(true, workerPath)
+                tle::OrchestratorConfig(true, workerPath, nullptr, true)
         ));
     }
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
+    execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
     return execConfig;
 }
@@ -94,6 +95,7 @@ bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
     return executor.canEnqueueRequests();
 }
 
+[[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
     return executor.getNumResponsesReady();
 }
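Because isPostAmpere() and isPostHopper() are constexpr, the capability gate used above (setEnableChunkedContext(computeCapabilities.isPostAmpere())) can be sanity-checked at compile time. A small sketch, assuming the corrected HOPPER_SM_MAJOR of 9:

#include "hardware.h"

using huggingface::hardware::cuda::CudaComputeCapabilities;

// sm_75 (Turing) predates Ampere: chunked context stays disabled.
static_assert(!CudaComputeCapabilities{7, 5}.isPostAmpere());
// sm_80 (Ampere) enables chunked context but is not Hopper.
static_assert(CudaComputeCapabilities{8, 0}.isPostAmpere());
static_assert(!CudaComputeCapabilities{8, 0}.isPostHopper());
// sm_90 (Hopper) satisfies both predicates.
static_assert(CudaComputeCapabilities{9, 0}.isPostHopper());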