From ef1876346cf73a04cc43bf0d68da5df2849ee31a Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Tue, 23 Jul 2024 22:12:42 +0000
Subject: [PATCH] refactor the compute capabilities detection along with num
 gpus

---
 backends/trtllm/CMakeLists.txt     |  2 +-
 backends/trtllm/include/hardware.h | 59 ++++++++++++++++++++++++++++++
 backends/trtllm/lib/backend.cpp    | 26 +++++++------
 3 files changed, 74 insertions(+), 13 deletions(-)
 create mode 100644 backends/trtllm/include/hardware.h

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 20e95c4d..fd58f9a4 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -29,7 +29,7 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
 
 # TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $
diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h
new file mode 100644
index 00000000..da0bf4f3
--- /dev/null
+++ b/backends/trtllm/include/hardware.h
@@ -0,0 +1,59 @@
+//
+// Created by mfuntowicz on 7/23/24.
+//
+
+#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
+#define TGI_TRTLLM_BACKEND_HARDWARE_H
+
+#include <cstdint>
+#include <limits>
+#include <nvml.h>
+#include <optional>
+#include <spdlog/spdlog.h>
+
+namespace huggingface::hardware::cuda {
+
+#define AMPERE_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
+
+    /**
+     * Store information about the version of the CUDA Compute Capabilities detected on the device
+     */
+    struct CudaComputeCapabilities {
+        int32_t major;
+        int32_t minor;
+
+        [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+
+        [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+    };
+
+    CudaComputeCapabilities GetCudaComputeCapabilities() {
+        // Get the compute capabilities of the current hardware
+        nvmlDevice_t device;
+        CudaComputeCapabilities capabilities{0, 0};
+        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
+            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
+                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
+            }
+        }
+
+        return capabilities;
+    }
+
+    /**
+     * Return the number of GPUs detected.
+     * If the device count cannot be queried, return std::nullopt
+     * @return
+     */
+    std::optional<size_t> GetNumDevices() {
+        uint32_t numGpus = 0;
+        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
+            return std::optional(numGpus);
+        } else {
+            return std::nullopt;
+        }
+    }
+}
+#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 26728241..09d27216 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -1,30 +1,31 @@
 #include <fstream>
-#include <nvml.h>
 
 #include <fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <nvml.h>
 
 #include "backend.h"
+#include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
+
+    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
+    if (numGpus.has_value()) {
+        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
+    } else {
+        SPDLOG_WARN("Failed to detect Nvidia GPU(s) on the system");
+    }
 }
 
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(1);
 
-    // Get the compute capabilities of the current hardware
-    nvmlDevice_t device;
-    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
-    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
-        SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
-            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
-        }
-    }
+    // Retrieve the compute capabilities to enable some options at runtime
+    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
 
     // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
     if (config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>() == 1) {
@@ -43,13 +44,13 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
                 tle::CommunicationMode::kORCHESTRATOR,
                 std::nullopt,
                 std::nullopt,
-                tle::OrchestratorConfig(true, workerPath)
+                tle::OrchestratorConfig(true, workerPath, nullptr, true)
         ));
     }
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
+    execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
     return execConfig;
 }
 
@@ -94,6 +95,7 @@ bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
     return executor.canEnqueueRequests();
 }
 
+[[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
     return executor.getNumResponsesReady();
 }
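
Note (not part of the patch): the sketch below shows how the new hardware.h
helpers introduced above can be consumed by a standalone probe. It assumes
NVML and spdlog headers are available at build time and that the binary is
linked against libnvidia-ml; the file name and build flags are illustrative
only, not taken from the repository.

// probe.cpp -- hypothetical consumer of backends/trtllm/include/hardware.h
// Build sketch: g++ -std=c++20 probe.cpp -Ibackends/trtllm/include -lnvidia-ml
#include <cstdio>
#include <nvml.h>

#include "hardware.h"

int main() {
    // NVML must be initialized before any query, mirroring InitializeBackend()
    if (nvmlInit_v2() != NVML_SUCCESS) {
        std::fprintf(stderr, "Failed to initialize NVML\n");
        return 1;
    }

    // GetNumDevices() yields std::nullopt when the device count query fails
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    std::printf("GPUs detected: %zu\n", numGpus.value_or(0));

    // Compute capabilities of device 0 gate runtime features; the backend
    // enables chunked context only on Ampere (sm_80) or newer
    const auto caps = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    std::printf("sm_%d%d, post-Ampere: %s\n",
                (int) caps.major, (int) caps.minor,
                caps.isPostAmpere() ? "yes" : "no");

    nvmlShutdown();
    return 0;
}

Reading CudaComputeCapabilities once at configuration time, as the patch does
in GetExecutorConfig, avoids repeating the NVML handshake on every request.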