refactor the compute capabilities detection along with num gpus

Morgan Funtowicz 2024-07-23 22:12:42 +00:00
parent 3c39ab5ac8
commit ef1876346c
3 changed files with 74 additions and 13 deletions

View File: CMakeLists.txt

@@ -29,7 +29,7 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
 set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
 
 # TGI TRTLLM Backend definition
-add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
+add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>

View File: include/hardware.h

@@ -0,0 +1,59 @@
+//
+// Created by mfuntowicz on 7/23/24.
+//
+#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
+#define TGI_TRTLLM_BACKEND_HARDWARE_H
+
+#include <cstdint>
+#include <optional>
+
+#include <fmt/base.h>
+#include <spdlog/spdlog.h>
+#include <nvml.h>
+
+namespace huggingface::hardware::cuda {
+
+#define AMPERE_SM_MAJOR 8
+#define HOPPER_SM_MAJOR 9
+
+    /**
+     * Store information about the version of the CUDA Compute Capabilities detected on the device
+     */
+    struct CudaComputeCapabilities {
+        int32_t major;
+        int32_t minor;
+
+        [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
+
+        [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; }
+    };
+
+    CudaComputeCapabilities GetCudaComputeCapabilities() {
+        // Get the compute capabilities of the current hardware
+        nvmlDevice_t device;
+        CudaComputeCapabilities capabilities{0, 0};
+
+        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
+            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
+                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
+            }
+        }
+
+        return capabilities;
+    }
+
+    /**
+     * Return the number of GPUs detected, or std::nullopt if the NVML query fails
+     * @return
+     */
+    std::optional<size_t> GetNumDevices() {
+        uint32_t numGpus = 0;
+        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
+            return std::optional(numGpus);
+        } else {
+            return std::nullopt;
+        }
+    }
+}
+#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
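For context, the two helpers above are meant to be called after NVML has been initialized. Below is a minimal standalone sketch of how a caller might exercise them; the main wrapper and the explicit nvmlInit_v2/nvmlShutdown pairing are illustrative only (the backend does its initialization in InitializeBackend), and it assumes hardware.h is on the include path and the binary links against NVML and spdlog.

#include <cstdio>
#include <nvml.h>
#include "hardware.h"

int main() {
    // NVML must be initialized before any device query; the backend does
    // this in InitializeBackend(), here we do it explicitly.
    if (nvmlInit_v2() != NVML_SUCCESS) return 1;

    // GetNumDevices() returns std::nullopt when the NVML query fails
    if (const auto numGpus = huggingface::hardware::cuda::GetNumDevices()) {
        std::printf("GPUs detected: %zu\n", *numGpus);
    }

    // Zero-initialized {0, 0} is returned when no device handle is available
    const auto caps = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    std::printf("sm_%d%d, post-Ampere: %s\n",
                static_cast<int>(caps.major), static_cast<int>(caps.minor),
                caps.isPostAmpere() ? "yes" : "no");

    nvmlShutdown();
    return 0;
}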

View File: lib/backend.cpp

@@ -1,30 +1,31 @@
 #include <fstream>
-#include <nvml.h>
 #include <fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <nvml.h>
 
 #include "backend.h"
+#include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
     nvmlInit_v2();
     initTrtLlmPlugins();
+
+    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
+    if (numGpus.has_value()) {
+        SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
+    } else {
+        SPDLOG_WARN("Failed to detect Nvidia GPU(s) on the system");
+    }
 }
 
 [[nodiscard]]
 tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
     tle::ExecutorConfig execConfig(1);
 
-    // Get the compute capabilities of the current hardware
-    nvmlDevice_t device;
-    int32_t cudaComputeMajor = 0, cudaComputeMinor = 0;
-    if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
-        SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
-        if (nvmlDeviceGetCudaComputeCapability(device, &cudaComputeMajor, &cudaComputeMinor) == NVML_SUCCESS) {
-            SPDLOG_DEBUG(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeMajor, cudaComputeMinor);
-        }
-    }
+    // Retrieve the compute capabilities to enable some options at runtime
+    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
 
     // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
     if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
@@ -43,13 +44,13 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
                 tle::CommunicationMode::kORCHESTRATOR,
                 std::nullopt,
                 std::nullopt,
-                tle::OrchestratorConfig(true, workerPath)
+                tle::OrchestratorConfig(true, workerPath, nullptr, true)
         ));
     }
 
     // Define some configuration variables
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-    execConfig.setEnableChunkedContext(cudaComputeMajor >= 8);
+    execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
     return execConfig;
 }
@@ -94,6 +95,7 @@ bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const {
     return executor.canEnqueueRequests();
 }
 
+[[nodiscard("Returned number of requests needs to be consumed")]]
 size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
     return executor.getNumResponsesReady();
 }
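Because isPostAmpere() and isPostHopper() are constexpr, the capability gate used above (setEnableChunkedContext(computeCapabilities.isPostAmpere())) can be sanity-checked at compile time. A small sketch, assuming the corrected HOPPER_SM_MAJOR of 9:

#include "hardware.h"

using huggingface::hardware::cuda::CudaComputeCapabilities;

// sm_75 (Turing) predates Ampere: chunked context stays disabled.
static_assert(!CudaComputeCapabilities{7, 5}.isPostAmpere());
// sm_80 (Ampere) enables chunked context but is not Hopper.
static_assert(CudaComputeCapabilities{8, 0}.isPostAmpere());
static_assert(!CudaComputeCapabilities{8, 0}.isPostHopper());
// sm_90 (Hopper) satisfies both predicates.
static_assert(CudaComputeCapabilities{9, 0}.isPostHopper());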