Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-21 23:12:07 +00:00
* feat(trtllm): rewrite health to not account for current state
* chore(looper): cleanup a bit more
* feat(post_processing): max_new_tokens is const evaluated now
* chore(ffi): formatting
* feat(trtllm): add stop words handling

  # Conflicts:
  #	backends/trtllm/lib/backend.cpp

* chore(trtllm): create specific parallelconfig factory and logging init methods
* chore(trtllm): define a macro for SizeType cast
* chore(trtllm): use GetParallelConfig
* chore(trtllm): minor refactoring
* chore(trtllm): validate there are enough GPUs on the system for the desired model
* chore(trtllm): ensure max throughput scheduling policy is selected
* chore(trtllm): minor fix
* chore(router): minor refactorings
* feat(docker): build with-slurm ompi
* feat(docker): add python3.10 dev to runtime deps
* chore(docker): add mpi to ld_library_path
* chore(docker): install transformers
* feat(trtllm): detect stop_words from generation_config.json
//
// Created by mfuntowicz on 7/23/24.
//

#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
#define TGI_TRTLLM_BACKEND_HARDWARE_H

#include <cstdint>
#include <limits>
#include <optional>

#include <fmt/base.h>
#include <spdlog/spdlog.h>
#include <nvml.h>

namespace huggingface::hardware::cuda {

#define AMPERE_SM_MAJOR 8
#define HOPPER_SM_MAJOR 9

    /**
     * Store the CUDA compute capability (SM version) detected on the device
     */
    struct CudaComputeCapabilities {
        int32_t major;
        int32_t minor;

        [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }

        [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
    };

    /**
     * Query the compute capability of the first CUDA device (index 0) through NVML.
     * NVML must already be initialized (nvmlInit_v2); returns {0, 0} if the device cannot be queried.
     */
    inline CudaComputeCapabilities GetCudaComputeCapabilities() {
        // Get the compute capabilities of the current hardware
        nvmlDevice_t device;
        CudaComputeCapabilities capabilities{0, 0};
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
            SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
            if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
                SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
            }
        }

        return capabilities;
    }

    /**
     * Return the number of GPUs detected through NVML.
     * @return The device count, or std::nullopt if the NVML query fails
     */
    inline std::optional<size_t> GetNumDevices() {
        uint32_t numGpus = 0;
        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
            return std::optional(numGpus);
        } else {
            return std::nullopt;
        }
    }
} // namespace huggingface::hardware::cuda

#endif // TGI_TRTLLM_BACKEND_HARDWARE_H
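
For context, a minimal usage sketch of these helpers, in the spirit of the "validate there are enough GPUs on the system for the desired model" change listed above. It assumes the caller initializes NVML (the header itself never calls nvmlInit_v2); the include path and the required_gpus threshold are illustrative, not taken from the repository.

// usage_sketch.cpp -- illustrative only, not part of the repository
#include <cstdlib>
#include <nvml.h>
#include <spdlog/spdlog.h>
#include "hardware.h"  // assumed include path for the header above

int main() {
    // NVML must be initialized before any query in hardware.h can succeed
    if (nvmlInit_v2() != NVML_SUCCESS) {
        SPDLOG_ERROR("Failed to initialize NVML");
        return EXIT_FAILURE;
    }

    constexpr size_t required_gpus = 2;  // hypothetical tensor-parallel degree

    // Bail out early if fewer GPUs are present than the model requires
    const auto num_devices = huggingface::hardware::cuda::GetNumDevices();
    if (!num_devices.has_value() || *num_devices < required_gpus) {
        SPDLOG_ERROR("Not enough GPUs: need {:d}, found {:d}",
                     required_gpus, num_devices.value_or(0));
        nvmlShutdown();
        return EXIT_FAILURE;
    }

    // Warn when the detected SM version predates Ampere (sm_80)
    const auto capabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
    if (!capabilities.IsPostAmpere()) {
        SPDLOG_WARN("Pre-Ampere GPU detected (sm_{:d}{:d}); some kernels may be unavailable",
                    capabilities.major, capabilities.minor);
    }

    nvmlShutdown();
    return EXIT_SUCCESS;
}

Calling nvmlShutdown() on every exit path keeps the NVML handle balanced with the nvmlInit_v2() call; in the actual backend this validation would live in the engine setup rather than in main.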