mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 23:12:07 +00:00
* misc(cmake) update dependencies * feat(hardware) enable new hardware.hpp and unittests * test(ctest) enable address sanitizer * feat(backend): initial rewrite of the backend for simplicity * feat(backend): remove all the logs from hardware.hpp * feat(backend): added some logging * feat(backend): enable compiler warning if support for RVO not applying * feat(backend): missing return statement * feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder * feat(backend): delete previous backend impl * feat(backend): more impl * feat(backend): use latest trtllm main version to have g++ >= 13 compatibility * feat(backend): allow overriding which Python to use * feat(backend): fix backend_exception_t -> backend_error_t naming * feat(backend): impl missing generation_step_t as return value of pull_tokens * feat(backend): make backend_workspace_t::engines_folder constexpr * feat(backend): fix main.rs retrieving the tokenizer * feat(backend): add guard to multiple header definitions * test(backend): add more unittest * feat(backend): remove constexpr from par * feat(backend): remove constexpig * test(backend): more test coverage * chore(trtllm): update dependency towards 0.15.0 * effectively cancel the request on the executor * feat(backend) fix moving backend when pulling * feat(backend): make sure we can easily cancel request on the executor * feat(backend): fix missing "0" field access * misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut * chore: Add doc and CI for TRTLLM (#2799) * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * doc: Formatting * misc(backend): indent --------- Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>
81 lines
3.2 KiB
C++
81 lines
3.2 KiB
C++
#ifndef TGI_HARDWARE_CUDA
|
|
#define TGI_HARDWARE_CUDA
|
|
#include <cstddef>
#include <cstdint>
#include <optional>
#include <tuple>

#include <nvml.h>
|
|
|
|
namespace huggingface::tgi::hardware::cuda {
|
|
/**
 * Minimum CUDA compute capability, as a (major, minor) 2-tuple, of each named architecture.
 * Listed in ascending order: sm_70, sm_75, sm_80, sm_89, sm_90.
 *
 * Declared `inline` so that every translation unit including this header refers to the
 * same object rather than to a private internal-linkage copy.
 */
inline constexpr auto VOLTA = std::make_tuple(7u, 0u);
inline constexpr auto TURING = std::make_tuple(7u, 5u);
inline constexpr auto AMPERE = std::make_tuple(8u, 0u);
inline constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u);
inline constexpr auto HOPPER = std::make_tuple(9u, 0u);
|
|
|
|
/**
|
|
* Get the number of GPUs on the local machine
|
|
* @return std::nullopt if no device is available, otherwise >= 1
|
|
*/
|
|
inline std::optional<size_t> get_device_count() {
|
|
uint32_t numGpus = 0;
|
|
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
|
|
return numGpus;
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
/**
|
|
* Store information about the version of the CUDA Compute Capabilities detected on the device
|
|
*/
|
|
struct compute_capabilities_t {
|
|
int32_t major;
|
|
int32_t minor;
|
|
|
|
compute_capabilities_t(): compute_capabilities_t(0) {}
|
|
explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) {
|
|
nvmlDevice_t device;
|
|
if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) {
|
|
nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
|
|
}
|
|
};
|
|
compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {}
|
|
|
|
/**
|
|
* Evaluate if the underlying capabilities is at least greater or equals to the provided 2-tuple (major, minor)
|
|
* @param sm Architecture version (major, minor)
|
|
* @return True if greater or equals to the underlying compute capabilities
|
|
*/
|
|
[[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32_t> sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; }
|
|
|
|
/**
|
|
* Check if the capabilities match at least Volta architecture (sm_70)
|
|
* @return true if at least Volta (>= sm_70), false otherwise
|
|
*/
|
|
[[nodiscard]] constexpr bool is_at_least_volta() const { return is_at_least(VOLTA); }
|
|
|
|
/**
|
|
* Check if the capabilities match at least Turing architecture (sm_75)
|
|
* @return true if at least Turing (>= sm_75), false otherwise
|
|
*/
|
|
[[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); }
|
|
|
|
/**
|
|
* Check if the capabilities match at least Ampere architecture (sm_80)
|
|
* @return true if at least Ampere (>= sm_80), false otherwise
|
|
*/
|
|
[[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); }
|
|
|
|
/**
|
|
* Check if the capabilities match at least Ada Lovelace architecture (sm_89)
|
|
* @return true if at least Ada Lovelace (>= sm_89), false otherwise
|
|
*/
|
|
[[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); }
|
|
|
|
/**
|
|
* Check if the capabilities match at least Hopper architecture (sm_90)
|
|
* @return true if at least Hopper (>= sm_90), false otherwise
|
|
*/
|
|
[[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
|
|
};
|
|
}
|
|
#endif |