Use FP8 GPTQ-Marlin kernels to enable FP8 support on CUDA GPUs with compute capability >=8.0 and <8.9.

Co-authored-by: Florian Zimmermeister <flozi00.fz@gmail.com>
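For context, a minimal sketch of the compute-capability gate the commit message describes; the helper name and the check itself are illustrative, not taken from the repository:

#include <cuda_runtime.h>

// Hypothetical helper: true when the device falls in the range the commit
// targets for the FP8 GPTQ-Marlin path (SM >= 8.0 and < 8.9).
static bool fp8_marlin_supported(int device) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  int cc = prop.major * 10 + prop.minor;
  return cc >= 80 && cc < 89;
}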
#pragma once

#include <torch/library.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async copy (cp.async) below compute capability 8.0, which
// the Marlin kernels depend on, so the declarations below are compiled out.
#else

// GPTQ-Marlin GEMM: multiplies a (size_m x size_k) by the GPTQ-quantized
// b_q_weight, dequantizing with b_scales; g_idx and perm carry the act-order
// permutation, and is_k_full signals whether the full k dim is reduced here.
torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                               torch::Tensor &b_scales, torch::Tensor &g_idx,
                               torch::Tensor &perm, torch::Tensor &workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

// 2:4 sparse Marlin GEMM; b_meta holds the structured-sparsity metadata.
torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                                  torch::Tensor &b_meta,
                                  torch::Tensor &b_scales,
                                  torch::Tensor &workspace, int64_t num_bits,
                                  int64_t size_m, int64_t size_n,
                                  int64_t size_k);

// Repacks GPTQ-format quantized weights into the Marlin tile layout.
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

// Original dense Marlin GEMM (FP16 activations x 4-bit weights).
torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                          torch::Tensor &b_scales, torch::Tensor &workspace,
                          int64_t size_m, int64_t size_n, int64_t size_k);

// FP8 weight-only GEMM built on the GPTQ-Marlin kernels (the subject of the
// commit message above).
torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                              torch::Tensor& b_scales, torch::Tensor& workspace,
                              int64_t num_bits, int64_t size_m, int64_t size_n,
                              int64_t size_k);

#endif
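Since the header pulls in <torch/library.h>, these entry points are meant to be exposed as Torch operators. A minimal sketch of such a registration, assuming the usual schema-plus-impl pattern; the library name "marlin" is an assumption, not the repository's actual binding code:

// Hypothetical registration: exposes fp8_marlin_gemm to Python as
// torch.ops.marlin.fp8_marlin_gemm with a CUDA-only implementation.
TORCH_LIBRARY(marlin, m) {
  m.def("fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
        "Tensor workspace, int num_bits, int size_m, int size_n, int size_k) "
        "-> Tensor");
  m.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
}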