mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

cleaning bis

This commit is contained in:
parent 2909047d2e
commit 359dd46474
@@ -1,38 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/hip/HIPContext.h>
-
-#include "../hip_func/q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    hipStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const hipblasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
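For orientation, a minimal call sketch against the two declarations deleted above. The names x_fp16, out_fp16, w_q4 and hipblas_handle are hypothetical stand-ins for what the extension's binding layer would normally supply, and x is assumed to be a row-major [x_height, w_q4->height] fp16 activation buffer on the GPU:

    // Hypothetical usage sketch (placeholder names, not from the repository).
    ExLlamaTuning tuning = {};   // tuning flags, assumed default-initialized
    // Fused 4-bit matmul on the default stream; writes a [x_height, w_q4->width] result into out_fp16.
    q4_matmul_cuda(&tuning, x_fp16, x_height, w_q4, out_fp16);
    // Reconstruction variant: dequantizes w_q4 to fp16 and runs a hipBLAS HGEMM instead.
    q4_matmul_recons_cuda(&tuning, x_fp16, x_height, w_q4, out_fp16, hipblas_handle);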
@@ -1,54 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <cstdint>
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
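A rough sketch of how a Q4Matrix like the one declared above is typically consumed (the dense buffer name is hypothetical): reconstruct() unpacks cuda_qweight / cuda_qzeros / cuda_scales into a dense fp16 matrix that an ordinary GEMM can use, the same reconstruct-then-GEMM pattern visible in the gemm_half_q_half_cuda hunk below:

    // Hypothetical sketch: dequantize the packed 4-bit weights into dense fp16.
    half* dense = nullptr;
    hipMalloc((void**)&dense, (size_t)w_q4->height * w_q4->width * sizeof(half));
    w_q4->reconstruct(dense);   // fills dense with the [height, width] dequantized weights
    // dense can now serve as a regular operand of an fp16 GEMM (e.g. hipblasHgemm).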
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
     {
         // Reconstruct FP16 matrix, then cuBLAS
 
-        // if (!temp_dq) {
-        // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-        // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-        // cudaError_t error = cudaGetLastError();
-        // if (error != cudaSuccess)
-        //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-        // for (int i = 0; i < size_n * size_k; i++) {
-        //     temp_dq_cpu[i] = 0.0f;
-        // }
-
-        // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-        // error = cudaGetLastError();
-        // if (error != cudaSuccess)
-        //     printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
-        // }
-
-        if (!temp_dq) {
-            temp_dq = b->temp_dq;
-            b->reconstruct(temp_dq);
-
-            // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-            // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-            // cudaError_t error = cudaGetLastError();
-            // if (error != cudaSuccess)
-            //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-            // for (int i = 0; i < size_n * size_k; i++) {
-            //     temp_dq_cpu[i] = __float2half(0.0f);
-            // }
-
-            // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-
-            // b->reconstruct(temp_dq);
-        }
-
-        //temp_dq = b->temp_dq;
-        //b->reconstruct(temp_dq);
+        if (!temp_dq) temp_dq = b->temp_dq;
+        b->reconstruct(temp_dq);
 
         //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
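The surviving path in this hunk is simply "reconstruct the fp16 weights, then call cublasHgemm". For reference (standard cuBLAS GEMM semantics, not anything specific to this repository), the call computes C = alpha * op(A) * op(B) + beta * C, so the beta chosen above either clears the output (beta = 0 when clear is true) or accumulates into whatever out already holds (beta = 1), while alpha stays fixed at 1.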
@@ -13,10 +13,6 @@
 
 #include "cpp/util.h"
 
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include <stdio.h>
 // Some decluttering macros
 
 #define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):
 
     def warmup(self, batch: FlashCausalLMBatch):
         torch.cuda.empty_cache()
-
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-prefill-tokens`"
             ) from e
 
-
         torch.cuda.synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
@@ -71,7 +71,6 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
-        logger.info("in this warmup model.py")
        self.generate_token(batch)
         return None
 