cleaning bis

Felix Marty 2024-01-25 18:06:58 +00:00 committed by OlivierDehaene
parent 2909047d2e
commit 359dd46474
6 changed files with 3 additions and 140 deletions


@@ -1,38 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/hip/HIPContext.h>

#include "../hip_func/q4_matrix.cuh"
#include "../tuning.h"

void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero = false,
    hipStream_t alt_stream = NULL
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    const hipblasHandle_t handle,
    bool no_zero = false
);

#endif
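For context, this deleted header exposed the two matmul paths of the exllama kernels: a fused 4-bit kernel (q4_matmul_cuda) and a reconstruct-then-GEMM route (q4_matmul_recons_cuda). A minimal dispatch sketch follows; the tuning field matmul_recons_thd comes from upstream exllama's tuning.h, and the ATen handle getter is an assumption about the hipified build, not a line from this diff:

// Sketch only: how a caller might choose between the two declared paths.
// `matmul_recons_thd` (assumed, from upstream exllama) sets the batch-size cutoff.
#include <ATen/hip/HIPContext.h>
#include "q4_matmul.cuh"

void q4_matmul(ExLlamaTuning* tuning, const half* x, const int x_height, Q4Matrix* w, half* out)
{
    if (tuning->matmul_recons_thd == 0 || x_height < tuning->matmul_recons_thd)
    {
        // Small batches: the fused kernel reads the packed 4-bit weights directly.
        q4_matmul_cuda(tuning, x, x_height, w, out);
    }
    else
    {
        // Larger batches: dequantize to FP16 once, then let hipBLAS do the GEMM.
        const hipblasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
        q4_matmul_recons_cuda(tuning, x, x_height, w, out, handle);
    }
}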


@@ -1,54 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>

class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,
        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,
        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);
};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif
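A hedged sketch of how this class was typically instantiated from GPTQ-style tensors and registered for cleanup; the helper name make_q4 and the shape conventions follow upstream exllama's binding code, but treat the details as assumptions rather than lines from this commit:

// Illustrative only: wrapping GPTQ buffers (already resident on the GPU) in a Q4Matrix.
// Eight 4-bit values are packed per uint32, so the FP16 height is qweight rows * 8.
#include <torch/extension.h>
#include "q4_matrix.cuh"

Q4Matrix* make_q4(torch::Tensor qweight, torch::Tensor qzeros,
                  torch::Tensor scales, torch::Tensor g_idx, int device)
{
    Q4Matrix* m = new Q4Matrix(
        qweight.size(0) * 8,                 // _height: unpacked FP16 rows
        qweight.size(1),                     // _width
        qzeros.size(0),                      // _groups
        (uint32_t*) qweight.data_ptr(),
        (uint32_t*) qzeros.data_ptr(),
        (half*) scales.data_ptr(),
        g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(),  // act-order map, optional
        device
    );
    g_q4_keep_matrix(m);  // track globally so g_q4_free_matrices() can release it later
    return m;
}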


@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
{
    // Reconstruct FP16 matrix, then cuBLAS

    // if (!temp_dq) {
    //     half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
    //     cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
    //     cudaError_t error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
    //     for (int i = 0; i < size_n * size_k; i++) {
    //         temp_dq_cpu[i] = 0.0f;
    //     }
    //     cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
    //     error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
    // }
    if (!temp_dq) {
        temp_dq = b->temp_dq;
        b->reconstruct(temp_dq);
        // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
        // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
        // cudaError_t error = cudaGetLastError();
        // if (error != cudaSuccess)
        //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
        // for (int i = 0; i < size_n * size_k; i++) {
        //     temp_dq_cpu[i] = __float2half(0.0f);
        // }
        // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
        // b->reconstruct(temp_dq);
    }

    //temp_dq = b->temp_dq;
    //b->reconstruct(temp_dq);

    if (!temp_dq) temp_dq = b->temp_dq;
    b->reconstruct(temp_dq);

    //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
    const half alpha = __float2half(1.0f);
    const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
    cublasHgemm(cublas_handle,
                CUBLAS_OP_N,
                CUBLAS_OP_N,
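The hunk is truncated mid-call. For reference, this reconstruct-then-GEMM step usually finishes as a column-major Hgemm that computes the row-major product by swapping operands; the sketch below shows the idiom, with operand names (size_m/size_n/size_k, a, c) assumed, not necessarily the exact lines beyond the cutoff:

// Sketch: row-major C[m,n] = A[m,k] * B[k,n] on a column-major BLAS is computed
// as C^T = B^T * A^T by passing dimensions (n, m, k) with the weight matrix first.
cublasHgemm(cublas_handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            size_n, size_m, size_k,
            &alpha,
            temp_dq, size_n,   // reconstructed FP16 weights, leading dimension n
            a,       size_k,   // input activations, leading dimension k
            &beta,
            c,       size_n);  // output; beta = 1 accumulates instead of overwriting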


@@ -13,10 +13,6 @@
#include "cpp/util.h"
#include <iostream>
#include <fstream>
using namespace std;
#include <stdio.h>
// Some decluttering macros
#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
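As a usage sketch, the macro expands to a TORCH_CHECK asserting the tensor's dtype, with the tensor and dtype names stringized into the error message (the binding function and tensor names below are illustrative):

// TORCH_CHECK_DTYPE(x, kHalf) fails with "x is incorrect datatype, must be kHalf".
void q4_matmul_binding(torch::Tensor x, torch::Tensor out)
{
    TORCH_CHECK_DTYPE(x, kHalf);
    TORCH_CHECK_DTYPE(out, kHalf);
}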


@@ -680,7 +680,6 @@ class FlashCausalLM(Model):
    def warmup(self, batch: FlashCausalLMBatch):
        torch.cuda.empty_cache()
        try:
            cache_manager = set_cache_manager(
                batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e

        torch.cuda.synchronize(self.device)

        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)


@@ -71,7 +71,6 @@ class Model(ABC):
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        logger.info("in this warmup model.py")
        self.generate_token(batch)
        return None