mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

commit 359dd46474: cleaning bis
parent 2909047d2e
@@ -1,38 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/hip/HIPContext.h>

#include "../hip_func/q4_matrix.cuh"
#include "../tuning.h"

void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero = false,
    hipStream_t alt_stream = NULL
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    const hipblasHandle_t handle,
    bool no_zero = false
);

#endif
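Note: the deleted header above declared two entry points with the same contract. q4_matmul_cuda runs the fused 4-bit kernel directly, while q4_matmul_recons_cuda dequantizes the weight to FP16 first and hands the product to hipBLAS. A minimal sketch of how a caller could dispatch between them follows; the tuning field name matmul_recons_thd and the threshold logic are assumptions based on upstream exllama, not taken from this diff.

// Sketch only: dispatch between the fused kernel and the reconstruct+GEMM
// path, mirroring how exllama-style callers used these declarations.
// The field name `matmul_recons_thd` is an assumption from upstream exllama.
#include "q4_matmul.cuh"

void q4_matmul_dispatch(ExLlamaTuning* tuning, const half* x, int x_height,
                        Q4Matrix* w, half* out, hipblasHandle_t handle)
{
    if (tuning->matmul_recons_thd == 0 || x_height < tuning->matmul_recons_thd)
    {
        // Small batches: fused 4-bit kernel, no FP16 reconstruction needed.
        q4_matmul_cuda(tuning, x, x_height, w, out);
    }
    else
    {
        // Larger batches: dequantize to FP16 and run a hipBLAS GEMM.
        q4_matmul_recons_cuda(tuning, x, x_height, w, out, handle);
    }
}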
@@ -1,54 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>

class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,

        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,

        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);

};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif
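Note: Q4Matrix owned the device-side GPTQ buffers (cuda_qweight, cuda_qzeros, cuda_scales) plus an optional cuda_x_map built by make_sequential when an act-order g_idx was supplied. A hedged sketch of constructing and registering one; every pointer and dimension value here is illustrative, and in TGI they came from torch tensors passed through the extension bindings.

// Sketch only: wrapping raw GPTQ buffers in a Q4Matrix. All values are
// illustrative assumptions, not taken from this diff.
#include "q4_matrix.cuh"

Q4Matrix* wrap_gptq_weight(uint32_t* qweight, uint32_t* qzeros, half* scales,
                           uint32_t* g_idx, int height, int width, int groups,
                           int device)
{
    // g_idx may be NULL; when present, the constructor's make_sequential()
    // reorders the quantized rows so groups are read contiguously
    // (cuda_x_map records the corresponding input permutation).
    Q4Matrix* m = new Q4Matrix(height, width, groups,
                               qweight, qzeros, scales, g_idx, device);
    g_q4_keep_matrix(m); // register for cleanup via g_q4_free_matrices()
    return m;
}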
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
{
    // Reconstruct FP16 matrix, then cuBLAS

    // if (!temp_dq) {
    //     half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));

    //     cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
    //     cudaError_t error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));

    //     for (int i = 0; i < size_n * size_k; i++) {
    //         temp_dq_cpu[i] = 0.0f;
    //     }

    //     cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
    //     error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
    // }

    if (!temp_dq) {
        temp_dq = b->temp_dq;
        b->reconstruct(temp_dq);

        // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));

        // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
        // cudaError_t error = cudaGetLastError();
        // if (error != cudaSuccess)
        //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));

        // for (int i = 0; i < size_n * size_k; i++) {
        //     temp_dq_cpu[i] = __float2half(0.0f);
        // }

        // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
        // b->reconstruct(temp_dq);
    }
    if (!temp_dq) temp_dq = b->temp_dq;
    b->reconstruct(temp_dq);

    //temp_dq = b->temp_dq;
    //b->reconstruct(temp_dq);

    //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);

    const half alpha = __float2half(1.0f);
    const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);

    cublasHgemm(cublas_handle,
                CUBLAS_OP_N,
                CUBLAS_OP_N,
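Note on the lines that survive this hunk: under cuBLAS GEMM semantics, C = alpha * op(A) * op(B) + beta * C, so the choice beta = clear ? 0 : 1 means the reconstructed-weight product either overwrites out (clear) or accumulates into its existing contents. A sketch of the full call under those semantics; the dimension names and the swapped operand order (the usual trick for row-major data through column-major cuBLAS) are assumptions, not shown in this diff.

// Sketch only: reconstruct-then-GEMM tail, with the alpha/beta convention
// from the hunk above. Dimension names m, n, k are illustrative.
#include <cublas_v2.h>
#include <cuda_fp16.h>

void recons_gemm(cublasHandle_t handle, const half* a, const half* b_dq,
                 half* out, int m, int n, int k, bool clear)
{
    const half alpha = __float2half(1.0f);
    const half beta  = clear ? __float2half(0.0f) : __float2half(1.0f);
    // Row-major out[m,n] = a[m,k] * b_dq[k,n], expressed as a column-major
    // product by passing the operands in swapped order.
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,
                &alpha, b_dq, n, a, k,
                &beta, out, n);
}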
@@ -13,10 +13,6 @@

#include "cpp/util.h"

#include <iostream>
#include <fstream>
using namespace std;
#include <stdio.h>
// Some decluttering macros

#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
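The TORCH_CHECK_DTYPE macro kept by this hunk stringizes both of its arguments, so a failing check names the offending tensor and the expected dtype in the error message. A small usage sketch; the tensor names are illustrative.

// Sketch only: TORCH_CHECK_DTYPE(q_weight, kInt) expands to
// TORCH_CHECK(q_weight.dtype() == torch::kInt,
//             "q_weight is incorrect datatype, must be kInt")
TORCH_CHECK_DTYPE(q_weight, kInt);
TORCH_CHECK_DTYPE(scales, kHalf);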
@@ -118,7 +114,7 @@ void gemm_half_q_half

    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")

    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));

    gemm_half_q_half_cuda
    (
        at::cuda::getCurrentCUDABlasHandle(),
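The shape of this hunk is the standard PyTorch CUDA-extension idiom: pin the current device to the input tensor's device with a guard, fetch the current cuBLAS handle, then pass raw pointers to the kernel launcher. A hedged sketch of that idiom; my_op and my_cuda_launcher are placeholders, not names from this repository.

// Sketch only: the usual extension entry-point shape around a launcher
// like gemm_half_q_half_cuda. `my_cuda_launcher` is a placeholder.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>

void my_op(torch::Tensor a, torch::Tensor c)
{
    // Ensure subsequent CUDA calls target the device holding `a`.
    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
    cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
    // my_cuda_launcher(handle, (const half*)a.data_ptr(), (half*)c.data_ptr(), ...);
}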
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):

    def warmup(self, batch: FlashCausalLMBatch):
        torch.cuda.empty_cache()

        try:
            cache_manager = set_cache_manager(
                batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e

        torch.cuda.synchronize(self.device)

        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
@@ -71,7 +71,6 @@ class Model(ABC):
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        logger.info("in this warmup model.py")
        self.generate_token(batch)
        return None