mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

cleaning bis

This commit is contained in:
parent 2909047d2e
commit 359dd46474
@@ -1,38 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/hip/HIPContext.h>
-
-#include "../hip_func/q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    hipStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const hipblasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
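For orientation, a minimal call sketch against the two declarations deleted above. The names x_fp16, out_fp16, w_q4 and hipblas_handle are hypothetical stand-ins for what the extension's binding layer would normally supply, and x is assumed to be a row-major [x_height, w_q4->height] fp16 activation buffer on the GPU:

    // Hypothetical usage sketch (placeholder names, not from the repository).
    ExLlamaTuning tuning = {};   // tuning flags, assumed default-initialized
    // Fused 4-bit matmul on the default stream; writes a [x_height, w_q4->width] result into out_fp16.
    q4_matmul_cuda(&tuning, x_fp16, x_height, w_q4, out_fp16);
    // Reconstruction variant: dequantizes w_q4 to fp16 and runs a hipBLAS HGEMM instead.
    q4_matmul_recons_cuda(&tuning, x_fp16, x_height, w_q4, out_fp16, hipblas_handle);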
@@ -1,54 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <cstdint>
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
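A rough sketch of how a Q4Matrix like the one declared above is typically consumed (the dense buffer name is hypothetical): reconstruct() unpacks cuda_qweight / cuda_qzeros / cuda_scales into a dense fp16 matrix that an ordinary GEMM can use, the same reconstruct-then-GEMM pattern visible in the gemm_half_q_half_cuda hunk below:

    // Hypothetical sketch: dequantize the packed 4-bit weights into dense fp16.
    half* dense = nullptr;
    hipMalloc((void**)&dense, (size_t)w_q4->height * w_q4->width * sizeof(half));
    w_q4->reconstruct(dense);   // fills dense with the [height, width] dequantized weights
    // dense can now serve as a regular operand of an fp16 GEMM (e.g. hipblasHgemm).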
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
     {
         // Reconstruct FP16 matrix, then cuBLAS
 
-        // if (!temp_dq) {
-        // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-        // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-        // cudaError_t error = cudaGetLastError();
-        // if (error != cudaSuccess)
-        //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-        // for (int i = 0; i < size_n * size_k; i++) {
-        //     temp_dq_cpu[i] = 0.0f;
-        // }
-
-        // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-        // error = cudaGetLastError();
-        // if (error != cudaSuccess)
-        //     printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
-        // }
-
-        if (!temp_dq) {
-            temp_dq = b->temp_dq;
-            b->reconstruct(temp_dq);
-
-            // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-            // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-            // cudaError_t error = cudaGetLastError();
-            // if (error != cudaSuccess)
-            //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-            // for (int i = 0; i < size_n * size_k; i++) {
-            //     temp_dq_cpu[i] = __float2half(0.0f);
-            // }
-
-            // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-
-            // b->reconstruct(temp_dq);
-        }
-
-        //temp_dq = b->temp_dq;
-        //b->reconstruct(temp_dq);
+        if (!temp_dq) temp_dq = b->temp_dq;
+        b->reconstruct(temp_dq);
 
         //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
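The surviving path in this hunk is simply "reconstruct the fp16 weights, then call cublasHgemm". For reference (standard cuBLAS GEMM semantics, not anything specific to this repository), the call computes C = alpha * op(A) * op(B) + beta * C, so the beta chosen above either clears the output (beta = 0 when clear is true) or accumulates into whatever out already holds (beta = 1), while alpha stays fixed at 1.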
@@ -13,10 +13,6 @@
 
 #include "cpp/util.h"
 
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include <stdio.h>
 // Some decluttering macros
 
 #define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):
 
     def warmup(self, batch: FlashCausalLMBatch):
         torch.cuda.empty_cache()
-
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-prefill-tokens`"
             ) from e
 
-
         torch.cuda.synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
@@ -71,7 +71,6 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
-        logger.info("in this warmup model.py")
        self.generate_token(batch)
         return None
 