mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

commit 359dd46474: cleaning bis
parent 2909047d2e
@@ -1,38 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/hip/HIPContext.h>

#include "../hip_func/q4_matrix.cuh"
#include "../tuning.h"

void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero = false,
    hipStream_t alt_stream = NULL
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    const hipblasHandle_t handle,
    bool no_zero = false
);

#endif
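Note: the deleted header above declared two entry points with the same contract. q4_matmul_cuda runs the fused 4-bit kernel directly, while q4_matmul_recons_cuda dequantizes the weight to FP16 first and hands the product to hipBLAS. A minimal sketch of how a caller could dispatch between them follows; the tuning field name matmul_recons_thd and the threshold logic are assumptions based on upstream exllama, not taken from this diff.

// Sketch only: dispatch between the fused kernel and the reconstruct+GEMM
// path, mirroring how exllama-style callers used these declarations.
// The field name `matmul_recons_thd` is an assumption from upstream exllama.
#include "q4_matmul.cuh"

void q4_matmul_dispatch(ExLlamaTuning* tuning, const half* x, int x_height,
                        Q4Matrix* w, half* out, hipblasHandle_t handle)
{
    if (tuning->matmul_recons_thd == 0 || x_height < tuning->matmul_recons_thd)
    {
        // Small batches: fused 4-bit kernel, no FP16 reconstruction needed.
        q4_matmul_cuda(tuning, x, x_height, w, out);
    }
    else
    {
        // Larger batches: dequantize to FP16 and run a hipBLAS GEMM.
        q4_matmul_recons_cuda(tuning, x, x_height, w, out, handle);
    }
}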
@@ -1,54 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>

class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,

        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,

        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);

};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif
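Note: Q4Matrix owned the device-side GPTQ buffers (cuda_qweight, cuda_qzeros, cuda_scales) plus an optional cuda_x_map built by make_sequential when an act-order g_idx was supplied. A hedged sketch of constructing and registering one; every pointer and dimension value here is illustrative, and in TGI they came from torch tensors passed through the extension bindings.

// Sketch only: wrapping raw GPTQ buffers in a Q4Matrix. All values are
// illustrative assumptions, not taken from this diff.
#include "q4_matrix.cuh"

Q4Matrix* wrap_gptq_weight(uint32_t* qweight, uint32_t* qzeros, half* scales,
                           uint32_t* g_idx, int height, int width, int groups,
                           int device)
{
    // g_idx may be NULL; when present, the constructor's make_sequential()
    // reorders the quantized rows so groups are read contiguously
    // (cuda_x_map records the corresponding input permutation).
    Q4Matrix* m = new Q4Matrix(height, width, groups,
                               qweight, qzeros, scales, g_idx, device);
    g_q4_keep_matrix(m); // register for cleanup via g_q4_free_matrices()
    return m;
}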
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
{
    // Reconstruct FP16 matrix, then cuBLAS

    // if (!temp_dq) {
    //     half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));

    //     cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
    //     cudaError_t error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));

    //     for (int i = 0; i < size_n * size_k; i++) {
    //         temp_dq_cpu[i] = 0.0f;
    //     }

    //     cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
    //     error = cudaGetLastError();
    //     if (error != cudaSuccess)
    //         printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
    // }

    if (!temp_dq) {
        temp_dq = b->temp_dq;
        b->reconstruct(temp_dq);

        // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));

        // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
        // cudaError_t error = cudaGetLastError();
        // if (error != cudaSuccess)
        //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));

        // for (int i = 0; i < size_n * size_k; i++) {
        //     temp_dq_cpu[i] = __float2half(0.0f);
        // }

        // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
        // b->reconstruct(temp_dq);
    }
    if (!temp_dq) temp_dq = b->temp_dq;
    b->reconstruct(temp_dq);

    //temp_dq = b->temp_dq;
    //b->reconstruct(temp_dq);

    //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);

    const half alpha = __float2half(1.0f);
    const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);

    cublasHgemm(cublas_handle,
                CUBLAS_OP_N,
                CUBLAS_OP_N,
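Note on the lines that survive this hunk: under cuBLAS GEMM semantics, C = alpha * op(A) * op(B) + beta * C, so the choice beta = clear ? 0 : 1 means the reconstructed-weight product either overwrites out (clear) or accumulates into its existing contents. A sketch of the full call under those semantics; the dimension names and the swapped operand order (the usual trick for row-major data through column-major cuBLAS) are assumptions, not shown in this diff.

// Sketch only: reconstruct-then-GEMM tail, with the alpha/beta convention
// from the hunk above. Dimension names m, n, k are illustrative.
#include <cublas_v2.h>
#include <cuda_fp16.h>

void recons_gemm(cublasHandle_t handle, const half* a, const half* b_dq,
                 half* out, int m, int n, int k, bool clear)
{
    const half alpha = __float2half(1.0f);
    const half beta  = clear ? __float2half(0.0f) : __float2half(1.0f);
    // Row-major out[m,n] = a[m,k] * b_dq[k,n], expressed as a column-major
    // product by passing the operands in swapped order.
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,
                &alpha, b_dq, n, a, k,
                &beta, out, n);
}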
@@ -13,10 +13,6 @@

#include "cpp/util.h"

#include <iostream>
#include <fstream>
using namespace std;
#include <stdio.h>
// Some decluttering macros

#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
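The TORCH_CHECK_DTYPE macro kept by this hunk stringizes both of its arguments, so a failing check names the offending tensor and the expected dtype in the error message. A small usage sketch; the tensor names are illustrative.

// Sketch only: TORCH_CHECK_DTYPE(q_weight, kInt) expands to
// TORCH_CHECK(q_weight.dtype() == torch::kInt,
//             "q_weight is incorrect datatype, must be kInt")
TORCH_CHECK_DTYPE(q_weight, kInt);
TORCH_CHECK_DTYPE(scales, kHalf);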
@@ -118,7 +114,7 @@ void gemm_half_q_half

    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")

    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));

    gemm_half_q_half_cuda
    (
        at::cuda::getCurrentCUDABlasHandle(),
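The shape of this hunk is the standard PyTorch CUDA-extension idiom: pin the current device to the input tensor's device with a guard, fetch the current cuBLAS handle, then pass raw pointers to the kernel launcher. A hedged sketch of that idiom; my_op and my_cuda_launcher are placeholders, not names from this repository.

// Sketch only: the usual extension entry-point shape around a launcher
// like gemm_half_q_half_cuda. `my_cuda_launcher` is a placeholder.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>

void my_op(torch::Tensor a, torch::Tensor c)
{
    // Ensure subsequent CUDA calls target the device holding `a`.
    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
    cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
    // my_cuda_launcher(handle, (const half*)a.data_ptr(), (half*)c.data_ptr(), ...);
}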
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):

    def warmup(self, batch: FlashCausalLMBatch):
        torch.cuda.empty_cache()

        try:
            cache_manager = set_cache_manager(
                batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e

        torch.cuda.synchronize(self.device)

        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
@@ -71,7 +71,6 @@ class Model(ABC):
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        logger.info("in this warmup model.py")
        self.generate_token(batch)
        return None