diff --git a/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh
deleted file mode 100644
index 29136dfc..00000000
--- a/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include
-#include
-#include
-#include
-#include
-
-#include "../hip_func/q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    hipStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const hipblasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
diff --git a/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh
deleted file mode 100644
index 7fdd66be..00000000
--- a/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh
+++ /dev/null
@@ -1,54 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include
-#include
-#include
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
index 6d9aaeda..73de5681 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
     {
         // Reconstruct FP16 matrix, then cuBLAS
 
-        // if (!temp_dq) {
-        //     half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-        //     cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-        //     cudaError_t error = cudaGetLastError();
-        //     if (error != cudaSuccess)
-        //         printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-        //     for (int i = 0; i < size_n * size_k; i++) {
-        //         temp_dq_cpu[i] = 0.0f;
-        //     }
-
-        //     cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-        //     error = cudaGetLastError();
-        //     if (error != cudaSuccess)
-        //         printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
-        // }
-
-        if (!temp_dq) {
-            temp_dq = b->temp_dq;
-            b->reconstruct(temp_dq);
-
-            // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-            // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-            // cudaError_t error = cudaGetLastError();
-            // if (error != cudaSuccess)
-            //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-            // for (int i = 0; i < size_n * size_k; i++) {
-            //     temp_dq_cpu[i] = __float2half(0.0f);
-            // }
-
-            // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-            // b->reconstruct(temp_dq);
-        }
+        if (!temp_dq) temp_dq = b->temp_dq;
+        b->reconstruct(temp_dq);
-        //temp_dq = b->temp_dq;
-        //b->reconstruct(temp_dq);
-
 
         //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
-
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
index da7b06cd..ff4e1851 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -13,10 +13,6 @@
 
 #include "cpp/util.h"
 
-#include
-#include
-using namespace std;
-#include
 // Some decluttering macros
 
 #define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
@@ -118,7 +114,7 @@ void gemm_half_q_half
     TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
-    
+
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index a73dd68e..930082cd 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):
 
     def warmup(self, batch: FlashCausalLMBatch):
         torch.cuda.empty_cache()
-
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e
 
-
         torch.cuda.synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index 79b7a373..cec9eafa 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -71,7 +71,6 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
-        logger.info("in this warmup model.py")
         self.generate_token(batch)
         return None