diff --git a/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh
deleted file mode 100644
index 29136dfc..00000000
--- a/server/exllama_kernels/exllama_kernels/hip_func/q4_matmul.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include
-#include
-#include
-#include
-#include
-
-#include "../hip_func/q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    hipStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const hipblasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
diff --git a/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh
deleted file mode 100644
index 7fdd66be..00000000
--- a/server/exllama_kernels/exllama_kernels/hip_func/q4_matrix.cuh
+++ /dev/null
@@ -1,54 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include
-#include
-#include
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
index 6d9aaeda..73de5681 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -133,51 +133,13 @@ void gemm_half_q_half_cuda
     {
         // Reconstruct FP16 matrix, then cuBLAS
 
-        // if (!temp_dq) {
-        //     half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-        //     cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-        //     cudaError_t error = cudaGetLastError();
-        //     if (error != cudaSuccess)
-        //         printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-        //     for (int i = 0; i < size_n * size_k; i++) {
-        //         temp_dq_cpu[i] = 0.0f;
-        //     }
-
-        //     cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-        //     error = cudaGetLastError();
-        //     if (error != cudaSuccess)
-        //         printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
-        // }
-
-        if (!temp_dq) {
-            temp_dq = b->temp_dq;
-            b->reconstruct(temp_dq);
-
-            // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
-
-            // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
-            // cudaError_t error = cudaGetLastError();
-            // if (error != cudaSuccess)
-            //     printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
-
-            // for (int i = 0; i < size_n * size_k; i++) {
-            //     temp_dq_cpu[i] = __float2half(0.0f);
-            // }
-
-            // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
-            // b->reconstruct(temp_dq);
-        }
+        if (!temp_dq) temp_dq = b->temp_dq;
+        b->reconstruct(temp_dq);
-        //temp_dq = b->temp_dq;
-        //b->reconstruct(temp_dq);
-
 
         //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
-
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
index da7b06cd..ff4e1851 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -13,10 +13,6 @@
 
 #include "cpp/util.h"
 
-#include
-#include
-using namespace std;
-#include
 // Some decluttering macros
 
 #define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
@@ -118,7 +114,7 @@ void gemm_half_q_half
     TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
-    
+
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index a73dd68e..930082cd 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -680,7 +680,6 @@ class FlashCausalLM(Model):
 
     def warmup(self, batch: FlashCausalLMBatch):
         torch.cuda.empty_cache()
-
         try:
             cache_manager = set_cache_manager(
                 batch.blocks,
@@ -698,7 +697,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e
 
-
         torch.cuda.synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index 79b7a373..cec9eafa 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -71,7 +71,6 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
-        logger.info("in this warmup model.py")
         self.generate_token(batch)
         return None