diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
deleted file mode 100644
index 19b1e4a6..00000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _compat_gemm_cuh
-#define _compat_gemm_cuh
-
-#if defined(USE_ROCM)
-
-// For some reason this include is not present anywhere in exllama_v2 codebase, but it is required
-// for symbols as hipblasHalf.
-#include <hipblas/hipblas.h>
-
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int                m,
-                                                               int                n,
-                                                               int                k,
-                                                               const half*        alpha,
-                                                               const half*        AP,
-                                                               int                lda,
-                                                               const half*        BP,
-                                                               int                ldb,
-                                                               const half*        beta,
-                                                               half*              CP,
-                                                               int                ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
-#define rocblas_operation_none HIPBLAS_OP_N
-#define rocblas_hgemm __compat_hipblasHgemm
-#endif
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
index d73ce292..746a2a90 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -22,12 +22,11 @@
 #include "q_gemm_kernel.cuh"
 #include "q_gemm_kernel_gptq.cuh"
-#include "compat_gemm.cuh"
 
 #include <fstream>
 #include <iostream>
 
 using namespace std;
-
+#include <cstdio>
 void gemm_half_q_half_cuda_part
 (
     const half* a,
@@ -43,7 +42,7 @@ void gemm_half_q_half_cuda_part
     bool mul_r_weights
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     if (!b->is_gptq)
     {
         myfile << "go in is_gptq path" << "\n";
@@ -141,11 +140,14 @@ void gemm_half_q_half_cuda
     bool mul_r_weights
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
 
     if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
     {
-        myfile << "going in cublas path" << "\n";
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "going in cublas path" << "\n";
+
         // Reconstruct FP16 matrix, then cuBLAS
         if (!temp_dq) temp_dq = b->temp_dq;
@@ -155,6 +157,34 @@ void gemm_half_q_half_cuda
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
+
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "hey it's me\n" << std::flush;
+
+        //half* val = temp_dq + (size_n * size_k - 1) * sizeof(half);
+        //half* val = temp_dq + 1;
+
+        half my_val_host;
+
+        cudaError_t error = cudaGetLastError();
+        if (error != cudaSuccess)
+            printf("Error before: %s\n", cudaGetErrorString(error));
+
+        cudaError_t err = cudaMemcpy(&my_val_host, temp_dq, sizeof(half), cudaMemcpyDeviceToHost);
+        if (err != cudaSuccess)
+            printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(err));
+
+        // float my_val_float = __half2float(my_val_host);
+        //std::cout << "temp_dq: " << my_val_float << "\n" << std::flush;
+
+        std::cout << "call cublasHgemm" << "\n";
+        std::cout << "call cublasHgemm size_n " << size_n << "\n";
+        std::cout << "call cublasHgemm size_m " << size_m << "\n";
+        std::cout << "call cublasHgemm size_k " << size_k << "\n";
+        std::cout << "call cublasHgemm b width " << b->width << "\n";
+        std::cout << "call cublasHgemm b height " << b->height << "\n" << std::flush;
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
@@ -187,8 +217,8 @@
     {
         // Quantized matmul
 
-        myfile << "going in gemm_half_q_half_cuda_part path" << "\n";
+        std::cout << "going in gemm_half_q_half_cuda_part path" << "\n";
 
         int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
         int max_chunks = size_m / block_m_size_max;
@@ -197,17 +227,17 @@
 
         if (max_chunks)
         {
-            myfile << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
+
             gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
         }
 
         if (last_chunk_size)
        {
-            myfile << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
            gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
        }
    }
-    myfile.close();
 }
 
 __global__ void clear_kernel
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
index ae08cc1f..b3e4bf68 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
@@ -14,6 +14,11 @@
 #define THREADS_X 32
 #define THREADS_Y 32
 
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+
 // Shuffle quantized data on load
 
 __global__ void shuffle_kernel
@@ -87,6 +92,7 @@ QMatrix::QMatrix
 
     is_gptq = (_gptq_qzeros != NULL);
 
+    std::cout << "is_gptq in QMatrix init " << is_gptq << "\n";
     if (is_gptq)
     {
         gptq_groupsize = 1;
@@ -501,6 +507,8 @@ void QMatrix::reconstruct(half* out)
     }
     else
     {
+        std::cout << "reconstructing with reconstruct_gptq_kernel" << "\n";
+
         gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4);
         reconstruct_gptq_kernel<<<gridDim, blockDim>>>
         (
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
index 3a4a4e14..17ed53ce 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -60,6 +60,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -77,6 +82,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -109,9 +119,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+
+    //throw std::invalid_argument("a or b negative");
+
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
     myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -123,8 +137,8 @@ void gemm_half_q_half
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
-    myfile.flush();
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +152,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
index 3a23fc18..45f87a7b 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
@@ -61,6 +61,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -78,6 +83,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -110,9 +120,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+
+    //throw std::invalid_argument("a or b negative");
+
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
+    myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -124,7 +138,8 @@ void gemm_half_q_half
 
     const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +153,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
diff --git a/server/exllamav2_kernels/setup.py b/server/exllamav2_kernels/setup.py
index 518db1df..4a16b546 100644
--- a/server/exllamav2_kernels/setup.py
+++ b/server/exllamav2_kernels/setup.py
@@ -1,5 +1,15 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+extra_cuda_cflags = ["-lineinfo", "-O3"]
+
+if torch.version.hip:
+    extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF"]
+
+extra_compile_args = {
+    "nvcc": extra_cuda_cflags,
+}
 
 setup(
     name="exllamav2_kernels",
@@ -11,6 +21,7 @@ setup(
                 "exllamav2_kernels/cuda/q_matrix.cu",
                 "exllamav2_kernels/cuda/q_gemm.cu",
             ],
+            extra_compile_args=extra_compile_args,
         )
     ],
     cmdclass={"build_ext": BuildExtension},
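
Note on the debugging pattern used in q_gemm.cu above: the patch reattaches the C-level stdout/stderr to the terminal with freopen("/dev/tty", ...) so that std::cout from inside the Python extension becomes visible, and copies a single element of a device buffer back to the host to sanity-check the pointer before calling cublasHgemm. A minimal standalone sketch of that pattern follows; it is only an illustration, not part of the patch (the file and function names are made up), assuming a CUDA toolchain with cuda_runtime.h and cuda_fp16.h available.

    // debug_peek.cu -- illustrative sketch, not part of the patch above.
    #include <cstdio>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    void debug_peek(const half* device_buf)
    {
        // Python (or the launcher) may have redirected stdout/stderr; reopening
        // them on /dev/tty makes prints from inside the extension show up on the
        // terminal that started the process.
        freopen("/dev/tty", "w", stdout);
        freopen("/dev/tty", "w", stderr);

        // Copy one element back to the host to confirm the device pointer is
        // readable, reporting the CUDA error here instead of failing later in cuBLAS.
        half host_val;
        cudaError_t err = cudaMemcpy(&host_val, device_buf, sizeof(half), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
        {
            std::printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));
            return;
        }
        std::cout << "first element: " << __half2float(host_val) << "\n" << std::flush;
    }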