Felix Marty 2024-01-25 18:03:43 +00:00 committed by OlivierDehaene
parent da002794b2
commit 2909047d2e
12 changed files with 66 additions and 933 deletions

.gitignore (vendored): 5 lines changed

@@ -7,3 +7,8 @@ router/tokenizer.json
*.hip
server/exllamav2_kernels/exllamav2_kernels/hip/
server/exllama_kernels/exllama_kernels/hip/
server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
server/exllama_kernels/exllama_kernels/hip_buffers.cuh
server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp


@@ -1,250 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#include <torch/extension.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#include "util_hip.cuh"
#include "tuning.h"
#include "hip_buffers.cuh"
#include "hip_func/q4_matrix.cuh"
#include "hip_func/q4_matmul.cuh"
#include "hip_func/column_remap.cuh"
// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
// exceptions, CUDA functions return with a hipError_t which we can parse and dump to the console.
void check_cuda(hipError_t ret)
{
switch (ret)
{
case hipSuccess:
break;
case cudaUnspecified:
printf(" **** Unspecified error\n");
TORCH_CHECK(false, "CUDA error");
break;
default:
printf(" **** CUDA error\n"); \
printf(" **** %s\n", hipGetErrorString(ret)); \
TORCH_CHECK(false, "CUDA error"); \
break;
}
}
// Some decluttering macros
#define STRINGIFY_(__x) #__x
#define STRINGIFY(__x) STRINGIFY_(__x)
#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
#define TORCH_CHECK_DEVICE_INDEX(__index) \
do { \
TORCH_CHECK(__index >= 0, "no device index"); \
TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
} while(0)
#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
do { \
TORCH_CHECK_DTYPE(__w, kInt); \
TORCH_CHECK_DTYPE(__w_scales, kHalf); \
TORCH_CHECK_DTYPE(__w_zeros, kInt); \
TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
} while(0)
int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
{
int groupsize = w.size(0) * 8 / w_zeros.size(0);
TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]")
return groupsize;
}
// Tuning parameters
ExLlamaTuning tuningParams;
void set_tuning_params
(
int matmul_recons_thd,
bool matmul_fused_remap,
bool matmul_no_half2
)
{
tuningParams.matmul_recons_thd = matmul_recons_thd;
tuningParams.matmul_fused_remap = matmul_fused_remap;
tuningParams.matmul_no_half2 = matmul_no_half2;
}
// Release all unmanaged objects allocated by the extension
void cleanup()
{
cleanup_buffers_cuda();
g_q4_free_matrices();
}
// Prepare buffers for forward pass
void prepare_buffers
(
torch::Device device,
torch::Tensor temp_state,
torch::Tensor temp_dq
)
{
int device_index = device.index();
TORCH_CHECK_DEVICE_INDEX(device_index);
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device);
prepare_buffers_cuda
(
device_index,
(half*) temp_state.data_ptr(),
(half*) temp_dq.data_ptr()
);
}
// Create Q4Matrix, return handle
uintptr_t make_q4
(
torch::Tensor qweight,
torch::Tensor qzeros,
torch::Tensor scales,
torch::Tensor g_idx,
int device
)
{
TORCH_CHECK_DTYPE(qweight, kInt);
TORCH_CHECK_DTYPE(qzeros, kInt);
TORCH_CHECK_DTYPE(scales, kHalf);
TORCH_CHECK_DTYPE_OPT(g_idx, kInt);
TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8);
TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1);
TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1);
int width = qweight.size(1);
int height = qweight.size(0) * 8;
int groups = qzeros.size(0);
Q4Matrix* m = new Q4Matrix
(
height,
width,
groups,
(uint32_t*) qweight.data_ptr(),
(uint32_t*) qzeros.data_ptr(),
(half*) scales.data_ptr(),
g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(),
device
);
g_q4_keep_matrix(m);
return reinterpret_cast<uintptr_t> (m);
}
// Matmul half @ quant -> half
void q4_matmul
(
torch::Tensor x,
uintptr_t w,
torch::Tensor out
)
{
Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);
TORCH_CHECK_DTYPE(x, kHalf);
TORCH_CHECK_DTYPE(out, kHalf);
TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(x));
int x_height = x.size(0);
if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
{
q4_matmul_cuda
(
&tuningParams,
(half*) x.data_ptr(),
x_height,
wm,
(half*) out.data_ptr()
);
}
else
{
q4_matmul_recons_cuda
(
&tuningParams,
(half*) x.data_ptr(),
x_height,
wm,
(half*) out.data_ptr(),
at::cuda::getCurrentCUDABlasHandle()
);
}
}
// Remap columns in half tensor
void column_remap
(
torch::Tensor x,
torch::Tensor x_new,
torch::Tensor x_map
)
{
TORCH_CHECK_DTYPE(x, kHalf);
TORCH_CHECK_DTYPE(x_new, kHalf);
TORCH_CHECK_DTYPE(x_map, kInt);
TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);
int height = x.size(0);
int width = x.size(1);
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(x));
column_remap_cuda
(
(half*) x.data_ptr(),
(half*) x_new.data_ptr(),
height,
width,
(uint32_t*) x_map.data_ptr()
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
m.def("cleanup", &cleanup, "cleanup");
m.def("make_q4", &make_q4, "make_q4");
m.def("q4_matmul", &q4_matmul, "q4_matmul");
}
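
The PYBIND11 module above exposes the kernels to Python as plain functions that trade in raw tensors plus an opaque uintptr_t handle. A minimal usage sketch, assuming the extension is built under the name exllama_kernels (the module name, shapes and wrapper below are illustrative, not part of this diff); prepare_buffers and set_tuning_params are assumed to have been called once per device beforehand, see the sketch at the end of this diff:

import torch
import exllama_kernels  # assumed build name of the extension above

none_tensor = torch.empty((1, 1), device="meta")  # "meta" device marks an absent optional arg

def q4_linear(x, qweight, qzeros, scales, g_idx=None):
    # qweight: int32 [in_features // 8, out_features], x: float16 [rows, in_features]
    handle = exllama_kernels.make_q4(
        qweight, qzeros, scales,
        g_idx if g_idx is not None else none_tensor,
        qweight.device.index,
    )
    out = torch.empty(x.shape[0], qweight.shape[1], dtype=torch.float16, device=x.device)
    exllama_kernels.q4_matmul(x, handle, out)  # out = x @ dequant(qweight)
    return out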


@@ -1,53 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _cuda_buffers_cuh
#define _cuda_buffers_cuh
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
const int CUDA_MAX_DEVICES = 16;
// #ifndef _cuda_buffers_cu
// extern __constant__ half2 q4_table[16][256];
// #endif
class CudaBuffers
{
public:
int device;
half* temp_state; // [max_hidden_rows * intermediate_size]
half* temp_dq; // size of largest quant tensor * 8
hipStream_t alt_stream_1;
hipStream_t alt_stream_2;
hipStream_t alt_stream_3;
hipEvent_t alt_stream_1_done;
hipEvent_t alt_stream_2_done;
hipEvent_t alt_stream_3_done;
CudaBuffers
(
int _device,
half* _temp_state,
half* _temp_dq
);
~CudaBuffers();
};
CudaBuffers* get_buffers(const int device_index);
void prepare_buffers_cuda
(
int _device,
half* _temp_state,
half* _temp_dq
);
void cleanup_buffers_cuda();
#endif


@@ -1,20 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _column_remap_cuh
#define _column_remap_cuh
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
void column_remap_cuda
(
const half* x,
half* x_new,
const int x_height,
const int x_width,
const uint32_t* x_map
);
#endif
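
column_remap_cuda above reorders activation columns through an index map before the quantized matmul. A torch equivalent of what the kernel computes, assuming (consistently with how x_map is consumed in the matrix header below) that output column c is gathered from input column x_map[c]:

import torch

def column_remap(x: torch.Tensor, x_map: torch.Tensor) -> torch.Tensor:
    # x: float16 [height, width], x_map: int32 [width] permutation of column indices
    return x.index_select(1, x_map.long()).contiguous()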


@@ -1,295 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _matrix_cuh
#define _matrix_cuh
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
class MatrixView_half
{
public:
const half* data;
const int height;
const int width;
__device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
: data(data), height(height), width(width)
{ }
__device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
__device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
__device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
__device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
};
class MatrixView_half_rw
{
public:
half* data;
const int height;
const int width;
__device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
: data(data), height(height), width(width)
{ }
__device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
__device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
__device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
__device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
__device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
};
class MatrixView_q4_row
{
public:
const uint32_t* data;
const int height;
const int width;
__device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
: data(data), height(height), width(width)
{ }
__device__ __forceinline__ int item(int row, int column) const
{
int shift = (column & 0x07) * 4;
return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
}
};
class MatrixView_q4_column
{
public:
const uint32_t* data;
const int height;
const int width;
__device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
: data(data), height(height), width(width)
{ }
__device__ __forceinline__ int item(int row, int column) const
{
int shift = (row & 0x07) * 4;
return (data[row / 8 * width + column] >> shift) & 0x0f;
}
__device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
__device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
};
// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu
// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale
__device__ __forceinline__ half2 dot_product_8
(
const half2 acc,
MatrixView_half& h_,
const int h_row,
const int h_column, // divisible by 8
MatrixView_q4_column& v_,
const int v_row, // divisible by 8
const int v_column,
const half2 v_scale_2,
const uint32_t v_zero, // + 1 (!!)
const int count
)
{
const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column);
const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
half2 result = acc;
for (int i = 0; i < count; i++)
{
uint32_t v_read = *v_ptr; v_ptr += v_.width;
half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero);
half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero);
half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero);
half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero);
half2 v_01 = __halves2half2(v_0, v_1);
half2 v_23 = __halves2half2(v_2, v_3);
half2 v_45 = __halves2half2(v_4, v_5);
half2 v_67 = __halves2half2(v_6, v_7);
// half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently)
// half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff];
// half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff];
// half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ];
half2 tmp = __hmul2(*h_ptr++, v_01);
tmp = __hfma2(*h_ptr++, v_23, tmp);
tmp = __hfma2(*h_ptr++, v_45, tmp);
tmp = __hfma2(*h_ptr++, v_67, tmp);
result = __hfma2(v_scale_2, tmp, result);
}
return result;
}
__device__ __forceinline__ half dot_product_8_h
(
const half acc,
MatrixView_half& h_,
const int h_row,
const int h_column, // divisible by 8
MatrixView_q4_column& v_,
const int v_row, // divisible by 8
const int v_column,
const half v_scale,
const uint32_t v_zero, // + 1 (!!)
const int count
)
{
const half* h_ptr = h_.item_ptr(h_row, h_column);
const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
half result = acc;
for (int i = 0; i < count; i++)
{
uint32_t v_read = *v_ptr; v_ptr += v_.width;
half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero);
half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero);
half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero);
half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero);
half tmp = __hmul(*h_ptr++, v_0);
tmp = __hfma(*h_ptr++, v_1, tmp);
tmp = __hfma(*h_ptr++, v_2, tmp);
tmp = __hfma(*h_ptr++, v_3, tmp);
tmp = __hfma(*h_ptr++, v_4, tmp);
tmp = __hfma(*h_ptr++, v_5, tmp);
tmp = __hfma(*h_ptr++, v_6, tmp);
tmp = __hfma(*h_ptr++, v_7, tmp);
result = __hfma(v_scale, tmp, result);
}
return result;
}
// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map
__device__ __forceinline__ half2 dot_product_8_x_map
(
const half2 acc,
MatrixView_half& h_,
const int h_row,
const int h_column, // divisible by 8
MatrixView_q4_column& v_,
const int v_row, // divisible by 8
const int v_column,
const half2 v_scale_2,
const uint32_t v_zero, // + 1 (!!)
const int count,
const uint32_t* x_map
)
{
const half* h_ptr = h_.item_ptr(h_row, 0);
const uint32_t* x_map_ptr = x_map + h_column;
const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
half2 result = acc;
for (int i = 0; i < count; i++)
{
uint32_t v_read = *v_ptr; v_ptr += v_.width;
half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero);
half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero);
half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero);
half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero);
half2 v_01 = __halves2half2(v_0, v_1);
half2 v_23 = __halves2half2(v_2, v_3);
half2 v_45 = __halves2half2(v_4, v_5);
half2 v_67 = __halves2half2(v_6, v_7);
half h_0 = h_ptr[*x_map_ptr++];
half h_1 = h_ptr[*x_map_ptr++];
half h_2 = h_ptr[*x_map_ptr++];
half h_3 = h_ptr[*x_map_ptr++];
half h_4 = h_ptr[*x_map_ptr++];
half h_5 = h_ptr[*x_map_ptr++];
half h_6 = h_ptr[*x_map_ptr++];
half h_7 = h_ptr[*x_map_ptr++];
half2 h_01 = __halves2half2(h_0, h_1);
half2 h_23 = __halves2half2(h_2, h_3);
half2 h_45 = __halves2half2(h_4, h_5);
half2 h_67 = __halves2half2(h_6, h_7);
half2 tmp = __hmul2(h_01, v_01);
tmp = __hfma2(h_23, v_23, tmp);
tmp = __hfma2(h_45, v_45, tmp);
tmp = __hfma2(h_67, v_67, tmp);
result = __hfma2(v_scale_2, tmp, result);
}
return result;
}
__device__ __forceinline__ half dot_product_8_x_map_h
(
const half acc,
MatrixView_half& h_,
const int h_row,
const int h_column, // divisible by 8
MatrixView_q4_column& v_,
const int v_row, // divisible by 8
const int v_column,
const half v_scale,
const uint32_t v_zero, // + 1 (!!)
const int count,
const uint32_t* x_map
)
{
const half* h_ptr = h_.item_ptr(h_row, 0);
const uint32_t* x_map_ptr = x_map + h_column;
const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
half result = acc;
for (int i = 0; i < count; i++)
{
uint32_t v_read = *v_ptr; v_ptr += v_.width;
half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero);
half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero);
half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero);
half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero);
half tmp = __hmul(h_ptr[*x_map_ptr++], v_0);
tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp);
tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp);
result = __hfma(v_scale, tmp, result);
}
return result;
}
#endif
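
Each dot-product helper above unpacks eight 4-bit weights from a single uint32, subtracts the (pre-incremented) zero point and applies the scale. The same per-word arithmetic, restated in NumPy for reference:

import numpy as np

def unpack_q4_word(word: int, v_zero: int, v_scale: float) -> np.ndarray:
    # word holds 8 nibbles, least significant first, exactly as read in dot_product_8
    nibbles = np.array([(word >> (4 * i)) & 0x0F for i in range(8)], dtype=np.int32)
    return ((nibbles - v_zero) * v_scale).astype(np.float16)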


@@ -1,34 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
#ifndef _util_cuh
#define _util_cuh
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified hipErrorApiFailureBase
#endif
// React to failure on return code != hipSuccess
#define _cuda_check(fn) \
do { \
{_cuda_err = fn;} \
if (_cuda_err != hipSuccess) goto _cuda_fail; \
} while(false)
// React to failure on return code == 0
#define _alloc_check(fn) \
do { \
if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
else _cuda_err = hipSuccess; \
} while(false)
#endif


@@ -23,10 +23,6 @@
#include "q_gemm_kernel.cuh"
#include "q_gemm_kernel_gptq.cuh"
#include <iostream>
#include <fstream>
using namespace std;
#include <stdio.h>
void gemm_half_q_half_cuda_part
(
const half* a,
@@ -42,11 +38,8 @@ void gemm_half_q_half_cuda_part
bool mul_r_weights
)
{
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
if (!b->is_gptq)
{
myfile << "go in is_gptq path" << "\n";
myfile.flush();
dim3 blockDim, gridDim;
blockDim.x = EXL2_BLOCK_KN_SIZE;
blockDim.y = 1;
@@ -57,8 +50,6 @@ void gemm_half_q_half_cuda_part
fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights);
myfile << "launch kernel" << "\n";
myfile.flush();
kernel<<<gridDim, blockDim>>>
(
a,
@@ -119,8 +110,6 @@ void gemm_half_q_half_cuda_part
r_weights_stride
);
}
myfile.flush();
myfile.close();
}
void gemm_half_q_half_cuda
@@ -140,51 +129,55 @@ void gemm_half_q_half_cuda
bool mul_r_weights
)
{
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
{
freopen("/dev/tty", "w", stdout);
freopen("/dev/tty", "w", stderr);
std::cout << "going in cublas path" << "\n";
// Reconstruct FP16 matrix, then cuBLAS
if (!temp_dq) temp_dq = b->temp_dq;
// if (!temp_dq) {
// half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
// cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
// cudaError_t error = cudaGetLastError();
// if (error != cudaSuccess)
// printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
// for (int i = 0; i < size_n * size_k; i++) {
// temp_dq_cpu[i] = 0.0f;
// }
// cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
// error = cudaGetLastError();
// if (error != cudaSuccess)
// printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error));
// }
if (!temp_dq) {
temp_dq = b->temp_dq;
b->reconstruct(temp_dq);
// half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half));
// cudaMalloc(&temp_dq, size_n * size_k * sizeof(half));
// cudaError_t error = cudaGetLastError();
// if (error != cudaSuccess)
// printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error));
// for (int i = 0; i < size_n * size_k; i++) {
// temp_dq_cpu[i] = __float2half(0.0f);
// }
// cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice);
// b->reconstruct(temp_dq);
}
//temp_dq = b->temp_dq;
//b->reconstruct(temp_dq);
//cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
const half alpha = __float2half(1.0f);
const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
freopen("/dev/tty", "w", stdout);
freopen("/dev/tty", "w", stderr);
std:cout << "hey it's me\n" << std::flush;
//half* val = temp_dq + (size_n * size_k - 1) * sizeof(half);
//half* val = temp_dq + 1;
half* my_val_host;
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
printf("Error before: %s\n", cudaGetErrorString(error));
cudaError_t err = cudaMemcpy(my_val_host, temp_dq, sizeof(half), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(err));
// float my_val_float = __half2float(*temp_dq);
//std::cout << "temp_dq: " << my_val_float << "\n" << std::flush;
std::cout << "call cublasHgemm" << "\n";
std::cout << "call cublasHgemm size_n" << size_n << "\n";
std::cout << "call cublasHgemm size_m" << size_m << "\n";
std::cout << "call cublasHgemm size_k" << size_k << "\n";
std::cout << "call cublasHgemm b width" << b->width << "\n";
std::cout << "call cublasHgemm b height" << b->height << "\n" << std::flush;
cublasHgemm(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
@@ -217,9 +210,6 @@ void gemm_half_q_half_cuda
{
// Quantized matmul
std::cout << "going in gemm_half_q_half_cuda_part path" << "\n";
int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
int max_chunks = size_m / block_m_size_max;
int last_chunk = max_chunks * block_m_size_max;
@@ -227,14 +217,11 @@ void gemm_half_q_half_cuda
if (max_chunks)
{
std::cout << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
}
if (last_chunk_size)
{
std::cout << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
}
}
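
When size_m stays at or below MAX_Q_GEMM_ROWS (or force_cuda is set), gemm_half_q_half_cuda above runs the quantized kernel in chunks of at most block_m_size_max rows plus a tail. The split arithmetic, restated in Python:

def split_rows(size_m: int, block_m_size_max: int):
    max_chunks = size_m // block_m_size_max        # number of full blocks
    last_chunk = max_chunks * block_m_size_max     # rows covered by full blocks
    last_chunk_size = size_m - last_chunk          # tail rows, if any
    return last_chunk, last_chunk_size

# e.g. size_m=50, block_m_size_max=16: one call covering 48 rows, then a 2-row tail call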


@@ -14,11 +14,6 @@
#define THREADS_X 32
#define THREADS_Y 32
#include <iostream>
#include <fstream>
using namespace std;
// Shuffle quantized data on load
__global__ void shuffle_kernel
@@ -92,7 +87,6 @@ QMatrix::QMatrix
is_gptq = (_gptq_qzeros != NULL);
std::cout << "is_gptq in QMatrix init " << is_gptq << "\n";
if (is_gptq)
{
gptq_groupsize = 1;
@@ -507,8 +501,6 @@ void QMatrix::reconstruct(half* out)
}
else
{
std::cout << "reconstructing with reconstruct_gptq_kernel" << "\n";
gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4);
reconstruct_gptq_kernel<<<gridDim, blockDim>>>
(


@@ -16,7 +16,7 @@
#include <iostream>
#include <fstream>
using namespace std;
#include <stdio.h>
// Some decluttering macros
#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
@@ -60,11 +60,6 @@ uintptr_t make_q_matrix
int groups;
int height;
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile << "in make_q_matrix" << "\n";
myfile.flush();
myfile.close();
if (!q_scale.device().is_meta())
{
TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -82,11 +77,6 @@ uintptr_t make_q_matrix
TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile2 << "q_scale is meta" << q_scale.device().is_meta() << "\n";
myfile2.flush();
myfile2.close();
QMatrix* m = new QMatrix
(
device,
@@ -119,14 +109,6 @@ void gemm_half_q_half
bool force_cuda
)
{
//throw std::invalid_argument("a or b negative");
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile << "start gemm_half_q_half" << "\n";
myfile.flush();
myfile.close();
QMatrix* qm = reinterpret_cast<QMatrix*> (b);
TORCH_CHECK_DTYPE(a, kHalf);
@@ -137,8 +119,6 @@ void gemm_half_q_half
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
//myfile << "call gemm_half_q_half_cuda" << "\n";
//myfile.flush();
gemm_half_q_half_cuda
(
at::cuda::getCurrentCUDABlasHandle(),


@@ -1,164 +0,0 @@
// !!! This is a file automatically generated by hipify!!!
#include <torch/extension.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdio>
#include "config.h"
#include "hip/q_matrix.cuh"
#include "hip/q_gemm.cuh"
#include "cpp/util.h"
#include <iostream>
#include <fstream>
using namespace std;
// Some decluttering macros
#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
// Quant matrix
uintptr_t make_q_matrix
(
torch::Tensor q_weight,
torch::Tensor q_perm,
torch::Tensor q_invperm,
torch::Tensor q_scale,
torch::Tensor q_scale_max,
torch::Tensor q_groups,
torch::Tensor q_group_map,
torch::Tensor gptq_qzeros,
torch::Tensor gptq_scales,
torch::Tensor gptq_g_idx,
torch::Tensor temp_dq
)
{
TORCH_CHECK_DTYPE(q_weight, kInt);
TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
TORCH_CHECK_DTYPE_OPT(q_group_map, kShort);
TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);
TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);
int device = q_weight.device().index();
int width = q_weight.size(1);
int groups;
int height;
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile << "in make_q_matrix" << "\n";
myfile.flush();
myfile.close();
if (!q_scale.device().is_meta())
{
TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
groups = q_scale.size(0);
height = q_invperm.size(0);
}
else
{
TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
groups = gptq_qzeros.size(0);
height = q_weight.size(0) * 8;
}
TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile2 << "q_scale is meta" << q_scale.device().is_meta() << "\n";
myfile2.flush();
myfile2.close();
QMatrix* m = new QMatrix
(
device,
height,
width,
groups,
(uint32_t*) q_weight.data_ptr(),
q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
q_group_map.device().is_meta() ? NULL : (uint16_t*) q_group_map.data_ptr(),
gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
(half*) temp_dq.data_ptr()
);
if (m->failed) throw std::runtime_error("CUDA out of memory");
return reinterpret_cast<uintptr_t> (m);
}
void gemm_half_q_half
(
torch::Tensor a,
uintptr_t b,
torch::Tensor c,
bool force_cuda
)
{
//throw std::invalid_argument("a or b negative");
ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
myfile << "start gemm_half_q_half" << "\n";
myfile.flush();
myfile.close();
QMatrix* qm = reinterpret_cast<QMatrix*> (b);
TORCH_CHECK_DTYPE(a, kHalf);
TORCH_CHECK_DTYPE(c, kHalf);
TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes")
TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(a));
//myfile << "call gemm_half_q_half_cuda" << "\n";
//myfile.flush();
gemm_half_q_half_cuda
(
at::cuda::getCurrentCUDABlasHandle(),
(const half*) a.data_ptr(),
qm,
(half*) c.data_ptr(),
c.size(0), // m
c.size(1), // n
a.size(1), // k
true,
NULL,
force_cuda
);
}
// Bindings
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
}
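
make_q_matrix above takes every optional tensor positionally; the TORCH_CHECK_DTYPE_OPT checks and the is_meta() ? NULL : ... initializers show that callers mark an argument as absent by passing a tensor on the meta device. A hypothetical GPTQ-style call (module name and wrapper are assumptions, not part of this diff):

import torch
import exllamav2_kernels  # assumed build name of the extension above

none_tensor = torch.empty((1, 1), device="meta")  # mapped to NULL on the C++ side

def make_gptq_matrix(q_weight, gptq_qzeros, gptq_scales, gptq_g_idx, temp_dq):
    return exllamav2_kernels.make_q_matrix(
        q_weight,      # packed int32 weights
        none_tensor,   # q_perm        (EXL2-only fields left empty)
        none_tensor,   # q_invperm
        none_tensor,   # q_scale
        none_tensor,   # q_scale_max
        none_tensor,   # q_groups
        none_tensor,   # q_group_map
        gptq_qzeros,
        gptq_scales,
        gptq_g_idx if gptq_g_idx is not None else none_tensor,
        temp_dq,       # float16 scratch with at least width * height elements
    )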


@@ -31,7 +31,6 @@ from text_generation_server.utils.dist import MEMORY_FRACTION
tracer = trace.get_tracer(__name__)
from loguru import logger
@dataclass
class FlashCausalLMBatch(Batch):
@@ -680,12 +679,9 @@ class FlashCausalLM(Model):
return FlashCausalLMBatch
def warmup(self, batch: FlashCausalLMBatch):
logger.info("in this warmup start")
torch.cuda.empty_cache()
logger.info("in this warmup after empty cache")
#try:
try:
cache_manager = set_cache_manager(
batch.blocks,
self.num_layers,
@@ -695,20 +691,16 @@ class FlashCausalLM(Model):
self.dtype,
self.device,
)
logger.info("in this warmup after set_cache_manager")
_, batch, _ = self.generate_token(batch)
logger.info("in this warmup after generate_token")
# except torch.cuda.OutOfMemoryError as e:
# raise RuntimeError(
# f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
# f"You need to decrease `--max-batch-prefill-tokens`"
# ) from e
except torch.cuda.OutOfMemoryError as e:
raise RuntimeError(
f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
f"You need to decrease `--max-batch-prefill-tokens`"
) from e
torch.cuda.synchronize(self.device)
logger.info("in this warmup after sync")
# Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
# Calculate the number of blocks that can be allocated with the free memory
dtype_size = torch.tensor([], dtype=self.dtype).element_size()
@@ -828,14 +820,11 @@ class FlashCausalLM(Model):
batch.block_tables_tensor = block_tables_tensor
batch.slots = slots
logger.info("callign forward in generate_token")
try:
out = self.forward(batch)
# try:
# out = self.forward(batch)
# except Exception as e:
# del batch
# raise e
logger.info("finished forward in generate_token")
except Exception as e:
del batch
raise e
if isinstance(out, tuple):
out, speculative_logits = out
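
The warmup hunk earlier in this file stops right where the vLLM-inspired block computation begins (only dtype_size is visible). A sketch of the kind of arithmetic that comment refers to, with memory_fraction playing the role of the MEMORY_FRACTION import shown in the first hunk header and every other name assumed rather than taken from the diff:

import torch

def estimate_kv_blocks(device, dtype, num_layers, num_kv_heads, head_size,
                       block_size=16, memory_fraction=0.9):
    dtype_size = torch.tensor([], dtype=dtype).element_size()
    # bytes for one paged-attention block of K and V cache across all layers
    cache_block_bytes = 2 * num_layers * block_size * num_kv_heads * head_size * dtype_size
    free_memory, total_memory = torch.cuda.mem_get_info(device)
    # keep (1 - memory_fraction) of the card in reserve for weights and activations
    usable = max(0, free_memory - (1 - memory_fraction) * total_memory)
    return int(usable // cache_block_bytes)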


@@ -63,7 +63,6 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
async def Warmup(self, request, context):
logger.info("IN WARMUP")
if self.quantize == "gptq":
try:
# When using GPTQ, Exllama kernels need some global kernels
@@ -79,7 +78,6 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
except ImportError:
pass
logger.info("after quantize == gptq")
if (
self.model.batch_type == IdeficsCausalLMBatch
): # Hack, i would rather use kwargs in the `from_pb` call
@@ -94,10 +92,8 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
batch = self.model.batch_type.from_pb(
request.batch, self.model.tokenizer, self.model.dtype, self.model.device
)
logger.info("calling model.warmup")
max_supported_total_tokens = self.model.warmup(batch)
logger.info("end warmup")
return generate_pb2.WarmupResponse(
max_supported_total_tokens=max_supported_total_tokens
)
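
The Warmup handler's GPTQ branch above alludes to global buffers that the Exllama kernels need before the first forward pass. Given the prepare_buffers and set_tuning_params bindings removed earlier in this commit, that setup plausibly looks like the sketch below; the buffer sizes, tuning values and module name are assumptions, not values from this diff:

import torch
import exllama_kernels  # assumed build name of the extension

def create_exllama_buffers(device, max_total_tokens, hidden_size, max_dq_elements):
    # scratch for intermediate states and for dequantized weight tiles
    temp_state = torch.zeros(max_total_tokens * hidden_size, dtype=torch.float16, device=device)
    temp_dq = torch.zeros(max_dq_elements, dtype=torch.float16, device=device)
    exllama_kernels.prepare_buffers(device, temp_state, temp_dq)
    # reconstruct + cuBLAS for batches of 8 rows or more, no fused remap, keep half2 math
    exllama_kernels.set_tuning_params(8, False, False)
    return temp_state, temp_dq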