diff --git a/.gitignore b/.gitignore
index 5095e9ce..1f9ba162 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,8 @@ router/tokenizer.json
 *.hip
 server/exllamav2_kernels/exllamav2_kernels/hip/
 server/exllama_kernels/exllama_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip_func/
+*_hip.cuh
+server/exllama_kernels/exllama_kernels/hip_buffers.cuh
+server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
+
diff --git a/server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp b/server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
deleted file mode 100644
index 92ba4575..00000000
--- a/server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "util_hip.cuh"
-#include "tuning.h"
-#include "hip_buffers.cuh"
-#include "hip_func/q4_matrix.cuh"
-#include "hip_func/q4_matmul.cuh"
-#include "hip_func/column_remap.cuh"
-
-// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
-// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
-// exceptions, CUDA functions return with a hipError_t which we can parse and dump to the console.
-
-void check_cuda(hipError_t ret)
-{
-    switch (ret)
-    {
-        case hipSuccess:
-            break;
-
-        case cudaUnspecified:
-            printf(" **** Unspecified error\n");
-            TORCH_CHECK(false, "CUDA error");
-            break;
-
-        default:
-            printf(" **** CUDA error\n"); \
-            printf(" **** %s\n", hipGetErrorString(ret)); \
-            TORCH_CHECK(false, "CUDA error"); \
-            break;
-    }
-}
-
-// Some decluttering macros
-
-#define STRINGIFY_(__x) #__x
-#define STRINGIFY(__x) STRINGIFY_(__x)
-#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
-
-#define TORCH_CHECK_DEVICE_INDEX(__index) \
-do { \
-    TORCH_CHECK(__index >= 0, "no device index"); \
-    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
-} while(0)
-
-#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
-do { \
-    TORCH_CHECK_DTYPE(__w, kInt); \
-    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
-    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
-    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
-    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
-    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
-    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
-} while(0)
-
-int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
-{
-    int groupsize = w.size(0) * 8 / w_zeros.size(0);
-    TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a
multiple of zeros.shape[-2]") - return groupsize; -} - - -// Tuning parameters - -ExLlamaTuning tuningParams; - -void set_tuning_params -( - int matmul_recons_thd, - bool matmul_fused_remap, - bool matmul_no_half2 -) -{ - tuningParams.matmul_recons_thd = matmul_recons_thd; - tuningParams.matmul_fused_remap = matmul_fused_remap; - tuningParams.matmul_no_half2 = matmul_no_half2; -} - - -// Release all unmanaged objects allocated by the extension - -void cleanup() -{ - cleanup_buffers_cuda(); - g_q4_free_matrices(); -} - - -// Prepare buffers for forward pass - -void prepare_buffers -( - torch::Device device, - torch::Tensor temp_state, - torch::Tensor temp_dq -) -{ - int device_index = device.index(); - TORCH_CHECK_DEVICE_INDEX(device_index); - const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device); - - prepare_buffers_cuda - ( - device_index, - (half*) temp_state.data_ptr(), - (half*) temp_dq.data_ptr() - ); -} - - -// Create Q4Matrix, return handle - -uintptr_t make_q4 -( - torch::Tensor qweight, - torch::Tensor qzeros, - torch::Tensor scales, - torch::Tensor g_idx, - int device -) -{ - TORCH_CHECK_DTYPE(qweight, kInt); - TORCH_CHECK_DTYPE(qzeros, kInt); - TORCH_CHECK_DTYPE(scales, kHalf); - TORCH_CHECK_DTYPE_OPT(g_idx, kInt); - TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8); - TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1); - TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1); - - int width = qweight.size(1); - int height = qweight.size(0) * 8; - int groups = qzeros.size(0); - - Q4Matrix* m = new Q4Matrix - ( - height, - width, - groups, - - (uint32_t*) qweight.data_ptr(), - (uint32_t*) qzeros.data_ptr(), - (half*) scales.data_ptr(), - g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(), - - device - ); - - g_q4_keep_matrix(m); - return reinterpret_cast (m); -} - - -// Matmul half @ quant -> half - -void q4_matmul -( - torch::Tensor x, - uintptr_t w, - torch::Tensor out -) -{ - Q4Matrix* wm = reinterpret_cast (w); - - TORCH_CHECK_DTYPE(x, kHalf); - TORCH_CHECK_DTYPE(out, kHalf); - TORCH_CHECK_SHAPES(x, 0, out, 0, 1); - TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes") - - const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(x)); - - int x_height = x.size(0); - - if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd) - { - q4_matmul_cuda - ( - &tuningParams, - (half*) x.data_ptr(), - x_height, - wm, - (half*) out.data_ptr() - ); - } - else - { - q4_matmul_recons_cuda - ( - &tuningParams, - (half*) x.data_ptr(), - x_height, - wm, - (half*) out.data_ptr(), - at::cuda::getCurrentCUDABlasHandle() - ); - } -} - - -// Remap columns in half tensor - -void column_remap -( - torch::Tensor x, - torch::Tensor x_new, - torch::Tensor x_map -) -{ - TORCH_CHECK_DTYPE(x, kHalf); - TORCH_CHECK_DTYPE(x_new, kHalf); - TORCH_CHECK_DTYPE(x_map, kInt); - TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1); - - int height = x.size(0); - int width = x.size(1); - - const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(x)); - - column_remap_cuda - ( - (half*) x.data_ptr(), - (half*) x_new.data_ptr(), - height, - width, - (uint32_t*) x_map.data_ptr() - ); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); - m.def("prepare_buffers", &prepare_buffers, "prepare_buffers"); - m.def("cleanup", &cleanup, "cleanup"); - m.def("make_q4", &make_q4, "make_q4"); - m.def("q4_matmul", &q4_matmul, "q4_matmul"); -} diff --git 
a/server/exllama_kernels/exllama_kernels/hip_buffers.cuh b/server/exllama_kernels/exllama_kernels/hip_buffers.cuh deleted file mode 100644 index 008002cb..00000000 --- a/server/exllama_kernels/exllama_kernels/hip_buffers.cuh +++ /dev/null @@ -1,53 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_buffers_cuh -#define _cuda_buffers_cuh - -#include -#include -#include -#include - -const int CUDA_MAX_DEVICES = 16; - -// #ifndef _cuda_buffers_cu -// extern __constant__ half2 q4_table[16][256]; -// #endif - -class CudaBuffers -{ -public: - int device; - - half* temp_state; // [max_hidden_rows * intermediate_size] - half* temp_dq; // size of largest quant tensor * 8 - - hipStream_t alt_stream_1; - hipStream_t alt_stream_2; - hipStream_t alt_stream_3; - hipEvent_t alt_stream_1_done; - hipEvent_t alt_stream_2_done; - hipEvent_t alt_stream_3_done; - - CudaBuffers - ( - int _device, - half* _temp_state, - half* _temp_dq - ); - ~CudaBuffers(); -}; - -CudaBuffers* get_buffers(const int device_index); - -void prepare_buffers_cuda -( - int _device, - half* _temp_state, - half* _temp_dq -); - -void cleanup_buffers_cuda(); - -#endif diff --git a/server/exllama_kernels/exllama_kernels/hip_func/column_remap.cuh b/server/exllama_kernels/exllama_kernels/hip_func/column_remap.cuh deleted file mode 100644 index 7c840b2e..00000000 --- a/server/exllama_kernels/exllama_kernels/hip_func/column_remap.cuh +++ /dev/null @@ -1,20 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _column_remap_cuh -#define _column_remap_cuh - -#include -#include -#include - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -); - -#endif \ No newline at end of file diff --git a/server/exllama_kernels/exllama_kernels/matrix_hip.cuh b/server/exllama_kernels/exllama_kernels/matrix_hip.cuh deleted file mode 100644 index 78eb0d4b..00000000 --- a/server/exllama_kernels/exllama_kernels/matrix_hip.cuh +++ /dev/null @@ -1,295 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _matrix_cuh -#define _matrix_cuh - -#include -#include - -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } -}; - -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } -}; - -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } -}; - -// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale - -__device__ __forceinline__ half2 dot_product_8 -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count -) -{ - const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - -// half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) -// half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; -// half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; -// half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; - - half2 tmp = __hmul2(*h_ptr++, v_01); - tmp = __hfma2(*h_ptr++, v_23, tmp); - tmp = __hfma2(*h_ptr++, v_45, tmp); - tmp = __hfma2(*h_ptr++, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count -) -{ - const half* h_ptr = h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(*h_ptr++, v_0); - tmp = __hfma(*h_ptr++, v_1, tmp); - tmp = __hfma(*h_ptr++, v_2, tmp); - tmp = __hfma(*h_ptr++, v_3, tmp); - tmp = __hfma(*h_ptr++, v_4, tmp); - tmp = __hfma(*h_ptr++, v_5, tmp); - tmp = __hfma(*h_ptr++, v_6, tmp); - tmp = __hfma(*h_ptr++, v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map - -__device__ __forceinline__ half2 dot_product_8_x_map -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - - half h_0 = h_ptr[*x_map_ptr++]; - half h_1 = h_ptr[*x_map_ptr++]; - half h_2 = h_ptr[*x_map_ptr++]; - half h_3 = h_ptr[*x_map_ptr++]; - half h_4 = h_ptr[*x_map_ptr++]; - half h_5 = h_ptr[*x_map_ptr++]; - half h_6 = h_ptr[*x_map_ptr++]; - half h_7 = h_ptr[*x_map_ptr++]; - - half2 h_01 = __halves2half2(h_0, h_1); - half2 h_23 = __halves2half2(h_2, h_3); - half2 h_45 = __halves2half2(h_4, h_5); - half2 h_67 = __halves2half2(h_6, h_7); - - half2 tmp = __hmul2(h_01, v_01); - tmp = __hfma2(h_23, v_23, tmp); - tmp = __hfma2(h_45, v_45, tmp); - tmp = __hfma2(h_67, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_x_map_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); - tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -#endif diff --git a/server/exllama_kernels/exllama_kernels/util_hip.cuh b/server/exllama_kernels/exllama_kernels/util_hip.cuh deleted file mode 100644 index 955aedbb..00000000 --- a/server/exllama_kernels/exllama_kernels/util_hip.cuh +++ /dev/null @@ -1,34 +0,0 @@ -// !!! 
This is a file automatically generated by hipify!!! -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _util_cuh -#define _util_cuh - -#include -#include -#include -#include - -#if defined(USE_ROCM) -#define cudaUnspecified hipErrorUnknown -#else -#define cudaUnspecified hipErrorApiFailureBase -#endif - -// React to failure on return code != hipSuccess - -#define _cuda_check(fn) \ -do { \ - {_cuda_err = fn;} \ - if (_cuda_err != hipSuccess) goto _cuda_fail; \ -} while(false) - -// React to failure on return code == 0 - -#define _alloc_check(fn) \ -do { \ - if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ - else _cuda_err = hipSuccess; \ -} while(false) - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu index e9eea6f0..6d9aaeda 100644 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu +++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu @@ -23,10 +23,6 @@ #include "q_gemm_kernel.cuh" #include "q_gemm_kernel_gptq.cuh" -#include -#include -using namespace std; -#include void gemm_half_q_half_cuda_part ( const half* a, @@ -42,11 +38,8 @@ void gemm_half_q_half_cuda_part bool mul_r_weights ) { - ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); if (!b->is_gptq) { - myfile << "go in is_gptq path" << "\n"; - myfile.flush(); dim3 blockDim, gridDim; blockDim.x = EXL2_BLOCK_KN_SIZE; blockDim.y = 1; @@ -57,8 +50,6 @@ void gemm_half_q_half_cuda_part fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights); - myfile << "launch kernel" << "\n"; - myfile.flush(); kernel<<>> ( a, @@ -119,8 +110,6 @@ void gemm_half_q_half_cuda_part r_weights_stride ); } - myfile.flush(); - myfile.close(); } void gemm_half_q_half_cuda @@ -140,51 +129,55 @@ void gemm_half_q_half_cuda bool mul_r_weights ) { - ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); if (size_m > MAX_Q_GEMM_ROWS && !force_cuda) { - freopen("/dev/tty", "w", stdout); - freopen("/dev/tty", "w", stderr); - - std::cout << "going in cublas path" << "\n"; - // Reconstruct FP16 matrix, then cuBLAS - if (!temp_dq) temp_dq = b->temp_dq; - b->reconstruct(temp_dq); + // if (!temp_dq) { + // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half)); + + // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half)); + // cudaError_t error = cudaGetLastError(); + // if (error != cudaSuccess) + // printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error)); + + // for (int i = 0; i < size_n * size_k; i++) { + // temp_dq_cpu[i] = 0.0f; + // } + + // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice); + // error = cudaGetLastError(); + // if (error != cudaSuccess) + // printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(error)); + // } + + if (!temp_dq) { + temp_dq = b->temp_dq; + b->reconstruct(temp_dq); + + // half* temp_dq_cpu = (half*)malloc(size_n * size_k * sizeof(half)); + + // cudaMalloc(&temp_dq, size_n * size_k * sizeof(half)); + // cudaError_t error = cudaGetLastError(); + // if (error != cudaSuccess) + // printf("Error in cudaMalloc: %s\n", cudaGetErrorString(error)); + + // for (int i = 0; i < size_n * size_k; i++) { + // temp_dq_cpu[i] = __float2half(0.0f); + // } + + // cudaMemcpy(temp_dq, temp_dq_cpu, size_n * size_k * sizeof(half), cudaMemcpyHostToDevice); + // b->reconstruct(temp_dq); + } + + //temp_dq = b->temp_dq; + //b->reconstruct(temp_dq); 
//cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH); const half alpha = __float2half(1.0f); const half beta = clear ? __float2half(0.0f) : __float2half(1.0f); - freopen("/dev/tty", "w", stdout); - freopen("/dev/tty", "w", stderr); - - std:cout << "hey it's me\n" << std::flush; - - //half* val = temp_dq + (size_n * size_k - 1) * sizeof(half); - //half* val = temp_dq + 1; - - half* my_val_host; - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) - printf("Error before: %s\n", cudaGetErrorString(error)); - - cudaError_t err = cudaMemcpy(my_val_host, temp_dq, sizeof(half), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) - printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(err)); - - // float my_val_float = __half2float(*temp_dq); - //std::cout << "temp_dq: " << my_val_float << "\n" << std::flush; - - std::cout << "call cublasHgemm" << "\n"; - std::cout << "call cublasHgemm size_n" << size_n << "\n"; - std::cout << "call cublasHgemm size_m" << size_m << "\n"; - std::cout << "call cublasHgemm size_k" << size_k << "\n"; - std::cout << "call cublasHgemm b width" << b->width << "\n"; - std::cout << "call cublasHgemm b height" << b->height << "\n" << std::flush; cublasHgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, @@ -217,9 +210,6 @@ void gemm_half_q_half_cuda { // Quantized matmul - - std::cout << "going in gemm_half_q_half_cuda_part path" << "\n"; - int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX; int max_chunks = size_m / block_m_size_max; int last_chunk = max_chunks * block_m_size_max; @@ -227,14 +217,11 @@ void gemm_half_q_half_cuda if (max_chunks) { - std::cout << "call gemm_half_q_half_cuda_part max_chunks" << "\n"; - gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights); } if (last_chunk_size) { - std::cout << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n"; gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights); } } diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu index b3e4bf68..ae08cc1f 100644 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu +++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu @@ -14,11 +14,6 @@ #define THREADS_X 32 #define THREADS_Y 32 -#include -#include -using namespace std; - - // Shuffle quantized data on load __global__ void shuffle_kernel @@ -92,7 +87,6 @@ QMatrix::QMatrix is_gptq = (_gptq_qzeros != NULL); - std::cout << "is_gptq in QMatrix init " << is_gptq << "\n"; if (is_gptq) { gptq_groupsize = 1; @@ -507,8 +501,6 @@ void QMatrix::reconstruct(half* out) } else { - std::cout << "reconstructing with reconstruct_gptq_kernel" << "\n"; - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4); reconstruct_gptq_kernel<<>> ( diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp index 17ed53ce..da7b06cd 100644 --- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp +++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp @@ -16,7 +16,7 @@ #include #include using namespace std; - +#include // Some decluttering macros #define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) @@ -60,11 +60,6 @@ uintptr_t make_q_matrix int groups; int height; - ofstream 
myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile << "in make_q_matrix" << "\n"; - myfile.flush(); - myfile.close(); - if (!q_scale.device().is_meta()) { TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8); @@ -82,11 +77,6 @@ uintptr_t make_q_matrix TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer") - ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile2 << "q_scale is meta" << q_scale.device().is_meta() << "\n"; - myfile2.flush(); - myfile2.close(); - QMatrix* m = new QMatrix ( device, @@ -119,14 +109,6 @@ void gemm_half_q_half bool force_cuda ) { - - //throw std::invalid_argument("a or b negative"); - - ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile << "start gemm_half_q_half" << "\n"; - myfile.flush(); - myfile.close(); - QMatrix* qm = reinterpret_cast (b); TORCH_CHECK_DTYPE(a, kHalf); @@ -137,8 +119,6 @@ void gemm_half_q_half const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - //myfile << "call gemm_half_q_half_cuda" << "\n"; - //myfile.flush(); gemm_half_q_half_cuda ( at::cuda::getCurrentCUDABlasHandle(), diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp deleted file mode 100644 index 45f87a7b..00000000 --- a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "hip/q_matrix.cuh" -#include "hip/q_gemm.cuh" - -#include "cpp/util.h" - -#include -#include -using namespace std; - -// Some decluttering macros - -#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") -#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") - - -// Quant matrix - -uintptr_t make_q_matrix -( - torch::Tensor q_weight, - torch::Tensor q_perm, - torch::Tensor q_invperm, - torch::Tensor q_scale, - torch::Tensor q_scale_max, - torch::Tensor q_groups, - torch::Tensor q_group_map, - torch::Tensor gptq_qzeros, - torch::Tensor gptq_scales, - torch::Tensor gptq_g_idx, - torch::Tensor temp_dq -) -{ - TORCH_CHECK_DTYPE(q_weight, kInt); - TORCH_CHECK_DTYPE_OPT(q_perm, kShort); - TORCH_CHECK_DTYPE_OPT(q_invperm, kShort); - TORCH_CHECK_DTYPE_OPT(q_scale, kInt); - TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf); - TORCH_CHECK_DTYPE_OPT(q_groups, kShort); - TORCH_CHECK_DTYPE_OPT(q_group_map, kShort); - TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt); - TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf); - TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt); - - TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1); - - int device = q_weight.device().index(); - int width = q_weight.size(1); - int groups; - int height; - - ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile << "in make_q_matrix" << "\n"; - myfile.flush(); - myfile.close(); - - if (!q_scale.device().is_meta()) - { - 
TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8); - TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1); - groups = q_scale.size(0); - height = q_invperm.size(0); - } - else - { - TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8); - TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1); - groups = gptq_qzeros.size(0); - height = q_weight.size(0) * 8; - } - - TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer") - - ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile2 << "q_scale is meta" << q_scale.device().is_meta() << "\n"; - myfile2.flush(); - myfile2.close(); - - QMatrix* m = new QMatrix - ( - device, - height, - width, - groups, - (uint32_t*) q_weight.data_ptr(), - q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(), - q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(), - q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(), - q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(), - q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(), - q_group_map.device().is_meta() ? NULL : (uint16_t*) q_group_map.data_ptr(), - gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(), - gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(), - gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(), - (half*) temp_dq.data_ptr() - ); - - if (m->failed) throw std::runtime_error("CUDA out of memory"); - - return reinterpret_cast (m); -} - -void gemm_half_q_half -( - torch::Tensor a, - uintptr_t b, - torch::Tensor c, - bool force_cuda -) -{ - - //throw std::invalid_argument("a or b negative"); - - ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app); - myfile << "start gemm_half_q_half" << "\n"; - myfile.flush(); - myfile.close(); - - QMatrix* qm = reinterpret_cast (b); - - TORCH_CHECK_DTYPE(a, kHalf); - TORCH_CHECK_DTYPE(c, kHalf); - TORCH_CHECK_SHAPES(a, 0, c, 0, 1); - TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes") - TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes") - - const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(a)); - - //myfile << "call gemm_half_q_half_cuda" << "\n"; - //myfile.flush(); - gemm_half_q_half_cuda - ( - at::cuda::getCurrentCUDABlasHandle(), - (const half*) a.data_ptr(), - qm, - (half*) c.data_ptr(), - c.size(0), // m - c.size(1), // n - a.size(1), // k - true, - NULL, - force_cuda - ); -} - -// Bindings - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("make_q_matrix", &make_q_matrix, "make_q_matrix"); - m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half"); -} diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 7d44af1b..a73dd68e 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -31,7 +31,6 @@ from text_generation_server.utils.dist import MEMORY_FRACTION tracer = trace.get_tracer(__name__) -from loguru import logger @dataclass class FlashCausalLMBatch(Batch): @@ -680,35 +679,28 @@ class FlashCausalLM(Model): return FlashCausalLMBatch def warmup(self, batch: FlashCausalLMBatch): - logger.info("in this warmup start") - torch.cuda.empty_cache() - logger.info("in this warmup after empty cache") - #try: - cache_manager = set_cache_manager( - batch.blocks, - self.num_layers, - self.num_kv_heads, - self.head_size, - self.sliding_window is not 
None,
-            self.dtype,
-            self.device,
-        )
-        logger.info("in this warmup after set_cache_manager")
-        _, batch, _ = self.generate_token(batch)
-        logger.info("in this warmup after generate_token")
-        # except torch.cuda.OutOfMemoryError as e:
-        #     raise RuntimeError(
-        #         f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
-        #         f"You need to decrease `--max-batch-prefill-tokens`"
-        #     ) from e
+        try:
+            cache_manager = set_cache_manager(
+                batch.blocks,
+                self.num_layers,
+                self.num_kv_heads,
+                self.head_size,
+                self.sliding_window is not None,
+                self.dtype,
+                self.device,
+            )
+            _, batch, _ = self.generate_token(batch)
+        except torch.cuda.OutOfMemoryError as e:
+            raise RuntimeError(
+                f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
+                f"You need to decrease `--max-batch-prefill-tokens`"
+            ) from e
 
         torch.cuda.synchronize(self.device)
-        logger.info("in this warmup after sync")
-
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
         dtype_size = torch.tensor([], dtype=self.dtype).element_size()
@@ -828,14 +820,11 @@ class FlashCausalLM(Model):
         batch.block_tables_tensor = block_tables_tensor
         batch.slots = slots
 
-        logger.info("callign forward in generate_token")
-        out = self.forward(batch)
-        # try:
-        #     out = self.forward(batch)
-        # except Exception as e:
-        #     del batch
-        #     raise e
-        logger.info("finished forward in generate_token")
+        try:
+            out = self.forward(batch)
+        except Exception as e:
+            del batch
+            raise e
 
         if isinstance(out, tuple):
             out, speculative_logits = out
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 5873bbef..d5adbd32 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -63,7 +63,6 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
 
     async def Warmup(self, request, context):
-        logger.info("IN WARMUP")
         if self.quantize == "gptq":
             try:
                 # When using GPTQ, Exllama kernels need some global kernels
@@ -79,7 +78,6 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             except ImportError:
                 pass
 
-        logger.info("after quantize == gptq")
         if (
             self.model.batch_type == IdeficsCausalLMBatch
         ):  # Hack, i would rather use kwargs in the `from_pb` call
@@ -94,10 +92,8 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             batch = self.model.batch_type.from_pb(
                 request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )
-            logger.info("calling model.warmup")
 
         max_supported_total_tokens = self.model.warmup(batch)
-        logger.info("end warmup")
 
         return generate_pb2.WarmupResponse(
             max_supported_total_tokens=max_supported_total_tokens
         )
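
Note for reviewers: the flash_causal_lm.py hunk above restores the guarded warmup path that the debug logging had replaced. Below is a minimal, self-contained sketch of that pattern for reference only; `run_prefill`, `device`, and `num_prefill_tokens` are illustrative stand-ins for the real TGI pieces (`set_cache_manager(...)` plus `self.generate_token(batch)`, `self.device`, and `len(batch.input_ids)`), not part of the diff.

    import torch

    def warmup_with_oom_guard(run_prefill, device, num_prefill_tokens):
        # Free cached blocks so the warmup prefill sees the real amount of free memory.
        torch.cuda.empty_cache()
        try:
            # Stand-in for allocating the KV cache and running one prefill pass.
            out = run_prefill()
        except torch.cuda.OutOfMemoryError as e:
            # Same translation as the restored hunk: turn a raw CUDA OOM into an
            # actionable error that points at --max-batch-prefill-tokens.
            raise RuntimeError(
                f"Not enough memory to handle {num_prefill_tokens} prefill tokens. "
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e
        torch.cuda.synchronize(device)
        return out

The generate_token hunk applies the same idea in miniature: wrap the forward pass in try/except, drop the batch on failure, and re-raise instead of logging.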