diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
deleted file mode 100644
index 19b1e4a6..00000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _compat_gemm_cuh
-#define _compat_gemm_cuh
-
-#if defined(USE_ROCM)
-
-// For some reason this include is not present anywhere in exllama_v2 codebase, but it is required
-// for symbols as hipblasHalf.
-#include <hipblas/hipblas.h>
-
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int                m,
-                                                               int                n,
-                                                               int                k,
-                                                               const half*        alpha,
-                                                               const half*        AP,
-                                                               int                lda,
-                                                               const half*        BP,
-                                                               int                ldb,
-                                                               const half*        beta,
-                                                               half*              CP,
-                                                               int                ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
-#define rocblas_operation_none HIPBLAS_OP_N
-#define rocblas_hgemm __compat_hipblasHgemm
-#endif
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
index d73ce292..746a2a90 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -22,12 +22,11 @@
 #include "q_gemm_kernel.cuh"
 #include "q_gemm_kernel_gptq.cuh"
-#include "compat_gemm.cuh"
 
 #include <fstream>
 #include <iostream>
 
 using namespace std;
-
+#include <cstdio>
 void gemm_half_q_half_cuda_part
 (
     const half* a,
@@ -43,7 +42,7 @@ void gemm_half_q_half_cuda_part
     bool mul_r_weights
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     if (!b->is_gptq)
     {
         myfile << "go in is_gptq path" << "\n";
@@ -141,11 +140,14 @@ void gemm_half_q_half_cuda
     bool mul_r_weights
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
 
     if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
     {
-        myfile << "going in cublas path" << "\n";
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "going in cublas path" << "\n";
+
         // Reconstruct FP16 matrix, then cuBLAS
         if (!temp_dq) temp_dq = b->temp_dq;
@@ -155,6 +157,34 @@ void gemm_half_q_half_cuda
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
+
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "hey it's me\n" << std::flush;
+
+        //half* val = temp_dq + (size_n * size_k - 1) * sizeof(half);
+        //half* val = temp_dq + 1;
+
+        half my_val_host;
+
+        cudaError_t error = cudaGetLastError();
+        if (error != cudaSuccess)
+            printf("Error before: %s\n", cudaGetErrorString(error));
+
+        cudaError_t err = cudaMemcpy(&my_val_host, temp_dq, sizeof(half), cudaMemcpyDeviceToHost);
+        if (err != cudaSuccess)
+            printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(err));
+
+        // float my_val_float = __half2float(my_val_host);
+        //std::cout << "temp_dq: " << my_val_float << "\n" << std::flush;
+
+        std::cout << "call cublasHgemm" << "\n";
+        std::cout << "call cublasHgemm size_n " << size_n << "\n";
+        std::cout << "call cublasHgemm size_m " << size_m << "\n";
+        std::cout << "call cublasHgemm size_k " << size_k << "\n";
+        std::cout << "call cublasHgemm b width " << b->width << "\n";
+        std::cout << "call cublasHgemm b height " << b->height << "\n" << std::flush;
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
@@ -187,8 +217,8 @@
     {
         // Quantized matmul
 
-        myfile << "going in gemm_half_q_half_cuda_part path" << "\n";
+        std::cout << "going in gemm_half_q_half_cuda_part path" << "\n";
 
         int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
         int max_chunks = size_m / block_m_size_max;
@@ -197,17 +227,17 @@
 
         if (max_chunks)
         {
-            myfile << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
+
             gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
         }
 
         if (last_chunk_size)
        {
-            myfile << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
            gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
        }
    }
-    myfile.close();
 }
 
 __global__ void clear_kernel
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
index ae08cc1f..b3e4bf68 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
@@ -14,6 +14,11 @@
 #define THREADS_X 32
 #define THREADS_Y 32
 
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+
 // Shuffle quantized data on load
 
 __global__ void shuffle_kernel
@@ -87,6 +92,7 @@ QMatrix::QMatrix
 
     is_gptq = (_gptq_qzeros != NULL);
 
+    std::cout << "is_gptq in QMatrix init " << is_gptq << "\n";
     if (is_gptq)
     {
         gptq_groupsize = 1;
@@ -501,6 +507,8 @@ void QMatrix::reconstruct(half* out)
     }
     else
     {
+        std::cout << "reconstructing with reconstruct_gptq_kernel" << "\n";
+
         gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4);
         reconstruct_gptq_kernel<<<gridDim, blockDim>>>
         (
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
index 3a4a4e14..17ed53ce 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -60,6 +60,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -77,6 +82,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -109,9 +119,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+
+    //throw std::invalid_argument("a or b negative");
+
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
     myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -123,8 +137,8 @@ void gemm_half_q_half
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
-    myfile.flush();
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +152,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
index 3a23fc18..45f87a7b 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext_hip.cpp
@@ -61,6 +61,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -78,6 +83,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -110,9 +120,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+
+    //throw std::invalid_argument("a or b negative");
+
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
+    myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -124,7 +138,8 @@ void gemm_half_q_half
 
     const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +153,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
diff --git a/server/exllamav2_kernels/setup.py b/server/exllamav2_kernels/setup.py
index 518db1df..4a16b546 100644
--- a/server/exllamav2_kernels/setup.py
+++ b/server/exllamav2_kernels/setup.py
@@ -1,5 +1,15 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+extra_cuda_cflags = ["-lineinfo", "-O3"]
+
+if torch.version.hip:
+    extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF"]
+
+extra_compile_args = {
+    "nvcc": extra_cuda_cflags,
+}
 
 setup(
     name="exllamav2_kernels",
@@ -11,6 +21,7 @@ setup(
                 "exllamav2_kernels/cuda/q_matrix.cu",
                 "exllamav2_kernels/cuda/q_gemm.cu",
             ],
+            extra_compile_args=extra_compile_args,
         )
     ],
     cmdclass={"build_ext": BuildExtension},
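
Note on the debugging pattern used in q_gemm.cu above: the patch reattaches the C-level stdout/stderr to the terminal with freopen("/dev/tty", ...) so that std::cout from inside the Python extension becomes visible, and copies a single element of a device buffer back to the host to sanity-check the pointer before calling cublasHgemm. A minimal standalone sketch of that pattern follows; it is only an illustration, not part of the patch (the file and function names are made up), assuming a CUDA toolchain with cuda_runtime.h and cuda_fp16.h available.

    // debug_peek.cu -- illustrative sketch, not part of the patch above.
    #include <cstdio>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    void debug_peek(const half* device_buf)
    {
        // Python (or the launcher) may have redirected stdout/stderr; reopening
        // them on /dev/tty makes prints from inside the extension show up on the
        // terminal that started the process.
        freopen("/dev/tty", "w", stdout);
        freopen("/dev/tty", "w", stderr);

        // Copy one element back to the host to confirm the device pointer is
        // readable, reporting the CUDA error here instead of failing later in cuBLAS.
        half host_val;
        cudaError_t err = cudaMemcpy(&host_val, device_buf, sizeof(half), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
        {
            std::printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));
            return;
        }
        std::cout << "first element: " << __half2float(host_val) << "\n" << std::flush;
    }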