diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
index 589ff72c..d73ce292 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -43,11 +43,11 @@ void gemm_half_q_half_cuda_part
     bool mul_r_weights
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
     if (!b->is_gptq)
     {
         myfile << "go in is_gptq path" << "\n";
+        myfile.flush();
         dim3 blockDim, gridDim;
         blockDim.x = EXL2_BLOCK_KN_SIZE;
         blockDim.y = 1;
@@ -59,6 +59,7 @@ void gemm_half_q_half_cuda_part
         fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights);
 
         myfile << "launch kernel" << "\n";
+        myfile.flush();
         kernel<<<gridDim, blockDim>>>
         (
             a,
@@ -119,6 +120,7 @@ void gemm_half_q_half_cuda_part
             r_weights_stride
         );
     }
+    myfile.flush();
     myfile.close();
 }
 
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
index a78ce63f..3a4a4e14 100644
--- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -109,9 +109,9 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
     myfile << "start gemm_half_q_half" << "\n";
+    myfile.flush();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -124,6 +124,7 @@ void gemm_half_q_half
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
 
     myfile << "call gemm_half_q_half_cuda" << "\n";
+    myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),