Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 04:14:52 +00:00)
more logs
This commit is contained in: parent 3c93b31959, commit 145c2d6d6e
exllamav2_kernels/cuda/compat_gemm.cuh (deleted)
@@ -1,38 +0,0 @@
-#ifndef _compat_gemm_cuh
-#define _compat_gemm_cuh
-
-#if defined(USE_ROCM)
-
-// For some reason this include is not present anywhere in exllama_v2 codebase, but it is required
-// for symbols as hipblasHalf.
-#include <hipblas/hipblas.h>
-
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int m,
-                                                               int n,
-                                                               int k,
-                                                               const half* alpha,
-                                                               const half* AP,
-                                                               int lda,
-                                                               const half* BP,
-                                                               int ldb,
-                                                               const half* beta,
-                                                               half* CP,
-                                                               int ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
-#define rocblas_operation_none HIPBLAS_OP_N
-#define rocblas_hgemm __compat_hipblasHgemm
-#endif
-
-#endif
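Note: the deleted shim above exists only to reinterpret_cast between CUDA's half and hipBLAS's opaque hipblasHalf. The setup.py change at the end of this commit adds -DHIPBLAS_USE_HIP_HALF on ROCm, under which hipblasHalf is hip's __half, so the call needs no casts. A minimal sketch of the direct call under that assumption (handle and device pointers assumed valid; fp16_gemm is a hypothetical helper, not part of the commit):

// Sketch only: FP16 GEMM without the __compat_hipblasHgemm shim,
// assuming -DHIPBLAS_USE_HIP_HALF so hipblasHalf == __half.
#include <hip/hip_fp16.h>
#include <hipblas/hipblas.h>

hipblasStatus_t fp16_gemm(hipblasHandle_t handle, int m, int n, int k,
                          const hipblasHalf* dA, const hipblasHalf* dB,
                          hipblasHalf* dC)
{
    const hipblasHalf alpha = __float2half(1.0f);
    const hipblasHalf beta  = __float2half(0.0f);
    // Column-major convention: C (m x n) = A (m x k) * B (k x n), no transposes.
    return hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k,
                        &alpha, dA, m, dB, k, &beta, dC, m);
}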
exllamav2_kernels/cuda/q_gemm.cu
@@ -22,12 +22,11 @@
 #include "q_gemm_kernel.cuh"
 #include "q_gemm_kernel_gptq.cuh"
-#include "compat_gemm.cuh"
 
 #include <iostream>
 #include <fstream>
 using namespace std;
+#include <stdio.h>
 
 void gemm_half_q_half_cuda_part
 (
     const half* a,
@@ -43,7 +42,7 @@ void gemm_half_q_half_cuda_part
     bool mul_r_weights
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     if (!b->is_gptq)
     {
         myfile << "go in is_gptq path" << "\n";
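Note: the ios::app change matters because gemm_half_q_half_cuda_part runs once per matmul; the old truncating open kept only the last call's lines. A minimal sketch of the pattern (log_line is a hypothetical helper, same log path as the commit):

// Sketch: append-mode logging so repeated calls accumulate instead of
// truncating. Without ios::app, each open() erases the previous trace.
#include <fstream>

void log_line(const char* msg)
{
    std::ofstream f("/tgi/server/exllamav2_kernels/log.txt", std::ios::app);
    f << msg << "\n";  // flushed and closed when f goes out of scope
}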
@@ -141,11 +140,14 @@ void gemm_half_q_half_cuda
     bool mul_r_weights
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
 
     if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
     {
-        myfile << "going in cublas path" << "\n";
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "going in cublas path" << "\n";
 
         // Reconstruct FP16 matrix, then cuBLAS
 
         if (!temp_dq) temp_dq = b->temp_dq;
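Note: the freopen pair rebinds this process's stdout/stderr to the controlling terminal, so the std::cout tracing survives whatever redirection the serving process set up. A hedged sketch of the same trick (assumes a controlling terminal actually exists at /dev/tty; rebind_to_tty is a hypothetical helper):

// Sketch: route subsequent printf/std::cout output back to the terminal.
// freopen returns NULL when there is no controlling terminal (e.g. a
// detached container), so a real implementation should check for that.
#include <cstdio>

void rebind_to_tty()
{
    if (!freopen("/dev/tty", "w", stdout)) perror("freopen stdout");
    if (!freopen("/dev/tty", "w", stderr)) perror("freopen stderr");
}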
@@ -155,6 +157,34 @@ void gemm_half_q_half_cuda
 
         const half alpha = __float2half(1.0f);
         const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
 
+        freopen("/dev/tty", "w", stdout);
+        freopen("/dev/tty", "w", stderr);
+
+        std::cout << "hey it's me\n" << std::flush;
+
+        //half* val = temp_dq + (size_n * size_k - 1) * sizeof(half);
+        //half* val = temp_dq + 1;
+
+        half my_val_host;  // host copy of the first reconstructed element
+
+        cudaError_t error = cudaGetLastError();
+        if (error != cudaSuccess)
+            printf("Error before: %s\n", cudaGetErrorString(error));
+
+        cudaError_t err = cudaMemcpy(&my_val_host, temp_dq, sizeof(half), cudaMemcpyDeviceToHost);
+        if (err != cudaSuccess)
+            printf("Error in cudaMemcpy: %s\n", cudaGetErrorString(err));
+
+        // float my_val_float = __half2float(my_val_host);
+        // std::cout << "temp_dq: " << my_val_float << "\n" << std::flush;
+
+        std::cout << "call cublasHgemm" << "\n";
+        std::cout << "call cublasHgemm size_n " << size_n << "\n";
+        std::cout << "call cublasHgemm size_m " << size_m << "\n";
+        std::cout << "call cublasHgemm size_k " << size_k << "\n";
+        std::cout << "call cublasHgemm b width " << b->width << "\n";
+        std::cout << "call cublasHgemm b height " << b->height << "\n" << std::flush;
         cublasHgemm(cublas_handle,
                     CUBLAS_OP_N,
                     CUBLAS_OP_N,
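Note: the cudaMemcpy probe above is the safe way to inspect a device buffer from host code; the commented-out *temp_dq dereference would fault, since temp_dq is a device pointer. A standalone sketch of the same pattern (probe_first_element is a hypothetical helper):

// Sketch: copy one half from device to host for inspection. A synchronous
// cudaMemcpy on the default stream also waits for prior kernels, so earlier
// launch errors surface here or via cudaGetLastError().
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

void probe_first_element(const half* d_buf)
{
    half h;
    cudaError_t err = cudaMemcpy(&h, d_buf, sizeof(half), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        printf("probe failed: %s\n", cudaGetErrorString(err));
    else
        printf("d_buf[0] = %f\n", __half2float(h));
}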
@@ -187,8 +217,8 @@ void gemm_half_q_half_cuda
     {
         // Quantized matmul
 
-        myfile << "going in gemm_half_q_half_cuda_part path" << "\n";
+        std::cout << "going in gemm_half_q_half_cuda_part path" << "\n";
 
         int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
         int max_chunks = size_m / block_m_size_max;
@@ -197,17 +227,17 @@ void gemm_half_q_half_cuda
 
         if (max_chunks)
         {
-            myfile << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part max_chunks" << "\n";
 
             gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
         }
 
         if (last_chunk_size)
         {
-            myfile << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
+            std::cout << "call gemm_half_q_half_cuda_part last_chunk_size" << "\n";
             gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
         }
     }
-    myfile.close();
 }
 
 __global__ void clear_kernel
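Note: the chunking above walks size_m rows in blocks of block_m_size_max; last_chunk and last_chunk_size are computed just before this hunk (not shown in the diff, so the expressions below are assumed to match upstream exllamav2). A sketch of the arithmetic with illustrative numbers:

// Sketch of the row-chunking arithmetic (values are illustrative).
int size_m = 100;
int block_m_size_max = 32;                        // e.g. an EXL2/GPTQ block max
int max_chunks = size_m / block_m_size_max;       // 3 full chunks
int last_chunk = max_chunks * block_m_size_max;   // rows 0..95 handled in full chunks
int last_chunk_size = size_m - last_chunk;        // 4 trailing rows
// -> one _cuda_part call covers the 96 full rows, one covers the 4-row remainder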
exllamav2_kernels/cuda/q_matrix.cu
@@ -14,6 +14,11 @@
 #define THREADS_X 32
 #define THREADS_Y 32
 
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+
 // Shuffle quantized data on load
 
 __global__ void shuffle_kernel
@@ -87,6 +92,7 @@ QMatrix::QMatrix
 
     is_gptq = (_gptq_qzeros != NULL);
 
+    std::cout << "is_gptq in QMatrix init " << is_gptq << "\n";
     if (is_gptq)
     {
         gptq_groupsize = 1;
@@ -501,6 +507,8 @@ void QMatrix::reconstruct(half* out)
     }
     else
     {
+        std::cout << "reconstructing with reconstruct_gptq_kernel" << "\n";
+
         gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4);
         reconstruct_gptq_kernel<<<gridDim, blockDim>>>
         (
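Note: DIVIDE here is ceiling division used to size the launch grid; a sketch of the assumed definition (paraphrasing the exllamav2 utility header, not quoted from this commit):

// Ceiling division for grid sizing (sketch of the assumed definition).
#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
// e.g. width = 4096, BLOCK_KN_SIZE * 4 = 512  ->  gridDim.x = 8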
exllamav2_kernels/ext.cpp
@@ -60,6 +60,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -77,6 +82,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -109,9 +119,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt");
+    //throw std::invalid_argument("a or b negative");
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
     myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -123,8 +137,8 @@ void gemm_half_q_half
 
     const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
-    myfile.flush();
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +152,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
ext.cpp (ROCm/hipified variant of the bindings)
@@ -61,6 +61,11 @@ uintptr_t make_q_matrix
     int groups;
     int height;
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile << "in make_q_matrix" << "\n";
+    myfile.flush();
+    myfile.close();
+
     if (!q_scale.device().is_meta())
     {
         TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
@@ -78,6 +83,11 @@ uintptr_t make_q_matrix
 
     TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
 
+    ofstream myfile2("/tgi/server/exllamav2_kernels/log.txt", ios::app);
+    myfile2 << "q_scale is meta " << q_scale.device().is_meta() << "\n";
+    myfile2.flush();
+    myfile2.close();
+
     QMatrix* m = new QMatrix
     (
         device,
@@ -110,9 +120,13 @@ void gemm_half_q_half
     bool force_cuda
 )
 {
-    ofstream myfile;
-    myfile.open ("/tgi/server/exllamav2_kernels/log.txt");
+    //throw std::invalid_argument("a or b negative");
 
+    ofstream myfile("/tgi/server/exllamav2_kernels/log.txt", ios::app);
     myfile << "start gemm_half_q_half" << "\n";
+    myfile.flush();
+    myfile.close();
 
     QMatrix* qm = reinterpret_cast<QMatrix*> (b);
 
@@ -124,7 +138,8 @@ void gemm_half_q_half
 
     const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(a));
 
-    myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile << "call gemm_half_q_half_cuda" << "\n";
+    //myfile.flush();
     gemm_half_q_half_cuda
     (
         at::cuda::getCurrentCUDABlasHandle(),
@@ -138,7 +153,6 @@ void gemm_half_q_half
         NULL,
         force_cuda
     );
-    myfile.close();
 }
 
 // Bindings
exllamav2_kernels/setup.py
@@ -1,5 +1,15 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+extra_cuda_cflags = ["-lineinfo", "-O3"]
+
+if torch.version.hip:
+    extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF"]
+
+extra_compile_args = {
+    "nvcc": extra_cuda_cflags,
+}
 
 setup(
     name="exllamav2_kernels",
@@ -11,6 +21,7 @@ setup(
             "exllamav2_kernels/cuda/q_matrix.cu",
             "exllamav2_kernels/cuda/q_gemm.cu",
         ],
+        extra_compile_args=extra_compile_args,
     )
     ],
     cmdclass={"build_ext": BuildExtension},
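Note: this -DHIPBLAS_USE_HIP_HALF flag is what makes the compat_gemm.cuh deletion at the top of the commit safe: hipblas.h selects the hipblasHalf type based on it. A paraphrased sketch of the header's behavior (assumed; exact guards may differ across hipBLAS versions):

// Paraphrased from hipBLAS headers (sketch; version-dependent).
#ifdef HIPBLAS_USE_HIP_HALF
typedef __half hipblasHalf;    // same type as the kernels' half: no casts needed
#else
typedef uint16_t hipblasHalf;  // opaque 16-bit type: callers must reinterpret_cast
#endif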