mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-05-01 06:52:11 +00:00
# What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> This PR adds the possibility to run AWQ models with Exllama/GPTQ kernels, specifically for ROCm devices that support Exllama kernels but not AWQ's GEMM. This is done by : - un-packing, reordering and re-packing AWQ weights when `--quantize gptq` but the model's `quant_method=awq`. - avoiding overflows when adding 1 to zeros in exllama and triton. Ref: https://github.com/casper-hansen/AutoAWQ/pull/313 ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
217 lines
5.4 KiB
Plaintext
217 lines
5.4 KiB
Plaintext
// Adapted from turboderp exllama: https://github.com/turboderp/exllama
|
|
|
|
#include "q4_matrix.cuh"
|
|
#include <vector>
|
|
#include "../util.cuh"
|
|
#include "../matrix.cuh"
|
|
|
|
using namespace std;
|
|
|
|
const int UNSHUF_BLOCKSIZE_X = 64;
|
|
|
|
const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column
|
|
const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows
|
|
|
|
vector<Q4Matrix*> g_q4_matrices;
|
|
|
|
void g_q4_keep_matrix(Q4Matrix* m)
|
|
{
|
|
g_q4_matrices.push_back(m);
|
|
}
|
|
|
|
void g_q4_free_matrices()
|
|
{
|
|
for (const auto& m : g_q4_matrices) delete m;
|
|
g_q4_matrices.clear();
|
|
}
|
|
|
|
Q4Matrix::Q4Matrix
|
|
(
|
|
const int _height,
|
|
const int _width,
|
|
const int _groups,
|
|
|
|
uint32_t* _qweight,
|
|
uint32_t* _qzeros,
|
|
half* _scales,
|
|
uint32_t* _g_idx,
|
|
|
|
const int _device
|
|
) :
|
|
height(_height),
|
|
width(_width),
|
|
groups(_groups),
|
|
device(_device)
|
|
{
|
|
cudaSetDevice(device);
|
|
|
|
cuda_qweight = _qweight;
|
|
cuda_qzeros = _qzeros;
|
|
cuda_scales = _scales;
|
|
|
|
groupsize = height / groups;
|
|
|
|
if (_g_idx) make_sequential(_g_idx);
|
|
}
|
|
|
|
Q4Matrix::~Q4Matrix()
|
|
{
|
|
}
|
|
|
|
// Make sequential
|
|
|
|
__global__ void make_sequential_kernel
|
|
(
|
|
const uint32_t* __restrict__ w,
|
|
uint32_t* __restrict__ w_new,
|
|
const uint32_t* __restrict__ x_map,
|
|
const int w_height,
|
|
const int w_width
|
|
)
|
|
{
|
|
const uint64_t* w2 = (uint64_t*) w;
|
|
uint64_t* w_new2 = (uint64_t*) w_new;
|
|
int w2_stride = w_width >> 1;
|
|
|
|
int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
|
|
int w_new2_row = blockIdx.y;
|
|
|
|
int x_map_idx = w_new2_row << 3;
|
|
|
|
uint64_t dst = 0;
|
|
|
|
#pragma unroll
|
|
for (int i = 0; i < 8; i++)
|
|
{
|
|
int source_row = x_map[x_map_idx++];
|
|
|
|
int w2_row = source_row >> 3;
|
|
int w2_subrow = source_row & 0x07;
|
|
int w2_row_shift = w2_subrow << 2;
|
|
int wnew2_row_shift = i << 2;
|
|
|
|
uint64_t src = w2[w2_row * w2_stride + w2_column];
|
|
src >>= w2_row_shift;
|
|
src &= 0x0000000f0000000f;
|
|
src <<= wnew2_row_shift;
|
|
dst |= src;
|
|
}
|
|
|
|
w_new2[w_new2_row * w2_stride + w2_column] = dst;
|
|
}
|
|
|
|
void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
|
|
{
|
|
uint32_t* cuda_new_qweight = NULL;
|
|
cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
|
|
cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch
|
|
|
|
uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
|
|
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
|
|
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
|
|
|
|
// Group histogram
|
|
|
|
for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
|
|
|
|
// Group map
|
|
|
|
for (int i = 0, acc = 0; i < groups; i++)
|
|
{
|
|
short tmp = cpu_g_idx_map[i];
|
|
cpu_g_idx_map[i] = acc;
|
|
acc += tmp;
|
|
}
|
|
|
|
// X map (inverse)
|
|
|
|
for (int row = 0; row < height; row++)
|
|
{
|
|
uint32_t target_group = cpu_g_idx[row];
|
|
uint32_t target_row = cpu_g_idx_map[target_group];
|
|
cpu_g_idx_map[target_group]++;
|
|
cpu_x_map_inv[row] = target_row;
|
|
}
|
|
|
|
// X map
|
|
|
|
for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
|
|
|
|
// Move to CUDA
|
|
|
|
cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
|
|
|
|
// Rearrange rows in w
|
|
|
|
dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
|
|
dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
|
|
|
|
make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
|
|
|
|
// Replace qweights
|
|
|
|
cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
|
|
|
|
// Cleanup
|
|
|
|
cudaDeviceSynchronize();
|
|
cudaFree(cuda_new_qweight);
|
|
free(cpu_g_idx_map);
|
|
free(cpu_x_map);
|
|
free(cpu_x_map_inv);
|
|
}
|
|
|
|
__global__ void reconstruct_kernel
|
|
(
|
|
const uint32_t* __restrict__ w,
|
|
half* __restrict__ out, // (y)
|
|
const half* __restrict__ w_scales,
|
|
const uint32_t* __restrict__ w_zeros,
|
|
const int height,
|
|
const int width,
|
|
const int groupsize
|
|
)
|
|
{
|
|
// Start of block
|
|
|
|
int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
|
|
int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
|
|
|
|
// Views
|
|
|
|
MatrixView_q4_column w_(w, height, width);
|
|
MatrixView_half_rw out_(out, height, width);
|
|
MatrixView_half w_scales_(w_scales, height / groupsize, width);
|
|
MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
|
|
|
|
// Groupsize version
|
|
|
|
int group = row / groupsize;
|
|
|
|
half w_scale = w_scales_.item(group, column);
|
|
uint32_t w_zero = (w_zeros_.item(group, column) + 1) & 0x0F;
|
|
|
|
uint32_t w_read = w_.item_uint32_t(row, column);
|
|
half* out_ptr = out_.item_ptr(row, column);
|
|
|
|
#pragma unroll
|
|
for (int s = 0; s < 32; s += 4)
|
|
{
|
|
half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
|
|
*out_ptr = w_item; out_ptr += out_.width;
|
|
}
|
|
}
|
|
|
|
void Q4Matrix::reconstruct(half* out)
|
|
{
|
|
dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
|
|
|
|
dim3 blocks
|
|
(
|
|
(width + threads.x - 1) / threads.x,
|
|
(height / 8 + threads.y - 1) / threads.y,
|
|
1
|
|
);
|
|
|
|
reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
|
|
} |