text-generation-inference/server/Makefile-marlin
Daniël de Kok 4594e6faba Add support for Marlin-quantized models
This change adds support for Marlin-quantized models. Marlin is an
FP16xINT4 matmul kernel, which provides good speedups when decoding
batches of 16-32 tokens. It supports models quantized to 4 bits with
symmetric quantization and a group size of -1 or 128.

Tested with:

- Llama 2
- Llama 3
- Phi 3
2024-06-06 13:16:52 +02:00
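
A minimal sketch of the compatibility check implied by the constraints above, assuming a simple configuration object; the names below are illustrative, not TGI's or Marlin's actual API:

from dataclasses import dataclass

@dataclass
class QuantizerConfig:
    bits: int        # weight precision in bits
    sym: bool        # symmetric quantization (no zero points)
    group_size: int  # -1 means a single scale per column (no grouping)

def is_marlin_compatible(cfg: QuantizerConfig) -> bool:
    # Marlin supports 4-bit, symmetric quantization with
    # group size -1 or 128, per the description above.
    return cfg.bits == 4 and cfg.sym and cfg.group_size in (-1, 128)

print(is_marlin_compatible(QuantizerConfig(4, True, 128)))  # True
print(is_marlin_compatible(QuantizerConfig(4, False, 64)))  # False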

marlin_commit := 2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c

marlin:
	# Clone marlin
	pip install packaging
	git clone https://github.com/IST-DASLab/marlin.git marlin

build-marlin: marlin
	cd marlin && git fetch && git checkout $(marlin_commit)
	cd marlin && python setup.py build

install-marlin: build-marlin
	cd marlin && python setup.py install
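
With these targets, running make install-marlin (presumably from the server directory where this Makefile lives) resolves the prerequisite chain install-marlin -> build-marlin -> marlin: the sources are cloned, checked out at the pinned marlin_commit, built, and installed. Pinning the commit keeps the kernel build reproducible.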