Putting the deprecation notice on bnb (8bit).

Nicolas Patry 2023-09-27 08:39:40 +00:00
parent 2d13b6ff6c
commit 085c43243d
4 changed files with 31 additions and 3829 deletions

Cargo.lock (generated): 3820 changes; file diff suppressed because it is too large.

launcher/src/main.rs

@@ -21,12 +21,29 @@ mod env_runtime;
 
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
-    Bitsandbytes,
-    BitsandbytesNF4,
-    BitsandbytesFP4,
-    Gptq,
+    /// 4 bit quantization. Requires a specific AWQ quantized model:
+    /// https://hf.co/models?search=awq.
+    /// Should replace GPTQ models wherever possible because of the better latency
     Awq,
+    /// 8 bit quantization, doesn't require a specific model.
+    /// Should be a drop-in replacement to bitsandbytes with much better performance.
+    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
     Eetq,
+    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
+    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
+    /// the triton kernel (wider support) when it's not.
+    /// AWQ has faster kernels.
+    Gptq,
+    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
+    /// but it is known that the model will be much slower to run than the native f16.
+    #[deprecated(since="1.1.0", note="Use `eetq` instead, which provides better latencies overall and is drop-in in most cases")]
+    Bitsandbytes,
+    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
+    /// but it is known that the model will be much slower to run than the native f16.
+    BitsandbytesNF4,
+    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
+    /// perplexity performance for your model.
+    BitsandbytesFP4,
 }
 
 impl std::fmt::Display for Quantization {
@@ -47,6 +64,7 @@ impl std::fmt::Display for Quantization {
             }
             Quantization::Awq => {
                 write!(f, "awq")
+            }
             Quantization::Eetq => {
                 write!(f, "eetq")
             }
@@ -130,9 +148,7 @@ struct Args {
     #[clap(long, env)]
     num_shard: Option<usize>,
 
-    /// Whether you want the model to be quantized. This will use `bitsandbytes` for
-    /// quantization on the fly, or `gptq`. 4bit quantization is available through
-    /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
+    /// Whether you want the model to be quantized.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
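
The launcher only validates the `--quantize` flag; the chosen value is forwarded as a plain string and interpreted by the Python server when it builds the model. Purely as an illustration (plain argparse standing in for the Rust/clap launcher; the parser and helper below are invented for this sketch, not taken from the diff), a deprecated choice can stay accepted while still pointing users at the replacement:

import argparse

# String values assumed here follow clap's kebab-case rendering of the enum variants.
QUANTIZE_CHOICES = ["awq", "eetq", "gptq", "bitsandbytes", "bitsandbytes-nf4", "bitsandbytes-fp4"]

def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="toy stand-in for the launcher CLI")
    parser.add_argument("--quantize", choices=QUANTIZE_CHOICES, default=None)
    args = parser.parse_args(argv)
    if args.quantize == "bitsandbytes":
        # Same message the Rust #[deprecated] note carries: eetq is the drop-in replacement.
        print("warning: `bitsandbytes` (8bit) is deprecated, use `eetq` instead")
    return args

if __name__ == "__main__":
    print(parse_args(["--quantize", "bitsandbytes"]))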

server/.gitignore (vendored): 1 change

@@ -160,3 +160,4 @@ flash-attention/
 flash-attention-v2/
 vllm/
 llm-awq/
+eetq/

server/text_generation_server/utils/layers.py

@@ -5,6 +5,8 @@ import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 from typing import List
+from loguru import logger
+from functools import lru_cache
 
 HAS_BITS_AND_BYTES = True
 try:
@@ -242,6 +244,10 @@ class Linear4bit(nn.Module):
         return out
 
 
+@lru_cache(1)
+def warn_deprecate_bnb():
+    logger.warning("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance")
+
 
 def get_linear(weight, bias, quantize):
     if quantize is None:
         linear = FastLinear(weight, bias)
@@ -251,8 +257,7 @@ def get_linear(weight, bias, quantize):
         else:
             raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
     elif quantize == "bitsandbytes":
-        import warnings
-        warnings.warn("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance", DeprecationWarning)
+        warn_deprecate_bnb()
         linear = Linear8bitLt(
             weight,
             bias,
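
The warnings.warn call being removed used DeprecationWarning, which Python's default warning filters hide when it is raised from library code, and get_linear runs once per layer, so an unconditional log line would repeat the notice many times. Below is a minimal, self-contained sketch of the warn-once pattern the diff adopts instead; the standard logging module stands in for loguru, and build_linear is a toy stand-in for the server's get_linear:

import logging
from functools import lru_cache

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("quantize")

@lru_cache(1)
def warn_deprecate_bnb():
    # lru_cache memoizes the (None) result of the first call, so the message
    # is logged once per process no matter how many layers are built.
    logger.warning("Bitsandbytes 8bit is deprecated, `eetq` is a drop-in replacement with much better performance")

def build_linear(quantize):
    if quantize == "bitsandbytes":
        warn_deprecate_bnb()
    return object()  # placeholder for the real quantized linear layer

for _ in range(10):  # e.g. one call per transformer layer
    build_linear("bitsandbytes")  # the warning appears a single time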