Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 20:04:52 +00:00)

Commit 085c43243d (parent 2d13b6ff6c): Putting the deprecation notice on bnb (8bit).
Cargo.lock (generated, 3820 changed lines)
File diff suppressed because it is too large.
launcher/src/main.rs

@@ -21,12 +21,29 @@ mod env_runtime;

 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
-    Bitsandbytes,
-    BitsandbytesNF4,
-    BitsandbytesFP4,
-    Gptq,
+    /// 4 bit quantization. Requires a specific AWQ quantized model:
+    ///   https://hf.co/models?search=awq.
+    /// Should replace GPTQ models wherever possible because of the better latency
+    Awq,
+    /// 8 bit quantization, doesn't require a specific model.
+    /// Should be a drop-in replacement to bitsandbytes with much better performance.
+    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
+    Eetq,
+    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
+    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
+    /// triton kernels (wider support) when it's not.
+    /// AWQ has faster kernels.
+    Gptq,
+    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
+    /// but it is known that the model will be much slower to run than the native f16.
+    #[deprecated(since = "1.1.0", note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases")]
+    Bitsandbytes,
+    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
+    /// but it is known that the model will be much slower to run than the native f16.
+    BitsandbytesNF4,
+    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
+    /// perplexity performance for your model
+    BitsandbytesFP4,
 }

 impl std::fmt::Display for Quantization {
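What makes the notice bite is the standard `#[deprecated]` attribute on the `Bitsandbytes` variant: rustc now warns (and shows the `note` text) at every remaining place the variant is constructed or matched, and call sites that intentionally keep supporting it silence the lint with `#[allow(deprecated)]`. A minimal standalone sketch of that mechanism, not the launcher's actual code:

    // Sketch only: a trimmed-down enum, not the launcher's real one.
    enum Quantization {
        Eetq,
        #[deprecated(since = "1.1.0", note = "Use `eetq` instead")]
        Bitsandbytes,
    }

    // Call sites that still have to handle the old variant opt out of the lint.
    #[allow(deprecated)]
    fn describe(q: Quantization) -> &'static str {
        match q {
            Quantization::Eetq => "eetq",
            Quantization::Bitsandbytes => "bitsandbytes (deprecated)",
        }
    }

    // Without the allow, rustc reports e.g.:
    //   warning: use of deprecated unit variant `Quantization::Bitsandbytes`: Use `eetq` instead
    #[allow(deprecated)]
    fn main() {
        for q in [Quantization::Eetq, Quantization::Bitsandbytes] {
            println!("{}", describe(q));
        }
    }

This is why the launcher can keep accepting `--quantize bitsandbytes` for now while steering users toward `eetq`.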
@@ -47,6 +64,7 @@ impl std::fmt::Display for Quantization {
             }
             Quantization::Awq => {
                 write!(f, "awq")
             }
             Quantization::Eetq => {
                 write!(f, "eetq")
             }
@@ -130,9 +148,7 @@ struct Args {
     #[clap(long, env)]
     num_shard: Option<usize>,

-    /// Whether you want the model to be quantized. This will use `bitsandbytes` for
-    /// quantization on the fly, or `gptq`. 4bit quantization is available through
-    /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
+    /// Whether you want the model to be quantized.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
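Because `Quantization` derives clap's `ValueEnum`, the doc comments above feed the per-value `--help` text and the variant names become the accepted kebab-case values (`awq`, `eetq`, `gptq`, `bitsandbytes`, `bitsandbytes-nf4`, `bitsandbytes-fp4`). A cut-down sketch of how the flag is wired, assuming clap 4 with the `derive` feature enabled rather than the repo's full `Args` struct:

    use clap::{Parser, ValueEnum};

    /// Illustrative subset of the launcher's enum.
    #[derive(Clone, Copy, Debug, ValueEnum)]
    enum Quantization {
        /// 8 bit quantization, doesn't require a specific model.
        Eetq,
        /// Bitsandbytes 8bit, kept for compatibility but deprecated in favour of `eetq`.
        Bitsandbytes,
    }

    #[derive(Parser, Debug)]
    struct Args {
        /// Whether you want the model to be quantized.
        #[clap(long, value_enum)]
        quantize: Option<Quantization>,
    }

    fn main() {
        // `--quantize eetq` parses to Some(Eetq); omitting the flag yields None.
        let args = Args::parse();
        println!("quantize = {:?}", args.quantize);
    }

Launching with `--quantize bitsandbytes` therefore still parses fine; the deprecation only surfaces as the compiler warning above and the runtime warning added on the Python side below.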
server/.gitignore (vendored, 1 changed line)

@@ -160,3 +160,4 @@ flash-attention/
 flash-attention-v2/
 vllm/
 llm-awq/
+eetq/
server/text_generation_server/utils/layers.py

@@ -5,6 +5,8 @@ import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 from typing import List
+from loguru import logger
+from functools import lru_cache

 HAS_BITS_AND_BYTES = True
 try:
@@ -242,6 +244,10 @@ class Linear4bit(nn.Module):
         return out


+@lru_cache(1)
+def warn_deprecate_bnb():
+    logger.warning("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance")
+
 def get_linear(weight, bias, quantize):
     if quantize is None:
         linear = FastLinear(weight, bias)
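`warn_deprecate_bnb` is memoized with `@lru_cache(1)` so the message is logged exactly once per process, even though `get_linear` runs for every linear layer in the model. For comparison, the same warn-once guard written in Rust (the launcher's language) could lean on `std::sync::Once`; this is an illustrative sketch, not code from the repo:

    use std::sync::Once;

    static WARN_BNB: Once = Once::new();

    // Emit the deprecation warning at most once per process, no matter how
    // many layers hit the bitsandbytes path (mirrors the `@lru_cache(1)` guard).
    fn warn_deprecate_bnb() {
        WARN_BNB.call_once(|| {
            eprintln!("Bitsandbytes 8bit is deprecated, `eetq` is a drop-in replacement with much better performance");
        });
    }

    fn main() {
        // Simulate get_linear() being called for three layers: one warning total.
        for _ in 0..3 {
            warn_deprecate_bnb();
        }
    }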
@@ -251,8 +257,7 @@ def get_linear(weight, bias, quantize):
         else:
             raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
     elif quantize == "bitsandbytes":
-        import warnings
-        warnings.warn("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance", DeprecationWarning)
+        warn_deprecate_bnb()
         linear = Linear8bitLt(
             weight,
             bias,