mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 04:14:52 +00:00)
Putting the deprecation notice on bnb (8bit).
This commit is contained in:
parent 2d13b6ff6c
commit 085c43243d
Cargo.lock (generated): 3820 changes. File diff suppressed because it is too large.
@@ -21,12 +21,29 @@ mod env_runtime;
 
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
-    Bitsandbytes,
-    BitsandbytesNF4,
-    BitsandbytesFP4,
-    Gptq,
+    /// 4 bit quantization. Requires a specific AWQ quantized model:
+    /// https://hf.co/models?search=awq.
+    /// Should replace GPTQ models wherever possible because of the better latency
     Awq,
+    /// 8 bit quantization, doesn't require a specific model.
+    /// Should be a drop-in replacement to bitsandbytes with much better performance.
+    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
     Eetq,
+    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
+    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
+    /// triton kernel (wider support) when it's not.
+    /// AWQ has faster kernels.
+    Gptq,
+    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
+    /// but it is known that the model will be much slower to run than the native f16.
+    #[deprecated(since="1.1.0", note="Use `eetq` instead, which provides better latencies overall and is drop-in in most cases")]
+    Bitsandbytes,
+    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
+    /// but it is known that the model will be much slower to run than the native f16.
+    BitsandbytesNF4,
+    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
+    /// perplexity performance for your model
+    BitsandbytesFP4,
 }
 
 impl std::fmt::Display for Quantization {
@@ -47,6 +64,7 @@ impl std::fmt::Display for Quantization {
             }
             Quantization::Awq => {
                 write!(f, "awq")
+            }
             Quantization::Eetq => {
                 write!(f, "eetq")
             }
@@ -130,9 +148,7 @@ struct Args {
     #[clap(long, env)]
     num_shard: Option<usize>,
 
-    /// Whether you want the model to be quantized. This will use `bitsandbytes` for
-    /// quantization on the fly, or `gptq`. 4bit quantization is available through
-    /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
+    /// Whether you want the model to be quantized.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
 
server/.gitignore (vendored): 1 change
@@ -160,3 +160,4 @@ flash-attention/
 flash-attention-v2/
 vllm/
 llm-awq/
+eetq/
@@ -5,6 +5,8 @@ import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 from typing import List
+from loguru import logger
+from functools import lru_cache
 
 HAS_BITS_AND_BYTES = True
 try:
@@ -242,6 +244,10 @@ class Linear4bit(nn.Module):
         return out
 
 
+@lru_cache(1)
+def warn_deprecate_bnb():
+    logger.warning("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance")
+
 def get_linear(weight, bias, quantize):
     if quantize is None:
         linear = FastLinear(weight, bias)
@@ -251,8 +257,7 @@ def get_linear(weight, bias, quantize):
         else:
             raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
     elif quantize == "bitsandbytes":
-        import warnings
-        warnings.warn("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce", DeprecationWarning)
+        warn_deprecate_bnb()
         linear = Linear8bitLt(
             weight,
             bias,
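The server-side hunks above replace the inline `warnings.warn(..., DeprecationWarning)` with a `loguru` warning wrapped in `functools.lru_cache(1)`, so the deprecation notice is logged once per process no matter how many bitsandbytes layers `get_linear` builds, and it is not swallowed by Python's default warning filters the way a `DeprecationWarning` raised from library code can be. Below is a minimal standalone sketch of that once-only pattern; it is not the repository's code: it uses the standard `logging` module instead of `loguru` to stay dependency-free, and the logger name and the `__main__` driver loop are illustrative assumptions.

import logging
from functools import lru_cache

# Stand-in for `from loguru import logger` used in the actual change.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("quantize")


@lru_cache(1)
def warn_deprecate_bnb():
    # lru_cache memoizes the zero-argument call: the body runs on the first
    # invocation only; every later call returns the cached None without logging.
    logger.warning(
        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, "
        "and has much better performance"
    )


if __name__ == "__main__":
    # Simulate building several quantized linear layers: one warning in total.
    for _ in range(5):
        warn_deprecate_bnb()

After the first call, `warn_deprecate_bnb()` is effectively a cache lookup, so the notice adds no meaningful cost to the layer-construction path that `get_linear` sits on.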