Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-12 12:54:52 +00:00
Removing serde override.
This commit is contained in:
parent 13350a330f
commit 3643d1cd9e
@@ -89,10 +89,10 @@ enum Quantization {
     Bitsandbytes,
     /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
     /// but it is known that the model will be much slower to run than the native f16.
-    BitsandbytesNF4,
+    BitsandbytesNf4,
     /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
     /// perplexity performance for you model
-    BitsandbytesFP4,
+    BitsandbytesFp4,
     /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
     /// This dtype has native ops should be the fastest if available.
     /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
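The rename from BitsandbytesNF4/BitsandbytesFP4 to BitsandbytesNf4/BitsandbytesFp4 lines up with the commit title: under serde's rename_all = "kebab-case", the mixed-case spelling derives the intended wire name on its own, so no per-variant rename override is needed. A minimal sketch of that derivation behavior, assuming serde and serde_json as dependencies; the enum and main below are illustrative, not the TGI source:

use serde::Serialize;

// Illustrative only: serde splits a CamelCase variant name before every
// uppercase letter, so `BitsandbytesNf4` derives "bitsandbytes-nf4",
// while the old spelling `BitsandbytesNF4` would have derived
// "bitsandbytes-n-f4" and so needed an explicit rename override.
#[derive(Serialize)]
#[serde(rename_all = "kebab-case")]
#[allow(dead_code)]
enum Quantization {
    Bitsandbytes,
    BitsandbytesNf4,
    BitsandbytesFp4,
}

fn main() {
    let name = serde_json::to_string(&Quantization::BitsandbytesNf4).unwrap();
    assert_eq!(name, "\"bitsandbytes-nf4\"");
    println!("{name}");
}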
@@ -109,10 +109,10 @@ impl std::fmt::Display for Quantization {
             Quantization::Bitsandbytes => {
                 write!(f, "bitsandbytes")
             }
-            Quantization::BitsandbytesNF4 => {
+            Quantization::BitsandbytesNf4 => {
                 write!(f, "bitsandbytes-nf4")
             }
-            Quantization::BitsandbytesFP4 => {
+            Quantization::BitsandbytesFp4 => {
                 write!(f, "bitsandbytes-fp4")
             }
             Quantization::Exl2 => {
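Because the Display impl spells each CLI string out by hand, the variant rename leaves the user-facing names "bitsandbytes-nf4" and "bitsandbytes-fp4" unchanged. A self-contained sketch of the mapping shown above, trimmed to the three bitsandbytes variants (the real enum also carries Exl2 and other variants):

use std::fmt;

#[allow(dead_code)]
enum Quantization {
    Bitsandbytes,
    BitsandbytesNf4,
    BitsandbytesFp4,
}

impl fmt::Display for Quantization {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // The printed strings are the stable CLI names; renaming the
            // Rust variants does not change them.
            Quantization::Bitsandbytes => write!(f, "bitsandbytes"),
            Quantization::BitsandbytesNf4 => write!(f, "bitsandbytes-nf4"),
            Quantization::BitsandbytesFp4 => write!(f, "bitsandbytes-fp4"),
        }
    }
}

fn main() {
    assert_eq!(Quantization::BitsandbytesNf4.to_string(), "bitsandbytes-nf4");
}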
@@ -1566,8 +1566,8 @@ fn main() -> Result<(), LauncherError> {
             None,
             Some(
                 Quantization::Bitsandbytes
-                    | Quantization::BitsandbytesNF4
-                    | Quantization::BitsandbytesFP4,
+                    | Quantization::BitsandbytesNf4
+                    | Quantization::BitsandbytesFp4,
             ),
         ) => {
             tracing::info!("Bitsandbytes doesn't work with cuda graphs, deactivating them");
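The last hunk only renames the variants inside an existing match guard: when the user did not set cuda graphs explicitly and a bitsandbytes quantization is selected, the launcher logs a message and deactivates them. A simplified, hypothetical sketch of that control flow; the real logic sits inline in a much larger match in the launcher's main, uses tracing::info! rather than a comment, and the helper name and default sizes below are made up for illustration:

#[allow(dead_code)]
enum Quantization {
    Bitsandbytes,
    BitsandbytesNf4,
    BitsandbytesFp4,
}

// Hypothetical helper, not the launcher's actual structure.
fn resolve_cuda_graphs(
    requested: Option<Vec<usize>>,
    quantize: Option<Quantization>,
) -> Vec<usize> {
    match (requested, quantize) {
        (
            None,
            Some(
                Quantization::Bitsandbytes
                | Quantization::BitsandbytesNf4
                | Quantization::BitsandbytesFp4,
            ),
        ) => {
            // "Bitsandbytes doesn't work with cuda graphs, deactivating them"
            Vec::new()
        }
        (Some(sizes), _) => sizes,
        // Hypothetical default batch sizes when nothing disables cuda graphs.
        (None, _) => vec![1, 2, 4, 8],
    }
}

fn main() {
    assert!(resolve_cuda_graphs(None, Some(Quantization::BitsandbytesNf4)).is_empty());
}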