Marking the flag as really not the fastest and BETA.
parent 6568e4812f
commit eb40f8ccda
@@ -47,8 +47,11 @@ enum Quantization {
     /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
     /// perplexity performance for your model
     BitsandbytesFP4,
+    /// [BETA]
     /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
-    /// This dtype has native ops and should be the fastest if available.
+    /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
+    /// multiplication limitations.
     Fp8,
 }
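For context: in a clap-based launcher, these /// doc comments become the --help text for the corresponding CLI value, which is why the commit edits comments rather than code. Below is a minimal, self-contained sketch of that mechanism; the Args struct and the quantize flag name are illustrative assumptions, not TGI's actual launcher code.

use clap::{Parser, ValueEnum};

#[derive(Clone, Debug, ValueEnum)]
enum Quantization {
    /// Bitsandbytes 4bit. nf4 should be preferred in most cases.
    BitsandbytesFP4,
    /// [BETA] FP8 (e4m3) on H100 and above; currently not the fastest
    /// because of local unpacking + padding to satisfy matmul limitations.
    Fp8,
}

#[derive(Parser, Debug)]
struct Args {
    /// Quantization scheme to load the weights with (assumed flag name).
    #[clap(long, value_enum)]
    quantize: Option<Quantization>,
}

fn main() {
    // `--quantize fp8` parses to Some(Fp8); `--help` renders the doc
    // comments above as the possible-value descriptions.
    let args = Args::parse();
    println!("{:?}", args.quantize);
}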