Marking the flag as really not the fastest and BETA.

Nicolas Patry 2024-02-01 15:30:02 +00:00
parent 6568e4812f
commit eb40f8ccda


@@ -47,8 +47,11 @@ enum Quantization {
     /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
     /// perplexity performance for your model
     BitsandbytesFP4,
+    /// [BETA]
     /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
-    /// This dtype has native ops and should be the fastest if available.
+    /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
+    /// multiplication limitations.
     Fp8,
 }
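
For context on the new doc comment: below is a minimal sketch, in plain Rust and not code from this repository, of what "local unpacking + padding" means here. FP8 e4m3 weights are stored one byte per value and decoded to a wider float before the matrix multiplication, and operands are zero-padded so their shapes meet the alignment that FP8 tensor-core matmuls typically require. The decode follows the OCP FP8 E4M3 layout; the multiple-of-16 alignment and both function names are illustrative assumptions, not the launcher's actual implementation.

```rust
/// Decode one FP8 e4m3 byte (1 sign bit, 4 exponent bits with bias 7,
/// 3 mantissa bits) to f32, following the OCP FP8 E4M3 layout.
fn e4m3_to_f32(b: u8) -> f32 {
    let sign = if b & 0x80 != 0 { -1.0f32 } else { 1.0f32 };
    let exp = ((b >> 3) & 0x0F) as i32;
    let man = (b & 0x07) as f32;
    if exp == 0 {
        // Subnormal: no implicit leading 1, effective exponent is 1 - 7 = -6.
        sign * (man / 8.0) * 2f32.powi(-6)
    } else if exp == 0x0F && (b & 0x07) == 0x07 {
        // E4M3 has no infinities; exponent 15 with mantissa 7 encodes NaN.
        f32::NAN
    } else {
        // Normal: implicit leading 1, exponent bias 7 (max finite value is 448).
        sign * (1.0 + man / 8.0) * 2f32.powi(exp - 7)
    }
}

/// Zero-pad a row-major matrix so both dimensions become multiples of 16
/// (an assumed alignment requirement for FP8 tensor-core matmuls).
fn pad_to_multiple_of_16(data: &[f32], rows: usize, cols: usize) -> (Vec<f32>, usize, usize) {
    let (pr, pc) = ((rows + 15) / 16 * 16, (cols + 15) / 16 * 16);
    let mut out = vec![0.0f32; pr * pc];
    for r in 0..rows {
        out[r * pc..r * pc + cols].copy_from_slice(&data[r * cols..(r + 1) * cols]);
    }
    (out, pr, pc)
}

fn main() {
    // 0x40 = sign 0, exponent 8, mantissa 0 -> 1.0 * 2^(8-7) = 2.0.
    assert_eq!(e4m3_to_f32(0x40), 2.0);
    // Unpack a fake 5x3 FP8 weight tile, then pad it to 16x16 for the matmul:
    // both steps are extra work a native FP8 kernel would avoid.
    let packed: Vec<u8> = vec![0x40; 5 * 3];
    let unpacked: Vec<f32> = packed.iter().map(|&b| e4m3_to_f32(b)).collect();
    let (padded, pr, pc) = pad_to_multiple_of_16(&unpacked, 5, 3);
    assert_eq!((pr, pc, padded.len()), (16, 16, 256));
}
```

On the CLI side, this enum backs the launcher's `--quantize` flag, so the variant above corresponds to `--quantize fp8` (assuming clap's usual kebab-case value mapping).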