From eb40f8ccdad5d22f3d0b77680c3848901971168c Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 1 Feb 2024 15:30:02 +0000
Subject: [PATCH] Marking the flag as really not the fastest and BETA.

---
 launcher/src/main.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 5fee3e912..9c8abf8f5 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -47,8 +47,11 @@ enum Quantization {
     /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
     /// perplexity performance for you model
     BitsandbytesFP4,
+    /// [BETA]
     /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
     /// This dtype has native ops should be the fastest if available.
+    /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
+    /// multiplication limitations.
     Fp8,
 }