Add `bitsandbytes-nf4` and `bitsandbytes-fp4` quantization choices.

The launcher's `Quantization` enum gains two variants together with their
`Display` strings, and the server CLI's `Quantization` enum gains the two
matching string values, so both sides accept the same names.

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 146d83d6..35872867 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -22,6 +22,8 @@ mod env_runtime;
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
     Bitsandbytes,
+    BitsandbytesNF4,
+    BitsandbytesFP4,
     Gptq,
 }
 
@@ -32,6 +34,12 @@ impl std::fmt::Display for Quantization {
             Quantization::Bitsandbytes => {
                 write!(f, "bitsandbytes")
             }
+            Quantization::BitsandbytesNF4 => {
+                write!(f, "bitsandbytes-nf4")
+            }
+            Quantization::BitsandbytesFP4 => {
+                write!(f, "bitsandbytes-fp4")
+            }
             Quantization::Gptq => {
                 write!(f, "gptq")
             }
@@ -116,7 +124,7 @@ struct Args {
     num_shard: Option<usize>,
 
     /// Whether you want the model to be quantized. This will use `bitsandbytes` for
-    /// quantization on the fly, or `gptq`.
+    /// quantization on the fly (or its `bitsandbytes-nf4`/`bitsandbytes-fp4` variants), or `gptq`.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
 
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index eba807bc..459ba8c4 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -14,6 +14,8 @@ app = typer.Typer()
 
 class Quantization(str, Enum):
     bitsandbytes = "bitsandbytes"
+    bitsandbytes_nf4 = "bitsandbytes-nf4"
+    bitsandbytes_fp4 = "bitsandbytes-fp4"
     gptq = "gptq"
 
 