diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 3f1b37d6..54b8342b 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -221,7 +221,7 @@ struct Args {
     /// for users. The larger this value, the longer prompt users can send which
     /// can impact the overall memory required to handle the load.
     /// Please note that some models have a finite range of sequence they can handle.
-    /// Default to min(max_position_embeddings - 1, 16383)
+    /// Default to min(max_position_embeddings - 1, 4095)
     #[clap(long, env)]
     max_input_tokens: Option<usize>,
 
@@ -237,7 +237,7 @@ struct Args {
     /// `1511` max_new_tokens.
     /// The larger this value, the larger amount each request will be in your RAM
     /// and the less effective batching can be.
-    /// Default to min(max_position_embeddings, 16384)
+    /// Default to min(max_position_embeddings, 4096)
     #[clap(long, env)]
     max_total_tokens: Option<usize>,
 
@@ -257,7 +257,7 @@ struct Args {
     /// Limits the number of tokens for the prefill operation.
     /// Since this operation take the most memory and is compute bound, it is interesting
     /// to limit the number of requests that can be sent.
-    /// Default to min(max_input_length + 50, 16384) to give a bit of room.
+    /// Default to `max_input_length + 50` to give a bit of room.
     #[clap(long, env)]
     max_batch_prefill_tokens: Option<u32>,
 
@@ -1294,11 +1294,7 @@ fn main() -> Result<(), LauncherError> {
         let config: Config = serde_json::from_str(&content)?;
 
         // Quantization usually means you're even more RAM constrained.
-        let max_default = if args.quantize.is_some() {
-            4096
-        } else {
-            2usize.pow(14)
-        };
+        let max_default = 4096;
 
         let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) {
             (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
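
For context, here is a minimal sketch of how the lowered defaults compose once this change lands. It is illustrative only: `default_token_limits` is a made-up helper, not the launcher's actual code path, which reads `max_position_embeddings` / `max_seq_len` from the model's `config.json` and lets CLI/env flags override everything.

```rust
/// Illustrative helper (not part of the launcher): derive the default token
/// limits after this change, assuming no CLI/env overrides are set.
fn default_token_limits(max_position_embeddings: usize) -> (usize, usize, usize) {
    // The cap is now a flat 4096, regardless of quantization.
    let max_default = 4096;

    // Default to min(max_position_embeddings, 4096)
    let max_total_tokens = max_position_embeddings.min(max_default);
    // Default to min(max_position_embeddings - 1, 4095)
    let max_input_tokens = max_total_tokens - 1;
    // Default to `max_input_length + 50` to give a bit of room for prefill
    let max_batch_prefill_tokens = max_input_tokens + 50;

    (max_input_tokens, max_total_tokens, max_batch_prefill_tokens)
}

fn main() {
    // A model advertising 32k positions now defaults to 4095 / 4096 / 4145
    // instead of 16383 / 16384 / 16433.
    let (input, total, prefill) = default_token_limits(32_768);
    println!("max_input_tokens={input} max_total_tokens={total} max_batch_prefill_tokens={prefill}");
}
```

The exact derivation order is an assumption here; the point is simply that both caps drop from 2^14 to 4096 whether or not `--quantize` is set, and the prefill budget follows the input limit.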