diff --git a/launcher/src/main.rs b/launcher/src/main.rs index cd4b2231..321d7c69 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -702,8 +702,8 @@ struct Args { /// Overall this number should be the largest possible amount that fits the /// remaining memory (after the model is loaded). Since the actual memory overhead /// depends on other parameters like if you're using quantization, flash attention - /// or the model implementation, text-generation-inference cannot infer this number - /// automatically. + /// or the model implementation, text-generation-inference infers this number automatically + /// if not provided, ensuring that the value is as large as possible. #[clap(long, env)] max_batch_total_tokens: Option<u32>,