diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index 1b8c4c5d..5548773b 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -20,7 +20,7 @@ struct Args {
     #[clap(default_value = "main", long, env)]
     revision: String,

-    /// Path to the GGUF model file to be used for inference.
+    /// Path to the GGUF model file for inference.
     #[clap(long, env)]
     model_gguf: String, // TODO Option() with hf->gguf & quantize

@@ -48,15 +48,15 @@ struct Args {
     #[clap(default_value = "-1.0", long, env)]
     defrag_threshold: f32,

-    /// Setup NUMA optimizations.
+    /// Enable NUMA optimizations.
     #[clap(default_value = "disabled", value_enum, long, env)]
     numa: LlamacppNuma,

-    /// Whether to use memory mapping.
+    /// Use memory mapping for the model.
     #[clap(default_value = "true", long, env)]
     use_mmap: bool,

-    /// Whether to use memory locking.
+    /// Use memory locking to prevent swapping.
     #[clap(default_value = "false", long, env)]
     use_mlock: bool,

@@ -68,95 +68,95 @@ struct Args {
     #[clap(default_value = "true", long, env)]
     flash_attention: bool,

-    /// Use data type for K cache.
+    /// Data type used for K cache.
     #[clap(default_value = "f16", value_enum, long, env)]
     type_k: LlamacppGGMLType,

-    /// Use data type for V cache.
+    /// Data type used for V cache.
     #[clap(default_value = "f16", value_enum, long, env)]
     type_v: LlamacppGGMLType,

-    /// TODO
+    /// Number of tokenizer workers used for payload validation and truncation.
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
+
+    /// Maximum number of concurrent requests.
     #[clap(default_value = "128", long, env)]
     max_concurrent_requests: usize,

-    #[clap(default_value = "2", long, env)]
-    max_best_of: usize,
-    #[clap(default_value = "4", long, env)]
-    max_stop_sequences: usize,
-    #[clap(default_value = "5", long, env)]
-    max_top_n_tokens: u32,

-    /// Maximum number of input tokens allowed per request.
+    /// Maximum number of input tokens per request.
     #[clap(default_value = "1024", long, env)]
     max_input_tokens: usize,

-    /// Maximum total tokens (input + output) allowed per request.
+    /// Maximum total tokens (input + output) per request.
     #[clap(default_value = "2048", long, env)]
     max_total_tokens: usize,

-// #[clap(default_value = "1.2", long, env)]
-// waiting_served_ratio: f32,
-// #[clap(default_value = "4096", long, env)]
-// max_batch_prefill_tokens: u32,
-
-    /// Maximum number of tokens that can be submitted within a batch
+    /// Maximum number of tokens in a batch.
     #[clap(default_value = "4096", long, env)]
     max_batch_total_tokens: usize,

-    /// Maximum number of tokens within a batch
+    /// Maximum number of tokens in a physical batch.
     #[clap(long, env)]
     max_physical_batch_total_tokens: Option<usize>,

-// #[clap(default_value = "20", long, env)]
-// max_waiting_tokens: usize,
-
-    /// Maximum number of requests per batch
+    /// Maximum number of requests per batch.
     #[clap(default_value = "1", long, env)]
     max_batch_size: usize,

-    /// The IP address to listen on
+    /// IP address to listen on.
     #[clap(default_value = "0.0.0.0", long, env)]
     hostname: String,

-    /// The port to listen on.
+    /// Port to listen on.
#[clap(default_value = "3001", long, short, env)] port: u16, -// #[clap(default_value = "/tmp/text-generation-server-0", long, env)] -// master_shard_uds_path: String, -// #[clap(long, env)] -// tokenizer_name: String, -// #[clap(long, env)] -// tokenizer_config_path: Option, -// #[clap(long, env, value_enum)] -// trust_remote_code: bool, -// #[clap(long, env)] -// api_key: Option, - + /// Enable JSON output format. #[clap(long, env)] json_output: bool, + + /// OTLP endpoint for telemetry data. #[clap(long, env)] otlp_endpoint: Option, + + /// Service name for OTLP telemetry. #[clap(default_value = "text-generation-inference.router", long, env)] otlp_service_name: String, + + /// Allowed origins for CORS. #[clap(long, env)] cors_allow_origin: Option>, + + /// Enable Ngrok tunneling. #[clap(long, env)] ngrok: bool, + + /// Ngrok authentication token. #[clap(long, env)] ngrok_authtoken: Option, + + /// Ngrok edge to use for tunneling. #[clap(long, env)] ngrok_edge: Option, + + /// Path to the tokenizer configuration file. #[clap(long, env)] tokenizer_config_path: Option, + + /// Disable grammar support. #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, + + /// Maximum number of inputs per request. #[clap(default_value = "4", long, env)] max_client_batch_size: usize, + + /// Level of usage statistics collection. #[clap(default_value = "on", long, env)] usage_stats: usage_stats::UsageStatsLevel, + + /// Maximum payload size limit in bytes. #[clap(default_value = "2000000", long, env)] payload_limit: usize, } @@ -257,9 +257,9 @@ async fn main() -> Result<(), RouterError> { server::run( backend, args.max_concurrent_requests, - args.max_best_of, - args.max_stop_sequences, - args.max_top_n_tokens, + 0, // max_best_of + 0, // max_stop_sequences + 0, // max_top_n_tokens args.max_input_tokens, args.max_total_tokens, args.validation_workers,