diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index b99e9591..9ee61ce6 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -119,6 +119,9 @@ struct Args {
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
 
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
+
     /// Enable JSON output format.
     #[clap(long, env)]
     json_output: bool,
@@ -317,6 +320,7 @@ async fn main() -> Result<(), RouterError> {
         args.max_client_batch_size,
         args.usage_stats,
         args.payload_limit,
+        args.prometheus_port,
     )
     .await?;
     Ok(())
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 9d4bf8f2..543f8e6e 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -37,6 +37,8 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
     #[clap(long, env, required = true)]
     tokenizer_name: String,
     #[clap(long, env)]
@@ -227,6 +229,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         max_batch_total_tokens,
         hostname,
         port,
+        prometheus_port,
         tokenizer_name,
         tokenizer_config_path,
         revision,
@@ -322,6 +325,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        prometheus_port,
     )
     .await?;
     Ok(())
diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs
index f537690e..60b5d52b 100644
--- a/backends/v2/src/main.rs
+++ b/backends/v2/src/main.rs
@@ -36,6 +36,8 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
     #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
     master_shard_uds_path: String,
     #[clap(default_value = "bigscience/bloom", long, env)]
@@ -99,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
         max_batch_size,
         hostname,
         port,
+        prometheus_port,
         master_shard_uds_path,
         tokenizer_name,
         tokenizer_config_path,
@@ -198,6 +201,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        prometheus_port,
     )
     .await?;
     Ok(())
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 52e41b55..44e63853 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -36,6 +36,8 @@ struct Args {
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
     #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
     master_shard_uds_path: String,
     #[clap(default_value = "bigscience/bloom", long, env)]
@@ -99,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
         max_batch_size,
         hostname,
         port,
+        prometheus_port,
         master_shard_uds_path,
         tokenizer_name,
         tokenizer_config_path,
@@ -214,6 +217,7 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         usage_stats,
         payload_limit,
+        prometheus_port,
     )
     .await?;
     Ok(())
diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md
index 6505a08d..51bd461f 100644
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -251,6 +251,15 @@ Options:
       [env: PORT=]
       [default: 3000]
 
+```
+## PROMETHEUS_PORT
+```shell
+  -p, --prometheus-port
+      The Prometheus port to listen on
+
+      [env: PROMETHEUS_PORT=]
+      [default: 9000]
+
 ```
 ## SHARD_UDS_PATH
 ```shell
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 2fbb9c12..a82ad12f 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -773,6 +773,10 @@ struct Args {
     #[clap(default_value = "3000", long, short, env)]
     port: u16,
 
+    /// The Prometheus port to listen on.
+    #[clap(default_value = "9000", long, short, env)]
+    prometheus_port: u16,
+
     /// The name of the socket for gRPC communication between the webserver
     /// and the shards.
     #[clap(default_value = "/tmp/text-generation-server", long, env)]
@@ -1848,6 +1852,8 @@ fn spawn_webserver(
         args.hostname.to_string(),
         "--port".to_string(),
         args.port.to_string(),
+        "--prometheus-port".to_string(),
+        args.prometheus_port.to_string(),
         "--master-shard-uds-path".to_string(),
         format!("{}-0", args.shard_uds_path),
         "--tokenizer-name".to_string(),
diff --git a/router/src/server.rs b/router/src/server.rs
index 22fad04b..98c88ec1 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1522,6 +1522,7 @@ pub async fn run(
     max_client_batch_size: usize,
     usage_stats_level: usage_stats::UsageStatsLevel,
     payload_limit: usize,
+    prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // CORS allowed origins
     // map to go inside the option and then map to parse from String to HeaderValue
@@ -1825,6 +1826,7 @@ pub async fn run(
         compat_return_full_text,
         allow_origin,
         payload_limit,
+        prometheus_port,
     )
     .await;
 
@@ -1886,6 +1888,7 @@ async fn start(
     compat_return_full_text: bool,
     allow_origin: Option<AllowOrigin>,
     payload_limit: usize,
+    prometheus_port: u16,
 ) -> Result<(), WebServerError> {
     // Determine the server port based on the feature and environment variable.
     let port = if cfg!(feature = "google") {
@@ -1959,8 +1962,12 @@ async fn start(
     // let skipped_matcher = Matcher::Full(String::from("tgi_request_skipped_tokens"));
     // let skipped_buckets: Vec<f64> = (0..shard_info.speculate + 1).map(|x| x as f64).collect();
 
+    let mut p_addr = addr;
+    p_addr.set_port(prometheus_port);
+
     // Prometheus handler
     let builder = PrometheusBuilder::new()
+        .with_http_listener(p_addr)
         .set_buckets_for_metric(duration_matcher, &duration_buckets)
         .unwrap()
         .set_buckets_for_metric(input_length_matcher, &input_length_buckets)
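For reference, the change above moves the Prometheus exporter onto its own listener instead of sharing the router's API port. Below is a minimal standalone sketch of the same pattern (not the TGI router code itself), assuming the `metrics-exporter-prometheus` crate; the addresses and port values are illustrative:

```rust
use std::net::{Ipv4Addr, SocketAddr, SocketAddrV4};
use std::time::Duration;

use metrics_exporter_prometheus::PrometheusBuilder;

fn main() {
    // Keep the metrics endpoint on its own port (9000 here), separate from
    // the application's HTTP port (3000 in the diff above).
    let prometheus_addr = SocketAddr::V4(SocketAddrV4::new(Ipv4Addr::UNSPECIFIED, 9000));

    // `with_http_listener` makes the exporter serve its rendered metrics on
    // the given address rather than the crate's default listener.
    PrometheusBuilder::new()
        .with_http_listener(prometheus_addr)
        .install()
        .expect("failed to install Prometheus exporter");

    // Anything recorded through the `metrics` facade is now scrapable from
    // port 9000 for as long as the process runs.
    std::thread::sleep(Duration::from_secs(60));
}
```

With the new launcher flag, starting the server with `--prometheus-port 9000` (or `PROMETHEUS_PORT=9000`) should expose metrics on port 9000, e.g. via `curl http://localhost:9000/metrics`, while the generation routes stay on the existing `--port` (default 3000).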