mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-26 12:32:10 +00:00
Add option to configure prometheus port (#3187)
* add prometheus port
* fix doc
* add port for trtllm and llamacpp
* Fixing format after rebase.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
parent
8f8819795f
commit
02715dc53f
@ -119,6 +119,9 @@ struct Args {
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
|
||||
#[clap(default_value = "9000", long, short, env)]
|
||||
prometheus_port: u16,
|
||||
|
||||
/// Enable JSON output format.
|
||||
#[clap(long, env)]
|
||||
json_output: bool,
|
||||
@ -317,6 +320,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
args.max_client_batch_size,
|
||||
args.usage_stats,
|
||||
args.payload_limit,
|
||||
args.prometheus_port,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
|
@ -37,6 +37,8 @@ struct Args {
|
||||
hostname: String,
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
#[clap(default_value = "9000", long, short, env)]
|
||||
prometheus_port: u16,
|
||||
#[clap(long, env, required = true)]
|
||||
tokenizer_name: String,
|
||||
#[clap(long, env)]
|
||||
@ -227,6 +229,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
|
||||
max_batch_total_tokens,
|
||||
hostname,
|
||||
port,
|
||||
prometheus_port,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
revision,
|
||||
@ -322,6 +325,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
|
||||
max_client_batch_size,
|
||||
usage_stats,
|
||||
payload_limit,
|
||||
prometheus_port,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
|
@ -36,6 +36,8 @@ struct Args {
|
||||
hostname: String,
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
#[clap(default_value = "9000", long, short, env)]
|
||||
prometheus_port: u16,
|
||||
#[clap(default_value = "/tmp/text-generation-server-0", long, env)]
|
||||
master_shard_uds_path: String,
|
||||
#[clap(default_value = "bigscience/bloom", long, env)]
|
||||
@ -99,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
max_batch_size,
|
||||
hostname,
|
||||
port,
|
||||
prometheus_port,
|
||||
master_shard_uds_path,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
@ -198,6 +201,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
max_client_batch_size,
|
||||
usage_stats,
|
||||
payload_limit,
|
||||
prometheus_port,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
|
@ -36,6 +36,8 @@ struct Args {
|
||||
hostname: String,
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
#[clap(default_value = "9000", long, short, env)]
|
||||
prometheus_port: u16,
|
||||
#[clap(default_value = "/tmp/text-generation-server-0", long, env)]
|
||||
master_shard_uds_path: String,
|
||||
#[clap(default_value = "bigscience/bloom", long, env)]
|
||||
@ -99,6 +101,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
max_batch_size,
|
||||
hostname,
|
||||
port,
|
||||
prometheus_port,
|
||||
master_shard_uds_path,
|
||||
tokenizer_name,
|
||||
tokenizer_config_path,
|
||||
@ -214,6 +217,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
max_client_batch_size,
|
||||
usage_stats,
|
||||
payload_limit,
|
||||
prometheus_port,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
|
@ -251,6 +251,15 @@ Options:
|
||||
[env: PORT=]
|
||||
[default: 3000]
|
||||
|
||||
```
|
||||
## PROMETHEUS_PORT
|
||||
```shell
|
||||
-p, --prometheus-port <PROMETHEUS_PORT>
|
||||
The Prometheus port to listen on
|
||||
|
||||
[env: PROMETHEUS_PORT=]
|
||||
[default: 9000]
|
||||
|
||||
```
|
||||
## SHARD_UDS_PATH
|
||||
```shell
|
||||
|
@ -773,6 +773,10 @@ struct Args {
|
||||
#[clap(default_value = "3000", long, short, env)]
|
||||
port: u16,
|
||||
|
||||
/// The Prometheus port to listen on.
|
||||
#[clap(default_value = "9000", long, short, env)]
|
||||
prometheus_port: u16,
|
||||
|
||||
/// The name of the socket for gRPC communication between the webserver
|
||||
/// and the shards.
|
||||
#[clap(default_value = "/tmp/text-generation-server", long, env)]
|
||||
@ -1848,6 +1852,8 @@ fn spawn_webserver(
|
||||
args.hostname.to_string(),
|
||||
"--port".to_string(),
|
||||
args.port.to_string(),
|
||||
"--prometheus-port".to_string(),
|
||||
args.prometheus_port.to_string(),
|
||||
"--master-shard-uds-path".to_string(),
|
||||
format!("{}-0", args.shard_uds_path),
|
||||
"--tokenizer-name".to_string(),
|
||||
|
@ -1522,6 +1522,7 @@ pub async fn run(
|
||||
max_client_batch_size: usize,
|
||||
usage_stats_level: usage_stats::UsageStatsLevel,
|
||||
payload_limit: usize,
|
||||
prometheus_port: u16,
|
||||
) -> Result<(), WebServerError> {
|
||||
// CORS allowed origins
|
||||
// map to go inside the option and then map to parse from String to HeaderValue
|
||||
@ -1825,6 +1826,7 @@ pub async fn run(
|
||||
compat_return_full_text,
|
||||
allow_origin,
|
||||
payload_limit,
|
||||
prometheus_port,
|
||||
)
|
||||
.await;
|
||||
|
||||
@ -1886,6 +1888,7 @@ async fn start(
|
||||
compat_return_full_text: bool,
|
||||
allow_origin: Option<AllowOrigin>,
|
||||
payload_limit: usize,
|
||||
prometheus_port: u16,
|
||||
) -> Result<(), WebServerError> {
|
||||
// Determine the server port based on the feature and environment variable.
|
||||
let port = if cfg!(feature = "google") {
|
||||
@ -1959,8 +1962,12 @@ async fn start(
|
||||
// let skipped_matcher = Matcher::Full(String::from("tgi_request_skipped_tokens"));
|
||||
// let skipped_buckets: Vec<f64> = (0..shard_info.speculate + 1).map(|x| x as f64).collect();
|
||||
|
||||
let mut p_addr = addr;
|
||||
p_addr.set_port(prometheus_port);
|
||||
|
||||
// Prometheus handler
|
||||
let builder = PrometheusBuilder::new()
|
||||
.with_http_listener(p_addr)
|
||||
.set_buckets_for_metric(duration_matcher, &duration_buckets)
|
||||
.unwrap()
|
||||
.set_buckets_for_metric(input_length_matcher, &input_length_buckets)
|
||||
|
Loading…
Reference in New Issue
Block a user