mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 12:54:52 +00:00)
doc: Add metrics documentation and add a 'Reference' section
parent 6e127dcc96 · commit fc2d1134b8
@@ -11,7 +11,7 @@ concurrency:

 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-        exclude: docs/source/basic_tutorials/launcher.md
+        exclude: docs/source/reference/launcher.md
   - repo: https://github.com/psf/black
     rev: 24.2.0
     hooks:
docs/source/_toctree.yml
@@ -33,8 +33,6 @@
     title: Serving Private & Gated Models
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
-  - local: basic_tutorials/launcher
-    title: All TGI CLI options
   - local: basic_tutorials/non_core_models
     title: Non-core Model Serving
   - local: basic_tutorials/safety
@@ -48,6 +46,16 @@
   - local: basic_tutorials/train_medusa
     title: Train Medusa
   title: Tutorials
+- sections:
+  - local: architecture
+    title: Internal Architecture
+  - local: reference/launcher
+    title: All TGI CLI options
+  - local: messages_api
+    title: Messages API
+  - local: reference/metrics
+    title: Exported Metrics
+  title: Reference
 - sections:
   - local: conceptual/streaming
     title: Streaming
@@ -64,7 +72,7 @@
   - local: conceptual/speculation
     title: Speculation (Medusa, ngram)
   - local: conceptual/guidance
-    title: How Guidance Works (via outlines
+    title: How Guidance Works (via outlines)
   - local: conceptual/lora
     title: LoRA (Low-Rank Adaptation)
docs/source/reference/metrics.md (new file, 30 lines)
@@ -0,0 +1,30 @@
+# Metrics
+
+TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
+These metrics can be used to monitor the performance of TGI, to autoscale deployments, and to help identify bottlenecks.
+
+The following metrics are exposed:
+
+| Metric Name | Description | Type | Unit |
+|---|---|---|---|
+| `tgi_batch_current_max_tokens` | Maximum tokens for the current batch | Gauge | Count |
+| `tgi_batch_current_size` | Current batch size | Gauge | Count |
+| `tgi_batch_decode_duration` | Time spent decoding a batch per method (prefill or decode) | Histogram | Seconds |
+| `tgi_batch_filter_duration` | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds |
+| `tgi_batch_forward_duration` | Batch forward duration per method (prefill or decode) | Histogram | Seconds |
+| `tgi_batch_inference_count` | Inference calls per method (prefill or decode) | Counter | Count |
+| `tgi_batch_inference_duration` | Batch inference duration | Histogram | Seconds |
+| `tgi_batch_inference_success` | Number of successful inference calls per method (prefill or decode) | Counter | Count |
+| `tgi_batch_next_size` | Batch size of the next batch | Histogram | Count |
+| `tgi_queue_size` | Current queue size | Gauge | Count |
+| `tgi_request_count` | Total number of requests | Counter | Count |
+| `tgi_request_duration` | Total time spent processing the request (e2e latency) | Histogram | Seconds |
+| `tgi_request_generated_tokens` | Generated tokens per request | Histogram | Count |
+| `tgi_request_inference_duration` | Request inference duration | Histogram | Seconds |
+| `tgi_request_input_length` | Input token length per request | Histogram | Count |
+| `tgi_request_max_new_tokens` | Maximum new tokens per request | Histogram | Count |
+| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency) | Histogram | Seconds |
+| `tgi_request_queue_duration` | Time spent in the queue per request | Histogram | Seconds |
+| `tgi_request_skipped_tokens` | Speculated tokens per request | Histogram | Count |
+| `tgi_request_success` | Number of successful requests | Counter | |
+| `tgi_request_validation_duration` | Time spent validating the request | Histogram | Seconds |
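A minimal sketch of consuming this endpoint (not part of the commit), assuming a TGI instance listening on `localhost:3000` (adjust to your deployment) and the `requests` and `prometheus_client` Python packages:

```python
# Hypothetical consumer of the /metrics endpoint documented above.
# Assumes TGI listens on localhost:3000; adjust the URL to your deployment.
import requests
from prometheus_client.parser import text_string_to_metric_families

# Fetch the Prometheus text exposition from the router.
body = requests.get("http://localhost:3000/metrics", timeout=5).text

# Print every tgi_* sample with its labels and current value.
for family in text_string_to_metric_families(body):
    if family.name.startswith("tgi_"):
        for sample in family.samples:
            print(f"{sample.name}{sample.labels or ''} = {sample.value}")
```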
router/src/server.rs
@@ -1973,6 +1973,120 @@ async fn start(
         .install_recorder()
         .expect("failed to install metrics recorder");
+
+    // Metrics descriptions
+    metrics::describe_counter!("tgi_request_success", "Number of successful requests");
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Request duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Request validation duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Request queue duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_inference_duration",
+        metrics::Unit::Seconds,
+        "Request inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_mean_time_per_token_duration",
+        metrics::Unit::Seconds,
+        "Mean time per token per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_generated_tokens",
+        metrics::Unit::Count,
+        "Generated tokens per request"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_count",
+        metrics::Unit::Count,
+        "Inference calls per method (prefill or decode)"
+    );
+    metrics::describe_counter!(
+        "tgi_request_count",
+        metrics::Unit::Count,
+        "Total number of requests"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_success",
+        metrics::Unit::Count,
+        "Number of successful inference calls per method (prefill or decode)"
+    );
+    metrics::describe_gauge!(
+        "tgi_batch_current_size",
+        metrics::Unit::Count,
+        "Current batch size"
+    );
+    metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
+    metrics::describe_gauge!(
+        "tgi_batch_current_max_tokens",
+        metrics::Unit::Count,
+        "Maximum tokens for the current batch"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_max_new_tokens",
+        metrics::Unit::Count,
+        "Maximum new tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_inference_duration",
+        metrics::Unit::Seconds,
+        "Batch inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_forward_duration",
+        metrics::Unit::Seconds,
+        "Batch forward duration per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_skipped_tokens",
+        metrics::Unit::Count,
+        "Speculated tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_filter_duration",
+        metrics::Unit::Seconds,
+        "Time spent filtering batches and sending generated tokens per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Time spent in the queue per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Time spent validating the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Total time spent processing the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_decode_duration",
+        metrics::Unit::Seconds,
+        "Time spent decoding a batch per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_input_length",
+        metrics::Unit::Count,
+        "Input token length per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_next_size",
+        metrics::Unit::Count,
+        "Batch size of the next batch"
+    );

     // CORS layer
     let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
     let cors_layer = CorsLayer::new()
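The `describe_*` macros above only attach a unit and help text to a metric name; the values themselves are recorded elsewhere in the router. A hedged sketch (hypothetical, not from the repository) of deriving a mean end-to-end latency from one of these histograms on the consumer side:

```python
# Hypothetical, not from the repository: derive a mean e2e latency from
# tgi_request_duration, assuming it is exported as a Prometheus histogram
# (i.e. with _sum and _count series) and that these series carry no labels.
import requests

body = requests.get("http://localhost:3000/metrics", timeout=5).text

# Collect the raw tgi_request_duration_* samples from the text exposition.
samples = {}
for line in body.splitlines():
    if line.startswith("tgi_request_duration_"):
        name, _, value = line.partition(" ")
        samples[name] = float(value)

# mean = sum of observed durations / number of observations
count = samples.get("tgi_request_duration_count", 0.0)
if count:
    mean = samples["tgi_request_duration_sum"] / count
    print(f"mean e2e latency over {int(count)} requests: {mean:.3f}s")
```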
update_doc.py
@@ -63,7 +63,7 @@ def check_cli(check: bool):
         final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
         block = []

-    filename = "docs/source/basic_tutorials/launcher.md"
+    filename = "docs/source/reference/launcher.md"
     if check:
         with open(filename, "r") as f:
             doc = f.read()