mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 12:54:52 +00:00

doc: Add metrics documentation and add a 'Reference' section

This commit is contained in:
parent 6e127dcc96
commit fc2d1134b8

@@ -11,7 +11,7 @@ concurrency:

 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}

@@ -5,7 +5,7 @@ repos:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-        exclude: docs/source/basic_tutorials/launcher.md
+        exclude: docs/source/reference/launcher.md
   - repo: https://github.com/psf/black
     rev: 24.2.0
     hooks:

@@ -33,8 +33,6 @@
     title: Serving Private & Gated Models
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
-  - local: basic_tutorials/launcher
-    title: All TGI CLI options
   - local: basic_tutorials/non_core_models
     title: Non-core Model Serving
   - local: basic_tutorials/safety

@@ -48,6 +46,16 @@
   - local: basic_tutorials/train_medusa
     title: Train Medusa
   title: Tutorials
+- sections:
+  - local: architecture
+    title: Internal Architecture
+  - local: reference/launcher
+    title: All TGI CLI options
+  - local: messages_api
+    title: Messages API
+  - local: reference/metrics
+    title: Exported Metrics
+  title: Reference
 - sections:
   - local: conceptual/streaming
     title: Streaming

@@ -64,7 +72,7 @@
   - local: conceptual/speculation
     title: Speculation (Medusa, ngram)
   - local: conceptual/guidance
-    title: How Guidance Works (via outlines
+    title: How Guidance Works (via outlines)
   - local: conceptual/lora
     title: LoRA (Low-Rank Adaptation)

docs/source/reference/metrics.md (new file, 30 lines)
@@ -0,0 +1,30 @@
# Metrics

TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
These metrics can be used to monitor the performance of TGI, to autoscale deployments, and to help identify bottlenecks.

The following metrics are exposed:

| Metric Name                                 | Description                                                                                | Type      | Unit    |
|---------------------------------------------|--------------------------------------------------------------------------------------------|-----------|---------|
| `tgi_batch_current_max_tokens`              | Maximum tokens for the current batch                                                       | Gauge     | Count   |
| `tgi_batch_current_size`                    | Current batch size                                                                         | Gauge     | Count   |
| `tgi_batch_decode_duration`                 | Time spent decoding a batch per method (prefill or decode)                                 | Histogram | Seconds |
| `tgi_batch_filter_duration`                 | Time spent filtering batches and sending generated tokens per method (prefill or decode)   | Histogram | Seconds |
| `tgi_batch_forward_duration`                | Batch forward duration per method (prefill or decode)                                      | Histogram | Seconds |
| `tgi_batch_inference_count`                 | Inference calls per method (prefill or decode)                                             | Counter   | Count   |
| `tgi_batch_inference_duration`              | Batch inference duration                                                                   | Histogram | Seconds |
| `tgi_batch_inference_success`               | Number of successful inference calls per method (prefill or decode)                        | Counter   | Count   |
| `tgi_batch_next_size`                       | Batch size of the next batch                                                               | Histogram | Count   |
| `tgi_queue_size`                            | Current queue size                                                                         | Gauge     | Count   |
| `tgi_request_count`                         | Total number of requests                                                                   | Counter   | Count   |
| `tgi_request_duration`                      | Total time spent processing the request (e2e latency)                                      | Histogram | Seconds |
| `tgi_request_generated_tokens`              | Generated tokens per request                                                               | Histogram | Count   |
| `tgi_request_inference_duration`            | Request inference duration                                                                 | Histogram | Seconds |
| `tgi_request_input_length`                  | Input token length per request                                                             | Histogram | Count   |
| `tgi_request_max_new_tokens`                | Maximum new tokens per request                                                             | Histogram | Count   |
| `tgi_request_mean_time_per_token_duration`  | Mean time per token per request (inter-token latency)                                      | Histogram | Seconds |
| `tgi_request_queue_duration`                | Time spent in the queue per request                                                        | Histogram | Seconds |
| `tgi_request_skipped_tokens`                | Speculated tokens per request                                                              | Histogram | Count   |
| `tgi_request_success`                       | Number of successful requests                                                              | Counter   |         |
| `tgi_request_validation_duration`           | Time spent validating the request                                                          | Histogram | Seconds |
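
A quick way to see what is exported is to scrape the endpoint directly. The sketch below is illustrative and not part of this commit: it assumes a TGI router listening on `localhost:3000` (adjust the address and port to your deployment) and the `reqwest` crate with its `blocking` feature enabled.

```rust
// Illustrative only: fetch TGI's Prometheus endpoint and print the TGI
// metric samples, skipping Prometheus comment lines (# HELP / # TYPE).
// Assumes reqwest = { version = "0.12", features = ["blocking"] }.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let body = reqwest::blocking::get("http://localhost:3000/metrics")?.text()?;
    for line in body.lines().filter(|l| l.starts_with("tgi_")) {
        println!("{line}");
    }
    Ok(())
}
```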

@@ -1973,6 +1973,120 @@ async fn start(
         .install_recorder()
         .expect("failed to install metrics recorder");
+
+    // Metrics descriptions
+    metrics::describe_counter!("tgi_request_success", "Number of successful requests");
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Request duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Request validation duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Request queue duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_inference_duration",
+        metrics::Unit::Seconds,
+        "Request inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_mean_time_per_token_duration",
+        metrics::Unit::Seconds,
+        "Mean time per token per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_generated_tokens",
+        metrics::Unit::Count,
+        "Generated tokens per request"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_count",
+        metrics::Unit::Count,
+        "Inference calls per method (prefill or decode)"
+    );
+    metrics::describe_counter!(
+        "tgi_request_count",
+        metrics::Unit::Count,
+        "Total number of requests"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_success",
+        metrics::Unit::Count,
+        "Number of successful inference calls per method (prefill or decode)"
+    );
+    metrics::describe_gauge!(
+        "tgi_batch_current_size",
+        metrics::Unit::Count,
+        "Current batch size"
+    );
+    metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
+    metrics::describe_gauge!(
+        "tgi_batch_current_max_tokens",
+        metrics::Unit::Count,
+        "Maximum tokens for the current batch"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_max_new_tokens",
+        metrics::Unit::Count,
+        "Maximum new tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_inference_duration",
+        metrics::Unit::Seconds,
+        "Batch inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_forward_duration",
+        metrics::Unit::Seconds,
+        "Batch forward duration per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_skipped_tokens",
+        metrics::Unit::Count,
+        "Speculated tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_filter_duration",
+        metrics::Unit::Seconds,
+        "Time spent filtering batches and sending generated tokens per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Time spent in the queue per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Time spent validating the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Total time spent processing the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_decode_duration",
+        metrics::Unit::Seconds,
+        "Time spent decoding a batch per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_input_length",
+        metrics::Unit::Count,
+        "Input token length per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_next_size",
+        metrics::Unit::Count,
+        "Batch size of the next batch"
+    );

     // CORS layer
     let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
     let cors_layer = CorsLayer::new()
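
These `describe_*` calls only attach units and help text to metric names; the values themselves are recorded elsewhere in the router through the same `metrics` facade. As a hedged sketch of what such recording typically looks like (assuming the 0.22+ macro API of the `metrics` crate; the `method` label value and the function are illustrative, not taken from this commit):

```rust
// Illustrative sketch, not code from this commit: recording values for
// metrics described above via the `metrics` crate facade (0.22+ API).
// The "method" label mirrors the per-method (prefill/decode) breakdown.
fn record_prefill(batch_size: usize, inference_secs: f64) {
    metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
    metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
    metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
        .record(inference_secs);
}
```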
|
||||
|

@@ -63,7 +63,7 @@ def check_cli(check: bool):
             final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
             block = []

-    filename = "docs/source/basic_tutorials/launcher.md"
+    filename = "docs/source/reference/launcher.md"
     if check:
         with open(filename, "r") as f:
             doc = f.read()