From fc2d1134b8c3cf94c7e568e33443e0f314bfdcae Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Mon, 15 Jul 2024 14:15:55 +0200 Subject: [PATCH] doc: Add metrics documentation and add a 'Reference' section --- .github/workflows/build_pr_documentation.yaml | 2 +- .pre-commit-config.yaml | 2 +- docs/source/_toctree.yml | 14 ++- .../launcher.md | 0 docs/source/reference/metrics.md | 30 +++++ router/src/server.rs | 114 ++++++++++++++++++ update_doc.py | 2 +- 7 files changed, 158 insertions(+), 6 deletions(-) rename docs/source/{basic_tutorials => reference}/launcher.md (100%) create mode 100644 docs/source/reference/metrics.md diff --git a/.github/workflows/build_pr_documentation.yaml b/.github/workflows/build_pr_documentation.yaml index bf03bfdf..a5ce39a5 100644 --- a/.github/workflows/build_pr_documentation.yaml +++ b/.github/workflows/build_pr_documentation.yaml @@ -11,7 +11,7 @@ concurrency: jobs: build: - uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6f5e685e..0c8b6885 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - exclude: docs/source/basic_tutorials/launcher.md + exclude: docs/source/reference/launcher.md - repo: https://github.com/psf/black rev: 24.2.0 hooks: diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e97c00aa..3acfbc89 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -33,8 +33,6 @@ title: Serving Private & Gated Models - local: basic_tutorials/using_cli title: Using TGI CLI - - local: basic_tutorials/launcher - title: All TGI CLI options - local: basic_tutorials/non_core_models title: Non-core Model Serving - local: 
basic_tutorials/safety @@ -48,6 +46,16 @@ - local: basic_tutorials/train_medusa title: Train Medusa title: Tutorials +- sections: + - local: architecture + title: Internal Architecture + - local: reference/launcher + title: All TGI CLI options + - local: messages_api + title: Messages API + - local: reference/metrics + title: Exported Metrics + title: Reference - sections: - local: conceptual/streaming title: Streaming @@ -64,7 +72,7 @@ - local: conceptual/speculation title: Speculation (Medusa, ngram) - local: conceptual/guidance - title: How Guidance Works (via outlines + title: How Guidance Works (via outlines) - local: conceptual/lora title: LoRA (Low-Rank Adaptation) diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/reference/launcher.md similarity index 100% rename from docs/source/basic_tutorials/launcher.md rename to docs/source/reference/launcher.md diff --git a/docs/source/reference/metrics.md b/docs/source/reference/metrics.md new file mode 100644 index 00000000..d34d38ea --- /dev/null +++ b/docs/source/reference/metrics.md @@ -0,0 +1,30 @@ +# Metrics + +TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint. +These metrics can be used to monitor the performance of TGI, autoscale deployments, and to help identify bottlenecks.
+ +The following metrics are exposed: + +| Metric Name | Description | Type | Unit | +|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------| +| `tgi_batch_current_max_tokens` | Maximum tokens for the current batch | Gauge | Count | +| `tgi_batch_current_size` | Current batch size | Gauge | Count | +| `tgi_batch_decode_duration` | Time spent decoding a batch per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_filter_duration` | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_forward_duration` | Batch forward duration per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_inference_count` | Inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_inference_duration` | Batch inference duration | Histogram | Seconds | +| `tgi_batch_inference_success` | Number of successful inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_next_size` | Batch size of the next batch | Histogram | Count | +| `tgi_queue_size` | Current queue size | Gauge | Count | +| `tgi_request_count` | Total number of requests | Counter | Count | +| `tgi_request_duration` | Total time spent processing the request (e2e latency) | Histogram | Seconds | +| `tgi_request_generated_tokens` | Generated tokens per request | Histogram | Count | +| `tgi_request_inference_duration` | Request inference duration | Histogram | Seconds | +| `tgi_request_input_length` | Input token length per request | Histogram | Count | +| `tgi_request_max_new_tokens` | Maximum new tokens per request | Histogram | Count | +| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency) | Histogram | Seconds | +| `tgi_request_queue_duration` | Time spent in the queue per request | Histogram | Seconds | +| `tgi_request_skipped_tokens` 
| Speculated tokens per request | Histogram | Count | +| `tgi_request_success` | Number of successful requests | Counter | | +| `tgi_request_validation_duration` | Time spent validating the request | Histogram | Seconds | diff --git a/router/src/server.rs b/router/src/server.rs index 1d1cd36a..468899ae 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1973,6 +1973,105 @@ async fn start( .install_recorder() .expect("failed to install metrics recorder"); + // Metrics descriptions + metrics::describe_counter!("tgi_request_success", "Number of successful requests"); + metrics::describe_histogram!( + "tgi_request_inference_duration", + metrics::Unit::Seconds, + "Request inference duration" + ); + metrics::describe_histogram!( + "tgi_request_mean_time_per_token_duration", + metrics::Unit::Seconds, + "Mean time per token per request" + ); + metrics::describe_histogram!( + "tgi_request_generated_tokens", + metrics::Unit::Count, + "Generated tokens per request" + ); + metrics::describe_counter!( + "tgi_batch_inference_count", + metrics::Unit::Count, + "Inference calls per method (prefill or decode)" + ); + metrics::describe_counter!( + "tgi_request_count", + metrics::Unit::Count, + "Total number of requests" + ); + metrics::describe_counter!( + "tgi_batch_inference_success", + metrics::Unit::Count, + "Number of successful inference calls per method (prefill or decode)" + ); + metrics::describe_gauge!( + "tgi_batch_current_size", + metrics::Unit::Count, + "Current batch size" + ); + metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size"); + metrics::describe_gauge!( + 
"tgi_batch_current_max_tokens", + metrics::Unit::Count, + "Maximum tokens for the current batch" + ); + metrics::describe_histogram!( + "tgi_request_max_new_tokens", + metrics::Unit::Count, + "Maximum new tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_inference_duration", + metrics::Unit::Seconds, + "Batch inference duration" + ); + metrics::describe_histogram!( + "tgi_batch_forward_duration", + metrics::Unit::Seconds, + "Batch forward duration per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_skipped_tokens", + metrics::Unit::Count, + "Speculated tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_filter_duration", + metrics::Unit::Seconds, + "Time spent filtering batches and sending generated tokens per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_queue_duration", + metrics::Unit::Seconds, + "Time spent in the queue per request" + ); + metrics::describe_histogram!( + "tgi_request_validation_duration", + metrics::Unit::Seconds, + "Time spent validating the request" + ); + metrics::describe_histogram!( + "tgi_request_duration", + metrics::Unit::Seconds, + "Total time spent processing the request" + ); + metrics::describe_histogram!( + "tgi_batch_decode_duration", + metrics::Unit::Seconds, + "Time spent decoding a batch per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_input_length", + metrics::Unit::Count, + "Input token length per request" + ); + metrics::describe_histogram!( + "tgi_batch_next_size", + metrics::Unit::Count, + "Batch size of the next batch" + ); + // CORS layer let allow_origin = allow_origin.unwrap_or(AllowOrigin::any()); let cors_layer = CorsLayer::new() diff --git a/update_doc.py b/update_doc.py index e887e1c6..3fb0d314 100644 --- a/update_doc.py +++ b/update_doc.py @@ -63,7 +63,7 @@ def check_cli(check: bool): final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" block = [] - filename = 
"docs/source/basic_tutorials/launcher.md" + filename = "docs/source/reference/launcher.md" if check: with open(filename, "r") as f: doc = f.read()