From fc2d1134b8c3cf94c7e568e33443e0f314bfdcae Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Mon, 15 Jul 2024 14:15:55 +0200 Subject: [PATCH] doc: Add metrics documentation and add a 'Reference' section --- .github/workflows/build_pr_documentation.yaml | 2 +- .pre-commit-config.yaml | 2 +- docs/source/_toctree.yml | 14 ++- .../launcher.md | 0 docs/source/reference/metrics.md | 30 +++++ router/src/server.rs | 114 ++++++++++++++++++ update_doc.py | 2 +- 7 files changed, 158 insertions(+), 6 deletions(-) rename docs/source/{basic_tutorials => reference}/launcher.md (100%) create mode 100644 docs/source/reference/metrics.md diff --git a/.github/workflows/build_pr_documentation.yaml b/.github/workflows/build_pr_documentation.yaml index bf03bfdf..a5ce39a5 100644 --- a/.github/workflows/build_pr_documentation.yaml +++ b/.github/workflows/build_pr_documentation.yaml @@ -11,7 +11,7 @@ concurrency: jobs: build: - uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6f5e685e..0c8b6885 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - exclude: docs/source/basic_tutorials/launcher.md + exclude: docs/source/reference/launcher.md - repo: https://github.com/psf/black rev: 24.2.0 hooks: diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e97c00aa..3acfbc89 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -33,8 +33,6 @@ title: Serving Private & Gated Models - local: basic_tutorials/using_cli title: Using TGI CLI - - local: basic_tutorials/launcher - title: All TGI CLI options - local: basic_tutorials/non_core_models title: Non-core Model Serving - local: 
basic_tutorials/safety @@ -48,6 +46,16 @@ - local: basic_tutorials/train_medusa title: Train Medusa title: Tutorials +- sections: + - local: architecture + title: Internal Architecture + - local: reference/launcher + title: All TGI CLI options + - local: messages_api + title: Messages API + - local: reference/metrics + title: Exported Metrics + title: Reference - sections: - local: conceptual/streaming title: Streaming @@ -64,7 +72,7 @@ - local: conceptual/speculation title: Speculation (Medusa, ngram) - local: conceptual/guidance - title: How Guidance Works (via outlines + title: How Guidance Works (via outlines) - local: conceptual/lora title: LoRA (Low-Rank Adaptation) diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/reference/launcher.md similarity index 100% rename from docs/source/basic_tutorials/launcher.md rename to docs/source/reference/launcher.md diff --git a/docs/source/reference/metrics.md b/docs/source/reference/metrics.md new file mode 100644 index 00000000..d34d38ea --- /dev/null +++ b/docs/source/reference/metrics.md @@ -0,0 +1,30 @@ +# Metrics + +TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint. +These metrics can be used to monitor the performance of TGI, autoscale deployments, and to help identify bottlenecks.
+ +The following metrics are exposed: + +| Metric Name | Description | Type | Unit | +|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------| +| `tgi_batch_current_max_tokens` | Maximum tokens for the current batch | Gauge | Count | +| `tgi_batch_current_size` | Current batch size | Gauge | Count | +| `tgi_batch_decode_duration` | Time spent decoding a batch per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_filter_duration` | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_forward_duration` | Batch forward duration per method (prefill or decode) | Histogram | Seconds | +| `tgi_batch_inference_count` | Inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_inference_duration` | Batch inference duration | Histogram | Seconds | +| `tgi_batch_inference_success` | Number of successful inference calls per method (prefill or decode) | Counter | Count | +| `tgi_batch_next_size` | Batch size of the next batch | Histogram | Count | +| `tgi_queue_size` | Current queue size | Gauge | Count | +| `tgi_request_count` | Total number of requests | Counter | Count | +| `tgi_request_duration` | Total time spent processing the request (e2e latency) | Histogram | Seconds | +| `tgi_request_generated_tokens` | Generated tokens per request | Histogram | Count | +| `tgi_request_inference_duration` | Request inference duration | Histogram | Seconds | +| `tgi_request_input_length` | Input token length per request | Histogram | Count | +| `tgi_request_max_new_tokens` | Maximum new tokens per request | Histogram | Count | +| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency) | Histogram | Seconds | +| `tgi_request_queue_duration` | Time spent in the queue per request | Histogram | Seconds | +| `tgi_request_skipped_tokens` 
| Speculated tokens per request | Histogram | Count | +| `tgi_request_success` | Number of successful requests | Counter | | +| `tgi_request_validation_duration` | Time spent validating the request | Histogram | Seconds | diff --git a/router/src/server.rs b/router/src/server.rs index 1d1cd36a..468899ae 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1973,6 +1973,105 @@ async fn start( .install_recorder() .expect("failed to install metrics recorder"); + // Metrics descriptions + metrics::describe_counter!("tgi_request_success", "Number of successful requests"); + metrics::describe_histogram!( + "tgi_request_inference_duration", + metrics::Unit::Seconds, + "Request inference duration" + ); + metrics::describe_histogram!( + "tgi_request_mean_time_per_token_duration", + metrics::Unit::Seconds, + "Mean time per token per request" + ); + metrics::describe_histogram!( + "tgi_request_generated_tokens", + metrics::Unit::Count, + "Generated tokens per request" + ); + metrics::describe_counter!( + "tgi_batch_inference_count", + metrics::Unit::Count, + "Inference calls per method (prefill or decode)" + ); + metrics::describe_counter!( + "tgi_request_count", + metrics::Unit::Count, + "Total number of requests" + ); + metrics::describe_counter!( + "tgi_batch_inference_success", + metrics::Unit::Count, + "Number of successful inference calls per method (prefill or decode)" + ); + metrics::describe_gauge!( + "tgi_batch_current_size", + metrics::Unit::Count, + "Current batch size" + ); + metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size"); + metrics::describe_gauge!( + 
"tgi_batch_current_max_tokens", + metrics::Unit::Count, + "Maximum tokens for the current batch" + ); + metrics::describe_histogram!( + "tgi_request_max_new_tokens", + metrics::Unit::Count, + "Maximum new tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_inference_duration", + metrics::Unit::Seconds, + "Batch inference duration" + ); + metrics::describe_histogram!( + "tgi_batch_forward_duration", + metrics::Unit::Seconds, + "Batch forward duration per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_skipped_tokens", + metrics::Unit::Count, + "Speculated tokens per request" + ); + metrics::describe_histogram!( + "tgi_batch_filter_duration", + metrics::Unit::Seconds, + "Time spent filtering batches and sending generated tokens per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_queue_duration", + metrics::Unit::Seconds, + "Time spent in the queue per request" + ); + metrics::describe_histogram!( + "tgi_request_validation_duration", + metrics::Unit::Seconds, + "Time spent validating the request" + ); + metrics::describe_histogram!( + "tgi_request_duration", + metrics::Unit::Seconds, + "Total time spent processing the request" + ); + metrics::describe_histogram!( + "tgi_batch_decode_duration", + metrics::Unit::Seconds, + "Time spent decoding a batch per method (prefill or decode)" + ); + metrics::describe_histogram!( + "tgi_request_input_length", + metrics::Unit::Count, + "Input token length per request" + ); + metrics::describe_histogram!( + "tgi_batch_next_size", + metrics::Unit::Count, + "Batch size of the next batch" + ); + // CORS layer let allow_origin = allow_origin.unwrap_or(AllowOrigin::any()); let cors_layer = CorsLayer::new() diff --git a/update_doc.py b/update_doc.py index e887e1c6..3fb0d314 100644 --- a/update_doc.py +++ b/update_doc.py @@ -63,7 +63,7 @@ def check_cli(check: bool): final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" block = [] - filename = 
"docs/source/basic_tutorials/launcher.md" + filename = "docs/source/reference/launcher.md" if check: with open(filename, "r") as f: doc = f.read()