Mirror of https://github.com/huggingface/text-generation-inference.git
(synced 2025-09-09 11:24:53 +00:00)

improved instrumentation

commit 67cd625c82
parent f81f0828d7
@@ -27,6 +27,7 @@ to power LLMs api-inference widgets.
 - [Docker](#docker)
 - [API Documentation](#api-documentation)
 - [A note on Shared Memory](#a-note-on-shared-memory-shm)
+- [Distributed Tracing](#distributed-tracing)
 - [Local Install](#local-install)
 - [CUDA Kernels](#cuda-kernels)
 - [Run BLOOM](#run-bloom)
@@ -103,6 +104,11 @@ curl 127.0.0.1:8080/generate_stream \
 You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
 The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).

+### Distributed Tracing
+
+`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+
 ### A note on Shared Memory (shm)

 [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
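The Rust hunks below switch the `tracing` attributes from `#[instrument(skip(self))]` (or plain `#[instrument]`) to `#[instrument(skip_all, fields(...))]`, and attach a `batch_size` field to the detached "batch" spans that are linked with `follows_from`. The following is a minimal, self-contained sketch of that pattern, not code from this repository: `Batch`, `prefill`, and the `main` wiring are made-up stand-ins, and only the `tracing` and `tracing-subscriber` crates are assumed.

```rust
use tracing::{info_span, instrument, Span};

// Illustrative stand-in for a batch type; not the repository's real struct.
struct Batch {
    id: u64,
    size: u32,
}

// `skip_all` keeps the arguments themselves out of the span;
// `fields(...)` records only the values worth tracing.
#[instrument(skip_all, fields(id = batch.id, size = batch.size))]
fn prefill(batch: &Batch) {
    tracing::info!("prefill called");
}

fn main() {
    // Plain stdout subscriber; a real service could add an OTLP layer here.
    tracing_subscriber::fmt::init();

    let batch = Batch { id: 0, size: 4 };

    // Detached "batch" span linked back to the current span, mirroring the
    // `info_span!(parent: None, ...)` + `follows_from` pattern in the diff.
    let batch_span = info_span!(parent: None, "batch", batch_size = batch.size);
    batch_span.follows_from(&Span::current());

    let _guard = batch_span.entered();
    prefill(&batch);
}
```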
@@ -66,7 +66,7 @@ impl Client {
 ///
 /// Returns Generation for each request in batch
 /// and the next cached batch
-#[instrument(skip(self))]
+#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
 pub async fn prefill(&mut self, batch: Batch) -> Result<(Vec<Generation>, Option<Batch>)> {
 let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
 let response = self.stub.prefill(request).await?.into_inner();
@@ -77,7 +77,7 @@ impl Client {
 ///
 /// Returns Generation for each request in batches
 /// and the next cached batch
-#[instrument(skip(self))]
+#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
 pub async fn decode(
 &mut self,
 batches: Vec<Batch>,
@@ -53,7 +53,7 @@ impl ShardedClient {
 ///
 /// Returns Generation for each request in batch
 /// and the next cached batch
-#[instrument(skip(self))]
+#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
 pub async fn prefill(&mut self, batch: Batch) -> Result<(Vec<Generation>, Option<Batch>)> {
 let futures: Vec<_> = self
 .clients
@@ -69,7 +69,7 @@ impl ShardedClient {
 ///
 /// Returns Generation for each request in batches
 /// and the next cached batch
-#[instrument(skip(self))]
+#[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
 pub async fn decode(
 &mut self,
 batches: Vec<Batch>,
@@ -243,10 +243,11 @@ async fn batching_task(
 }
 }
 // Create span for this batch to add context to inference calls
-let next_batch_span = info_span!(parent: None, "batch");
+let next_batch_size = entries.len();
+let next_batch_span = info_span!(parent: None, "batch", batch_size = next_batch_size);
 entries.iter_mut().for_each(|(_, entry)| {
 // Create a new span to link the batch back to this entry
-let entry_batch_span = info_span!(parent: &entry.span, "infer");
+let entry_batch_span = info_span!(parent: &entry.span, "infer", batch_size = next_batch_size);
 // Add relationship
 entry_batch_span.follows_from(&next_batch_span);
 // Update entry
@@ -263,7 +264,7 @@ async fn batching_task(
 }

 /// Wrap a future inside a match statement to handle errors and send the responses to Infer
-#[instrument(skip(future))]
+#[instrument(skip_all)]
 async fn wrap_future(
 future: impl Future<Output = Result<(Vec<Generation>, Option<Batch>), ClientError>>,
 entries: &mut IntMap<u64, Entry>,
@@ -282,7 +283,7 @@ async fn wrap_future(
 }

 /// Send errors to Infer for all `entries`
-#[instrument]
+#[instrument(skip_all)]
 fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
 entries.drain().for_each(|(_, entry)| {
 // Create and enter a span to link this function back to the entry
@@ -299,7 +300,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
 }

 /// Send one or multiple `InferStreamResponse` to Infer for all `entries`
-#[instrument]
+#[instrument(skip_all)]
 fn send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
 generations.into_iter().for_each(|generation| {
 // Get entry
@@ -309,7 +310,7 @@ fn send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entr
 .expect("ID not found in entries. This is a bug.");

 // Create and enter a span to link this function back to the entry
-let _generation_span = info_span!(parent: entry.batch_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation").entered();
+let _generation_span = info_span!(parent: entry.batch_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();

 if let Some(prefill_tokens) = generation.prefill_tokens {
 // Send message
@@ -121,7 +121,6 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
 true => fmt_layer
 .json()
 .flatten_event(true)
-.with_span_list(false)
 .boxed(),
 false => fmt_layer.boxed(),
 };
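The hunk above sits in `init_logging`, which receives the OTLP endpoint and configures the `tracing_subscriber` format layer. As a rough, hypothetical sketch of what an `--otlp-endpoint` value typically feeds into, the snippet below layers an OTLP exporter into a `tracing_subscriber` registry. It is not this repository's code: the helper name is made up, and the builder calls assume an opentelemetry-otlp 0.1x-era API with the `tonic` and `rt-tokio` features, `tracing-opentelemetry`, and a running Tokio runtime; exact names differ across versions.

```rust
use opentelemetry_otlp::WithExportConfig;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;

// Hypothetical helper, not from the repository: wires an optional OTLP
// endpoint into a tracing-subscriber registry alongside a fmt layer.
fn init_tracing(otlp_endpoint: Option<String>) {
    // Only build the OpenTelemetry layer when an endpoint was provided.
    let otel_layer = otlp_endpoint.map(|endpoint| {
        let tracer = opentelemetry_otlp::new_pipeline()
            .tracing()
            .with_exporter(
                opentelemetry_otlp::new_exporter()
                    .tonic()
                    .with_endpoint(endpoint),
            )
            // Batch exporter; needs a Tokio runtime and the `rt-tokio` feature.
            .install_batch(opentelemetry::runtime::Tokio)
            .expect("failed to install OTLP tracer");
        tracing_opentelemetry::layer().with_tracer(tracer)
    });

    tracing_subscriber::registry()
        .with(otel_layer) // `Option<Layer>` is itself a layer; `None` is a no-op.
        .with(tracing_subscriber::fmt::layer())
        .init();
}

#[tokio::main]
async fn main() {
    // e.g. a collector listening on the default OTLP/gRPC port.
    init_tracing(Some("http://127.0.0.1:4317".to_string()));
    tracing::info!("tracing initialized");
}
```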
@@ -48,7 +48,7 @@ impl Queue {
 }

 /// Append an entry to the queue
-#[instrument(skip(self))]
+#[instrument(skip_all)]
 pub(crate) fn append(&self, entry: Entry) {
 // Send append command to the background task managing the state
 // Unwrap is safe here
@@ -143,12 +143,12 @@ impl State {
 }
 }

-// Create span for this batch to add context to inference calls
-let next_batch_span = info_span!(parent: None, "batch");
-next_batch_span.follows_from(&Span::current());
-
 let next_batch_size = min(self.entries.len(), max_size);

+// Create span for this batch to add context to inference calls
+let next_batch_span = info_span!(parent: None, "batch", batch_size = next_batch_size);
+next_batch_span.follows_from(&Span::current());
+
 let mut batch_requests = Vec::with_capacity(next_batch_size);
 let mut batch_entries =
 IntMap::with_capacity_and_hasher(next_batch_size, BuildNoHashHasher::default());
@@ -158,7 +158,7 @@ impl State {
 .drain(..next_batch_size)
 .for_each(|(id, mut entry)| {
 // Create a new span to link the batch back to this entry
-let entry_batch_span = info_span!(parent: &entry.span, "infer");
+let entry_batch_span = info_span!(parent: &entry.span, "infer", batch_size = next_batch_size);
 // Add relationship
 entry_batch_span.follows_from(&next_batch_span);
 // Update entry
@@ -37,7 +37,7 @@ impl Validation {
 }

 /// Validate a payload and get the number of tokens in the input
-#[instrument(skip(self))]
+#[instrument(skip_all)]
 pub(crate) async fn validate(
 &self,
 request: GenerateRequest,
@@ -4,7 +4,6 @@ import typer

 from pathlib import Path
 from loguru import logger
-from typer import Argument
 from typing import Optional

 from text_generation import server, utils
@@ -22,7 +21,7 @@ def serve(
 uds_path: Path = "/tmp/text-generation",
 logger_level: str = "INFO",
 json_output: bool = False,
-otlp_endpoint: Optional[str] = Argument(None, envvar="OTLP_ENDPOINT"),
+otlp_endpoint: Optional[str] = None,
 ):
 if sharded:
 assert (