mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-09 19:34:53 +00:00)

add docs

parent 89565b4eaf
commit f0609e73d8
@@ -1,5 +1,5 @@
-use crate::{Batch, Client, Generation, HealthResponse, Request, ShardInfo};
 /// Multi shard Client
+use crate::{Batch, Client, Generation, HealthResponse, Request, ShardInfo};
 use crate::{ClientError, Result};
 use futures::future::join_all;
 use tonic::transport::Uri;
@@ -123,6 +123,7 @@ impl ShardedClient {
     }
 }

+/// Merge generations from the different model shards
 fn merge_generations(
     mut results: Vec<(Vec<Generation>, Option<Batch>)>,
 ) -> Result<(Vec<Generation>, Option<Batch>)> {
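The new doc comment names what merge_generations is for; below is a minimal sketch of such a merge, with stub Generation/Batch types standing in for the crate's real ones. It illustrates the idea (each shard decodes a disjoint slice of the requests, so merging is concatenation, and the next batch is identical on every shard) and is not the repository's implementation.

// Stub types standing in for the crate's Generation, Batch and Result;
// the real definitions live in the text-generation-inference client crate.
#[derive(Debug)]
struct Generation;
#[derive(Debug)]
struct Batch;
type Result<T> = std::result::Result<T, String>;

// Illustrative body only: concatenate per-shard generations and keep a
// single copy of the (shard-identical) next batch.
fn merge_generations(
    results: Vec<(Vec<Generation>, Option<Batch>)>,
) -> Result<(Vec<Generation>, Option<Batch>)> {
    let mut merged = Vec::new();
    let mut next_batch = None;
    for (mut generations, batch) in results {
        merged.append(&mut generations);
        next_batch = batch;
    }
    Ok((merged, next_batch))
}

fn main() {
    let (generations, batch) = merge_generations(vec![
        (vec![Generation], Some(Batch)),
        (vec![Generation, Generation], Some(Batch)),
    ])
    .unwrap();
    assert_eq!(generations.len(), 3);
    assert!(batch.is_some());
}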
@@ -572,6 +572,8 @@ class CausalLM(Model):
             if not stop:
                 stopped = False

+            # Shard generations
+            # All generations will be appended in the rust sharded client
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
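The comments added in this hunk (and in the two hunks below) describe the sharding scheme: shard rank only materializes the generation for request i when i % world_size == rank, and the Rust sharded client appends all per-shard generations back into one list. A small sketch of that selection rule, using a hypothetical requests_for_rank helper that is not part of the repo:

// Hypothetical helper illustrating the round-robin selection above:
// request i is decoded by the shard whose rank equals i % world_size.
fn requests_for_rank(num_requests: usize, world_size: usize, rank: usize) -> Vec<usize> {
    (0..num_requests).filter(|i| i % world_size == rank).collect()
}

fn main() {
    // 8 requests across 4 shards: rank 1 decodes requests 1 and 5,
    // and the union over all ranks covers every request exactly once.
    assert_eq!(requests_for_rank(8, 4, 1), vec![1, 5]);
}

Because the residues modulo world_size partition the request indices, every generation is produced by exactly one shard before the Rust-side merge.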
@@ -690,6 +690,8 @@ class FlashCausalLM(Model):
             if not stop:
                 stopped = False

+            # Shard generations
+            # All generations will be appended in the rust sharded client
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
@@ -653,6 +653,8 @@ class Seq2SeqLM(Model):
             if not stop:
                 stopped = False

+            # Shard generations
+            # All generations will be appended in the rust sharded client
             if i % self.world_size == self.rank:
                 if stop:
                     # Slice with decoder_input_length to remove padding
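The Seq2SeqLM hunk ends at a different comment: slicing by decoder_input_length drops the padding and keeps only the real decoder tokens. A sketch of that slicing, assuming left padding (an assumption, not stated in the diff) and a hypothetical strip_padding helper:

// Hypothetical illustration of slicing with decoder_input_length:
// assuming left padding, the last `decoder_input_length` ids are the
// real tokens and everything before them is pad.
fn strip_padding(decoder_input_ids: &[u32], decoder_input_length: usize) -> &[u32] {
    &decoder_input_ids[decoder_input_ids.len() - decoder_input_length..]
}

fn main() {
    let ids = [0u32, 0, 0, 5, 9, 2]; // three pad ids followed by three real tokens
    assert_eq!(strip_padding(&ids, 3), &[5, 9, 2]);
}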