text-generation-inference/router/client/src/sharded_client.rs

/// Multi shard Client
use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
use crate::{ClientError, Result};
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;

#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
    clients: Vec<Client>,
}

impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<ShardInfo> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// GRPC health check
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.warmup(max_input_length, max_prefill_tokens)))
            .collect();
        // Take the minimum value
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<Option<u32>>>>()?;
        Ok(results.into_iter().flatten().min())
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone())))
            .collect();
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
            join_all(futures).await.into_iter().collect();
        merge_generations(results?)
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
            join_all(futures).await.into_iter().collect();
        merge_generations(results?)
    }
}

/// Merge generations from the different model shards
fn merge_generations(
    mut results: Vec<(Vec<Generation>, Option<CachedBatch>)>,
) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
    let (mut generations, next_batch) = results.pop().ok_or(ClientError::EmptyResults)?;

    for (mut shard_generations, _) in results.into_iter() {
        generations.append(&mut shard_generations);
    }
    Ok((generations, next_batch))
}
v0.1.0 2022-10-18 13:19:03 +00:00			`/// Multi shard Client`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};`
feat(server): shard token decode (#303) 2023-05-10 13:48:21 +00:00			`use crate::{ClientError, Result};`
Init 2022-10-08 10:30:12 +00:00			`use futures::future::join_all;`
			`use tonic::transport::Uri;`
feat: add distributed tracing (#62) 2023-02-13 12:02:45 +00:00			`use tracing::instrument;`
Init 2022-10-08 10:30:12 +00:00
feat(router): new healthcheck that skips the queue (#244) Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: OlivierDehaene <olivier@huggingface.co> 2023-04-26 18:23:54 +00:00			`#[derive(Debug, Clone)]`
v0.1.0 2022-10-18 13:19:03 +00:00			`/// Text Generation Inference gRPC multi client`
Init 2022-10-08 10:30:12 +00:00			`pub struct ShardedClient {`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`clients: Vec<Client>,`
Init 2022-10-08 10:30:12 +00:00			`}`

			`impl ShardedClient {`
v0.1.0 2022-10-18 13:19:03 +00:00			`fn new(clients: Vec<Client>) -> Self {`
feat(server): Support bitsandbytes 2022-10-27 12:25:29 +00:00			`Self { clients }`
Init 2022-10-08 10:30:12 +00:00			`}`

v0.1.0 2022-10-18 13:19:03 +00:00			`/// Create a new ShardedClient from a master client. The master client will communicate with`
			/// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
feat: Improve error handling 2022-10-17 12:59:00 +00:00			`async fn from_master_client(mut master_client: Client) -> Result<Self> {`
v0.1.0 2022-10-18 13:19:03 +00:00			`// Get all uris/unix sockets from the master client`
feat(server): add watermarking tests (#248) 2023-04-27 17:16:35 +00:00			`let uris = master_client.service_discovery().await?;`
v0.1.0 2022-10-18 13:19:03 +00:00			`let futures = uris.into_iter().map(Client::connect_uds);`
feat: Improve error handling 2022-10-17 12:59:00 +00:00			`let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();`
			`Ok(Self::new(clients?))`
Init 2022-10-08 10:30:12 +00:00			`}`

v0.1.0 2022-10-18 13:19:03 +00:00			`/// Returns a client connected to the given uri`
feat: Improve error handling 2022-10-17 12:59:00 +00:00			`pub async fn connect(uri: Uri) -> Result<Self> {`
			`let master_client = Client::connect(uri).await?;`
Init 2022-10-08 10:30:12 +00:00			`Self::from_master_client(master_client).await`
			`}`

feat: Improve error handling 2022-10-17 12:59:00 +00:00			`/// Returns a client connected to the given unix socket`
			`pub async fn connect_uds(path: String) -> Result<Self> {`
			`let master_client = Client::connect_uds(path).await?;`
Init 2022-10-08 10:30:12 +00:00			`Self::from_master_client(master_client).await`
			`}`

feat(router): add device and dtype info (#215) 2023-04-21 13:36:29 +00:00			`/// Get the model info`
			`#[instrument(skip(self))]`
			`pub async fn info(&mut self) -> Result<ShardInfo> {`
			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
			`.map(\|client\| client.info())`
			`.collect();`
			`join_all(futures).await.pop().unwrap()`
			`}`

feat(router): new healthcheck that skips the queue (#244) Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: OlivierDehaene <olivier@huggingface.co> 2023-04-26 18:23:54 +00:00			`/// GRPC health check`
			`#[instrument(skip(self))]`
			`pub async fn health(&mut self) -> Result<HealthResponse> {`
			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
			`.map(\|client\| client.health())`
			`.collect();`
			`join_all(futures).await.pop().unwrap()`
			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`/// Clear the past generations cache`
feat: add distributed tracing (#62) 2023-02-13 12:02:45 +00:00			`#[instrument(skip(self))]`
feat(server): clear cache on error (#143) 2023-03-28 09:29:35 +00:00			`pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
feat(server): clear cache on error (#143) 2023-03-28 09:29:35 +00:00			`.map(\|client\| client.clear_cache(batch_id))`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`.collect();`
			`join_all(futures).await.into_iter().collect()`
			`}`

feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 15:59:00 +00:00			`/// Filter a cached batch`
			`#[instrument(skip(self))]`
			`pub async fn filter_batch(`
			`&mut self,`
			`batch_id: u64,`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`request_ids: Vec<u64>,`
			`) -> Result<Option<CachedBatch>> {`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 15:59:00 +00:00			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`.map(\|client\| Box::pin(client.filter_batch(batch_id, request_ids.clone())))`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 15:59:00 +00:00			`.collect();`
			`// all shards return the same message`
			`join_all(futures).await.pop().unwrap()`
			`}`

feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 17:09:59 +00:00			`/// Warmup on a max size batch`
			`///`
			`/// Returns the maximum amount of tokens supported by the hardware`
			`#[instrument(skip(self))]`
			`pub async fn warmup(`
			`&mut self,`
			`max_input_length: u32,`
			`max_prefill_tokens: u32,`
feat(server): auto max_batch_total_tokens for flash att models (#630) 2023-07-19 07:31:25 +00:00			`) -> Result<Option<u32>> {`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 17:09:59 +00:00			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
feat(server): auto max_batch_total_tokens for flash att models (#630) 2023-07-19 07:31:25 +00:00			`.map(\|client\| Box::pin(client.warmup(max_input_length, max_prefill_tokens)))`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 17:09:59 +00:00			`.collect();`
feat: add cuda memory fraction (#659) Close #673 2023-07-24 09:43:58 +00:00			`// Take the minimum value`
			`let results = join_all(futures)`
			`.await`
			`.into_iter()`
			`.collect::<Result<Vec<Option<u32>>>>()?;`
			`Ok(results.into_iter().flatten().min())`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 17:09:59 +00:00			`}`

v0.1.0 2022-10-18 13:19:03 +00:00			`/// Generate one token for each request in the given batch`
			`///`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`/// Returns Generation for each request in batch`
v0.1.0 2022-10-18 13:19:03 +00:00			`/// and the next cached batch`
feat: add distributed tracing (#62) 2023-02-13 12:02:45 +00:00			`#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`pub async fn prefill(`
			`&mut self,`
			`batch: Batch,`
			`) -> Result<(Vec<Generation>, Option<CachedBatch>)> {`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`.map(\|client\| Box::pin(client.prefill(batch.clone())))`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`.collect();`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =`
feat(server): shard token decode (#303) 2023-05-10 13:48:21 +00:00			`join_all(futures).await.into_iter().collect();`
			`merge_generations(results?)`
Init 2022-10-08 10:30:12 +00:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`/// Generate one token for each request in the given cached batches`
v0.1.0 2022-10-18 13:19:03 +00:00			`///`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`/// Returns Generation for each request in batches`
v0.1.0 2022-10-18 13:19:03 +00:00			`/// and the next cached batch`
feat: add distributed tracing (#62) 2023-02-13 12:02:45 +00:00			`#[instrument(skip_all, fields(size = batches.iter().map(\|batch\|{batch.size}).sum::<u32>()))]`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`pub async fn decode(`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`&mut self,`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`batches: Vec<CachedBatch>,`
			`) -> Result<(Vec<Generation>, Option<CachedBatch>)> {`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`let futures: Vec<_> = self`
			`.clients`
			`.iter_mut()`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 16:04:00 +00:00			`.map(\|client\| Box::pin(client.decode(batches.clone())))`
feat(client): Simplify sharded logic 2022-10-22 21:40:05 +00:00			`.collect();`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =`
feat(server): shard token decode (#303) 2023-05-10 13:48:21 +00:00			`join_all(futures).await.into_iter().collect();`
			`merge_generations(results?)`
			`}`
			`}`

			`/// Merge generations from the different model shards`
			`fn merge_generations(`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 17:19:57 +00:00			`mut results: Vec<(Vec<Generation>, Option<CachedBatch>)>,`
			`) -> Result<(Vec<Generation>, Option<CachedBatch>)> {`
feat(server): shard token decode (#303) 2023-05-10 13:48:21 +00:00			`let (mut generations, next_batch) = results.pop().ok_or(ClientError::EmptyResults)?;`

			`for (mut shard_generations, _) in results.into_iter() {`
			`generations.append(&mut shard_generations);`
Refactored gRPC interface Added validation logic 2022-10-11 14:50:54 +00:00			`}`
feat(server): shard token decode (#303) 2023-05-10 13:48:21 +00:00			`Ok((generations, next_batch))`
Init 2022-10-08 10:30:12 +00:00			`}`