text-generation-inference/backends/client/src/v3/sharded_client.rs

/// Multi shard Client
use crate::{v3, Health, ShardInfo};
use crate::{ClientError, Result};
use crate::v3::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
use v3::client::{DecodeTimings, PrefillTimings};
use v3::{
Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
clients: Vec<Client>,
}
impl ShardedClient {
fn new(clients: Vec<Client>) -> Self {
Self { clients }
}
/// Create a new ShardedClient from a master client. The master client will communicate with
/// the other shards and return all uris/unix sockets with the `service_discovery` gRPC method.
async fn from_master_client(mut master_client: Client) -> Result<Self> {
// Get all uris/unix sockets from the master client
let uris = master_client.service_discovery().await?;
let futures = uris.into_iter().map(Client::connect_uds);
let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
Ok(Self::new(clients?))
}
/// Returns a client connected to the given uri
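///
/// # Example
///
/// A minimal sketch, assuming this crate is consumed as `text_generation_client`
/// and that a master shard is serving at the given address:
///
/// ```ignore
/// use tonic::transport::Uri;
/// use text_generation_client::v3::ShardedClient;
///
/// let uri: Uri = "http://127.0.0.1:3000".parse().unwrap();
/// // Connects to the master shard, then to every shard found via service discovery
/// let mut sharded_client = ShardedClient::connect(uri).await?;
/// let info = sharded_client.info().await?;
/// ```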
pub async fn connect(uri: Uri) -> Result<Self> {
let master_client = Client::connect(uri).await?;
Self::from_master_client(master_client).await
}
/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
let master_client = Client::connect_uds(path).await?;
Self::from_master_client(master_client).await
}
/// Get the model info
#[instrument(skip(self))]
pub async fn info(&mut self) -> Result<ShardInfo> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.info())
.collect();
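// All shards serve the same model, so the last response is representative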
join_all(futures).await.pop().unwrap().map(ShardInfo::from)
}
/// GRPC health check
#[instrument(skip(self))]
pub async fn health(&mut self) -> Result<HealthResponse> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.health())
.collect();
join_all(futures).await.pop().unwrap()
}
/// Clear the past generations cache
#[instrument(skip(self))]
pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| client.clear_cache(batch_id))
.collect();
join_all(futures).await.into_iter().collect()
}
/// Filter a cached batch
#[instrument(skip(self))]
pub async fn filter_batch(
&mut self,
batch_id: u64,
request_ids: Vec<u64>,
) -> Result<Option<CachedBatch>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
.collect();
// all shards return the same message
join_all(futures).await.pop().unwrap()
}
/// Warmup on a max size batch
///
/// Returns the maximum number of tokens supported by the hardware
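///
/// # Example
///
/// A minimal sketch with illustrative limits (real values come from the launcher
/// and router configuration); the smallest value reported across shards is returned:
///
/// ```ignore
/// let max_supported_total_tokens = sharded_client
///     .warmup(1024, 4096, 2048, None)
///     .await?;
/// ```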
#[instrument(skip(self))]
pub async fn warmup(
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
max_total_tokens: u32,
max_batch_size: Option<usize>,
) -> Result<Option<u32>> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| {
Box::pin(client.warmup(
max_input_length,
max_prefill_tokens,
max_total_tokens,
max_batch_size,
))
})
.collect();
// Take the minimum value
let results = join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<Option<u32>>>>()?;
Ok(results.into_iter().flatten().min())
}
/// Generate one token for each request in the given batch
///
/// Returns a Generation for each request in the batch
/// and the next cached batch
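///
/// # Example
///
/// A minimal sketch, assuming `batch` was already built (e.g. by the router's
/// batching logic):
///
/// ```ignore
/// let (generations, next_batch, _timings) = sharded_client.prefill(batch).await?;
/// ```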
#[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
pub async fn prefill(
&mut self,
batch: Batch,
) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.prefill(batch.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
/// Generate one token for each request in the given cached batches
///
/// Returns a Generation for each request in the batches
/// and the next cached batch
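///
/// # Example
///
/// A minimal sketch of a decode loop driven by the cached batch returned by
/// `prefill`; `prefill_next_batch` is an illustrative name:
///
/// ```ignore
/// let mut next_batch = prefill_next_batch;
/// while let Some(batch) = next_batch {
///     let (generations, new_next_batch, _timings) =
///         sharded_client.decode(vec![batch]).await?;
///     // process `generations` here...
///     next_batch = new_next_batch;
/// }
/// ```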
#[instrument(skip_all, fields(size = batches.iter().map(|batch| batch.size).sum::<u32>()))]
pub async fn decode(
&mut self,
batches: Vec<CachedBatch>,
) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
let futures: Vec<_> = self
.clients
.iter_mut()
.map(|client| Box::pin(client.decode(batches.clone())))
.collect();
#[allow(clippy::type_complexity)]
let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
join_all(futures).await.into_iter().collect();
let mut results = results?;
let (mut generations, next_batch, mut timings) =
results.pop().ok_or(ClientError::EmptyResults)?;
// Merge generations from different model shards
for (mut shard_generations, _, shard_timings) in results.into_iter() {
generations.append(&mut shard_generations);
// Return the timings of the slowest shard
if shard_timings.total > timings.total {
timings = shard_timings;
}
}
Ok((generations, next_batch, timings))
}
}
impl From<InfoResponse> for ShardInfo {
fn from(value: InfoResponse) -> Self {
Self {
requires_padding: value.requires_padding,
dtype: value.dtype,
device_type: value.device_type,
window_size: value.window_size,
speculate: value.speculate,
}
}
}
#[async_trait]
impl Health for ShardedClient {
async fn device_health(&self) -> Result<()> {
self.clone().health().await?;
Ok(())
}
async fn model_health(&self) -> Result<()> {
// Dummy batch of 1 token and 1 generated token
let liveness_request = Request {
id: u64::MAX,
inputs: "liveness".to_string(),
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
frequency_penalty: 0.0,
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
stop_sequences: vec![],
ignore_eos_token: false,
}),
top_n_tokens: 0,
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
prefix_len: 0,
adapter_id: None,
};
let batch = Batch {
id: u64::MAX,
requests: vec![liveness_request],
size: 1,
max_tokens: 2,
max_blocks: 1,
};
self.clone().prefill(batch).await?;
Ok(())
}
}