Much simpler solution.

2025-09-11 12:24:53 +00:00 · 2024-12-10 18:55:21 +01:00 · 2024-12-10 18:55:21 +01:00 · e47249a3ea
commit e47249a3ea
parent 45a86d5cf0
7 changed files with 25 additions and 44 deletions
--- a/backends/v2/src/backend.rs
+++ b/backends/v2/src/backend.rs
@ -104,6 +104,10 @@ impl Backend for BackendV2 {
        }
        .is_ok()
    }
+
+    fn start_health(&self) -> bool {
+        true
+    }
 }

 /// Batching logic
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@ -111,6 +111,10 @@ impl Backend for BackendV3 {
        }
        .is_ok()
    }
+
+    fn start_health(&self) -> bool {
+        true
+    }
 }

 /// Batching logic
--- a/backends/v3/src/block_allocator.rs
+++ b/backends/v3/src/block_allocator.rs
@ -147,8 +147,8 @@ impl SimpleAllocator {
    fn new(blocks: u32, block_size: u32, window_size: Option<u32>) -> Self {
        SimpleAllocator {
            block_size,
-            // XXX: Block 0&1 is reserved for health checks
-            free_blocks: (2..blocks).collect(),
+            // Block 0 is reserved for health checks
+            free_blocks: (1..blocks).collect(),
            window_size,
        }
    }
--- a/backends/v3/src/client/mod.rs
+++ b/backends/v3/src/client/mod.rs
@ -37,8 +37,6 @@ pub enum ClientError {
    Generation(String),
    #[error("Sharded results are empty")]
    EmptyResults,
-    #[error("Invalid attention {0}")]
-    InvalidAttention(String),
 }

 impl From<Status> for ClientError {
--- a/backends/v3/src/client/sharded_client.rs
+++ b/backends/v3/src/client/sharded_client.rs
@ -13,38 +13,15 @@ use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;

-#[derive(Debug, Clone, Copy)]
-pub enum Attn {
-    Flashdecoding,
-    Flashinfer,
-    Paged,
-}
-
-impl TryFrom<&str> for Attn {
-    type Error = ClientError;
-    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
-        match value {
-            "flashdecoding" => Ok(Attn::Flashdecoding),
-            "flashinfer" => Ok(Attn::Flashinfer),
-            "paged" => Ok(Attn::Paged),
-            string => Err(ClientError::InvalidAttention(string.to_string())),
-        }
-    }
-}
-
 #[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
 pub struct ShardedClient {
    clients: Vec<Client>,
-    attention_impl: Option<Attn>,
 }

 impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
-        Self {
-            clients,
-            attention_impl: None,
-        }
+        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
@ -78,9 +55,7 @@ impl ShardedClient {
            .iter_mut()
            .map(|client| client.info())
            .collect();
-        let info = join_all(futures).await.pop().unwrap()?;
-        self.attention_impl = Some((&*info.attention_impl).try_into()?);
-        Ok(info)
+        join_all(futures).await.pop().unwrap()
    }

    /// GRPC health check
@ -236,20 +211,14 @@ impl Health for ShardedClient {

    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
-        //
-        let (blocks, slots) = match self.attention_impl.expect("Attention to be set") {
-            Attn::Paged => (vec![0], (0..2).collect()),
-            Attn::Flashinfer => (vec![0, 1], (0..2).collect()),
-            Attn::Flashdecoding => (vec![0], (0..2).collect()),
-        };
        let liveness_request = Request {
            id: u64::MAX,
            inputs: "liveness".to_string(),
            input_chunks: Some(Input {
                chunks: vec![Chunk::Text("liveness".into()).into()],
            }),
-            truncate: 10,
-            add_special_tokens: true,
+            truncate: 1,
+            add_special_tokens: false,
            prefill_logprobs: false,
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
@ -270,8 +239,9 @@ impl Health for ShardedClient {
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
-            blocks,
-            slots,
+            // Block 0 is reserved for health checks
+            blocks: vec![0],
+            slots: vec![0],
            cache_len: 0,
            adapter_id: None,
            chunk_len: None,
--- a/backends/v3/src/radix.rs
+++ b/backends/v3/src/radix.rs
@ -42,8 +42,8 @@ impl RadixAllocator {
            allocations: HashMap::new(),
            cache_blocks: RadixTrie::new(block_size as usize),

-            // XXX: Block 0,1 is reserved for health checks.
-            free_blocks: (2..n_blocks).collect(),
+            // Block 0 is reserved for health checks.
+            free_blocks: (1..n_blocks).collect(),
            window_size,
            block_size,
        }
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@ -33,6 +33,11 @@ pub trait Backend {
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError>;

    async fn health(&self, current_health: bool) -> bool;
+
+    /// The state of the health on startup
+    /// Typically false, or true if the backend includes
+    /// a warmup phase.
+    fn start_health(&self) -> bool;
 }

 /// Inference struct
@ -75,7 +80,7 @@ impl Infer {
        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));

        // Backend health
-        let backend_health = Arc::new(AtomicBool::new(false));
+        let backend_health = Arc::new(AtomicBool::new(backend.start_health()));

        Self {
            validation,