Fix clippy lints and formatting.

2025-09-12 12:54:52 +00:00 · 2024-08-09 14:54:52 +02:00 · 2024-08-09 14:54:52 +02:00 · a4b1806557
commit a4b1806557
parent 379e1659a9
6 changed files with 33 additions and 20 deletions
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@ -6,7 +6,7 @@ use nohash_hasher::IntMap;
 use std::sync::Arc;
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidGenerateRequest;
-use text_generation_router::{FinishReason, PrefillToken, Token, Attention};
+use text_generation_router::{Attention, FinishReason, PrefillToken, Token};
 use tokio::sync::mpsc::error::SendError;
 use tokio::sync::{mpsc, Notify};
 use tokio::time::Instant;
@ -36,11 +36,17 @@ impl BackendV3 {
        speculate: u32,
    ) -> Self {
        let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
+            attention
                .parse()
                .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
        } else {
            Attention::Paged
        };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
            256
        } else {
            16
        };
        let queue = Queue::new(
            requires_padding,
--- a/router/src/infer/v2/scheduler.rs
+++ b/router/src/infer/v2/scheduler.rs
@ -1,11 +1,10 @@
 /// Batching and inference logic
 use crate::infer::v2::queue::{Entry, Queue};
 use crate::infer::{
-    Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
+    Attention, Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
    Attention,
 };
 use crate::validation::ValidGenerateRequest;
-use crate::{FinishReason, PrefillToken, Token, Attention};
+use crate::{Attention, FinishReason, PrefillToken, Token};
 use nohash_hasher::IntMap;
 use std::sync::{
    atomic::{AtomicBool, Ordering},
@ -42,11 +41,17 @@ impl BackendV2 {
    ) -> Self {
        // Infer shared state
        let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().expect(&format!("Invalid attention was specified :`{attention}`"))
+            attention
                .parse()
                .expect(&format!("Invalid attention was specified :`{attention}`"))
        } else {
            Attention::Paged
        };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
            256
        } else {
            16
        };
        let queue = Queue::new(requires_padding, block_size, window_size, speculate);
        let batching_task_notifier = Arc::new(Notify::new());
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -16,7 +16,7 @@ use utoipa::ToSchema;
 use validation::Validation;
 #[derive(PartialEq)]
-pub enum Attention{
+pub enum Attention {
    Paged,
    FlashDecoding,
    FlashInfer,
@ -25,21 +25,21 @@ pub enum Attention{
 #[derive(Debug)]
 pub struct ParseError;
-impl std::fmt::Display for ParseError{
+impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Cannot parse attention value")
    }
 }
-impl std::error::Error for ParseError{}
+impl std::error::Error for ParseError {}
-impl std::str::FromStr for Attention{
+impl std::str::FromStr for Attention {
    type Err = ParseError;
-    fn from_str(s: &str) -> Result<Self, Self::Err>{
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s{
+        match s {
            "paged" => Ok(Attention::Paged),
            "flashdecoding" => Ok(Attention::FlashDecoding),
            "flashinfer" => Ok(Attention::FlashInfer),
-            _ => Err(ParseError)
+            _ => Err(ParseError),
        }
    }
 }
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@ -6,8 +6,10 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master
 ATTENTION = os.getenv("ATTENTION", "paged")
-_expected  = {"paged", "flashdecoding", "flashinfer"}
+_expected = {"paged", "flashdecoding", "flashinfer"}
-assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
+assert (
    ATTENTION in _expected
 ), f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None