Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 12:54:52 +00:00)

Commit a4b1806557: Fix clippy and fmt.
Parent: 379e1659a9
@@ -6,7 +6,7 @@ use nohash_hasher::IntMap;
 use std::sync::Arc;
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidGenerateRequest;
-use text_generation_router::{FinishReason, PrefillToken, Token, Attention};
+use text_generation_router::{Attention, FinishReason, PrefillToken, Token};
 use tokio::sync::mpsc::error::SendError;
 use tokio::sync::{mpsc, Notify};
 use tokio::time::Instant;
@@ -36,11 +36,17 @@ impl BackendV3 {
         speculate: u32,
     ) -> Self {
         let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
+            attention
+                .parse()
+                .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
         } else {
             Attention::Paged
         };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
+            256
+        } else {
+            16
+        };

         let queue = Queue::new(
             requires_padding,
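Functionally this hunk is a pure reformat: the backend still reads the `ATTENTION` environment variable, panics on an unrecognized value, defaults to paged attention, and picks the KV-cache block size from the selected kind. Below is a minimal, self-contained sketch of that selection logic; the local `Attention` enum is only a stand-in for `text_generation_router::Attention`.

use std::str::FromStr;

// Local stand-in for text_generation_router::Attention, just for this sketch.
#[derive(PartialEq)]
enum Attention {
    Paged,
    FlashDecoding,
    FlashInfer,
}

impl FromStr for Attention {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "paged" => Ok(Attention::Paged),
            "flashdecoding" => Ok(Attention::FlashDecoding),
            "flashinfer" => Ok(Attention::FlashInfer),
            _ => Err(format!("Cannot parse attention value `{s}`")),
        }
    }
}

fn main() {
    // Same logic as the hunk above: read ATTENTION, panic on an invalid value,
    // and fall back to paged attention when the variable is unset.
    let attention = if let Ok(attention) = std::env::var("ATTENTION") {
        attention
            .parse()
            .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
    } else {
        Attention::Paged
    };

    // 256-token KV-cache blocks for flashdecoding, 16 otherwise, as in the diff.
    let block_size = if attention == Attention::FlashDecoding {
        256
    } else {
        16
    };
    println!("block_size = {block_size}");
}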
@@ -2080,4 +2080,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
 }
@@ -9,7 +9,7 @@ We recommend using the official quantization scripts for creating your quants:
 2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
 3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

 For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.

 ## Quantization with bitsandbytes

@@ -69,4 +69,4 @@ text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-
 You can learn more about the quantization options by running `text-generation-server quantize --help`.

 If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
 You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
@@ -1,11 +1,10 @@
 /// Batching and inference logic
 use crate::infer::v2::queue::{Entry, Queue};
 use crate::infer::{
-    Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
-    Attention,
+    Attention, Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
 };
 use crate::validation::ValidGenerateRequest;
-use crate::{FinishReason, PrefillToken, Token, Attention};
+use crate::{Attention, FinishReason, PrefillToken, Token};
 use nohash_hasher::IntMap;
 use std::sync::{
     atomic::{AtomicBool, Ordering},
@@ -42,11 +41,17 @@ impl BackendV2 {
     ) -> Self {
         // Infer shared state
        let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().expect(&format!("Invalid attention was specified :`{attention}`"))
+            attention
+                .parse()
+                .expect(&format!("Invalid attention was specified :`{attention}`"))
         } else {
             Attention::Paged
         };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
+            256
+        } else {
+            16
+        };
         let queue = Queue::new(requires_padding, block_size, window_size, speculate);
         let batching_task_notifier = Arc::new(Notify::new());

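One difference between the two backends stands out: the v3 hunk uses `unwrap_or_else(|_| panic!(...))` while this v2 hunk keeps `.expect(&format!(...))`. The failure behaviour is the same, but the closure form only builds the panic message when parsing actually fails, whereas `expect(&format!(...))` formats the String on every call (the pattern clippy's `expect_fun_call` lint targets). A small illustration of the two equivalent forms, using `u32` as the parsed type:

fn main() {
    let raw = "42";

    // Eager: the message String is allocated even though parsing succeeds.
    let a: u32 = raw
        .parse()
        .expect(&format!("Invalid value was specified :`{raw}`"));

    // Lazy: the message is only formatted if parse() returns Err.
    let b: u32 = raw
        .parse()
        .unwrap_or_else(|_| panic!("Invalid value was specified :`{raw}`"));

    assert_eq!(a, b);
}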
@@ -16,7 +16,7 @@ use utoipa::ToSchema;
 use validation::Validation;

 #[derive(PartialEq)]
-pub enum Attention{
+pub enum Attention {
     Paged,
     FlashDecoding,
     FlashInfer,
@@ -25,21 +25,21 @@ pub enum Attention{
 #[derive(Debug)]
 pub struct ParseError;

-impl std::fmt::Display for ParseError{
+impl std::fmt::Display for ParseError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "Cannot parse attention value")
     }
 }
-impl std::error::Error for ParseError{}
+impl std::error::Error for ParseError {}

-impl std::str::FromStr for Attention{
+impl std::str::FromStr for Attention {
     type Err = ParseError;
-    fn from_str(s: &str) -> Result<Self, Self::Err>{
-        match s{
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
             "paged" => Ok(Attention::Paged),
             "flashdecoding" => Ok(Attention::FlashDecoding),
             "flashinfer" => Ok(Attention::FlashInfer),
-            _ => Err(ParseError)
+            _ => Err(ParseError),
         }
     }
 }
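With the `FromStr` impl above, an attention kind can be selected with `str::parse`. A quick usage check, assuming the `Attention` and `ParseError` definitions from this hunk are in scope:

fn main() {
    // The three supported values map to the corresponding variants.
    assert!("paged".parse::<Attention>().unwrap() == Attention::Paged);
    assert!("flashdecoding".parse::<Attention>().unwrap() == Attention::FlashDecoding);
    assert!("flashinfer".parse::<Attention>().unwrap() == Attention::FlashInfer);

    // Anything else is rejected with ParseError ("Cannot parse attention value").
    assert!("flash".parse::<Attention>().is_err());

    // Matching is exact, so uppercase spellings are rejected too.
    assert!("PAGED".parse::<Attention>().is_err());
}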
@@ -6,8 +6,10 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master

 ATTENTION = os.getenv("ATTENTION", "paged")
 _expected = {"paged", "flashdecoding", "flashinfer"}
-assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
+assert (
+    ATTENTION in _expected
+), f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None