Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 12:54:52 +00:00)

Commit a4b1806557: Fix clippy and fmt.
Parent: 379e1659a9
@@ -6,7 +6,7 @@ use nohash_hasher::IntMap;
 use std::sync::Arc;
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidGenerateRequest;
-use text_generation_router::{FinishReason, PrefillToken, Token, Attention};
+use text_generation_router::{Attention, FinishReason, PrefillToken, Token};
 use tokio::sync::mpsc::error::SendError;
 use tokio::sync::{mpsc, Notify};
 use tokio::time::Instant;
@@ -36,11 +36,17 @@ impl BackendV3 {
         speculate: u32,
     ) -> Self {
         let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
+            attention
+                .parse()
+                .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
         } else {
             Attention::Paged
         };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
+            256
+        } else {
+            16
+        };

         let queue = Queue::new(
             requires_padding,
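Functionally this hunk is a pure reformat: the backend still reads the `ATTENTION` environment variable, panics on an unrecognized value, defaults to paged attention, and picks the KV-cache block size from the selected kind. Below is a minimal, self-contained sketch of that selection logic; the local `Attention` enum is only a stand-in for `text_generation_router::Attention`.

use std::str::FromStr;

// Local stand-in for text_generation_router::Attention, just for this sketch.
#[derive(PartialEq)]
enum Attention {
    Paged,
    FlashDecoding,
    FlashInfer,
}

impl FromStr for Attention {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "paged" => Ok(Attention::Paged),
            "flashdecoding" => Ok(Attention::FlashDecoding),
            "flashinfer" => Ok(Attention::FlashInfer),
            _ => Err(format!("Cannot parse attention value `{s}`")),
        }
    }
}

fn main() {
    // Same logic as the hunk above: read ATTENTION, panic on an invalid value,
    // and fall back to paged attention when the variable is unset.
    let attention = if let Ok(attention) = std::env::var("ATTENTION") {
        attention
            .parse()
            .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`"))
    } else {
        Attention::Paged
    };

    // 256-token KV-cache blocks for flashdecoding, 16 otherwise, as in the diff.
    let block_size = if attention == Attention::FlashDecoding {
        256
    } else {
        16
    };
    println!("block_size = {block_size}");
}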
@@ -2080,4 +2080,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
 }
@@ -9,7 +9,7 @@ We recommend using the official quantization scripts for creating your quants:
 2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
 3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

 For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.

 ## Quantization with bitsandbytes

@@ -69,4 +69,4 @@ text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-
 You can learn more about the quantization options by running `text-generation-server quantize --help`.

 If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
 You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
@@ -1,11 +1,10 @@
 /// Batching and inference logic
 use crate::infer::v2::queue::{Entry, Queue};
 use crate::infer::{
-    Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
-    Attention,
+    Attention, Backend, GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse,
 };
 use crate::validation::ValidGenerateRequest;
-use crate::{FinishReason, PrefillToken, Token, Attention};
+use crate::{Attention, FinishReason, PrefillToken, Token};
 use nohash_hasher::IntMap;
 use std::sync::{
     atomic::{AtomicBool, Ordering},
@@ -42,11 +41,17 @@ impl BackendV2 {
     ) -> Self {
         // Infer shared state
        let attention = if let Ok(attention) = std::env::var("ATTENTION") {
-            attention.parse().expect(&format!("Invalid attention was specified :`{attention}`"))
+            attention
+                .parse()
+                .expect(&format!("Invalid attention was specified :`{attention}`"))
         } else {
             Attention::Paged
         };
-        let block_size = if attention == Attention::FlashDecoding { 256 } else { 16 };
+        let block_size = if attention == Attention::FlashDecoding {
+            256
+        } else {
+            16
+        };
         let queue = Queue::new(requires_padding, block_size, window_size, speculate);
         let batching_task_notifier = Arc::new(Notify::new());

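One difference between the two backends stands out: the v3 hunk uses `unwrap_or_else(|_| panic!(...))` while this v2 hunk keeps `.expect(&format!(...))`. The failure behaviour is the same, but the closure form only builds the panic message when parsing actually fails, whereas `expect(&format!(...))` formats the String on every call (the pattern clippy's `expect_fun_call` lint targets). A small illustration of the two equivalent forms, using `u32` as the parsed type:

fn main() {
    let raw = "42";

    // Eager: the message String is allocated even though parsing succeeds.
    let a: u32 = raw
        .parse()
        .expect(&format!("Invalid value was specified :`{raw}`"));

    // Lazy: the message is only formatted if parse() returns Err.
    let b: u32 = raw
        .parse()
        .unwrap_or_else(|_| panic!("Invalid value was specified :`{raw}`"));

    assert_eq!(a, b);
}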
@@ -16,7 +16,7 @@ use utoipa::ToSchema;
 use validation::Validation;

 #[derive(PartialEq)]
-pub enum Attention{
+pub enum Attention {
     Paged,
     FlashDecoding,
     FlashInfer,
@@ -25,21 +25,21 @@ pub enum Attention{
 #[derive(Debug)]
 pub struct ParseError;

-impl std::fmt::Display for ParseError{
+impl std::fmt::Display for ParseError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "Cannot parse attention value")
     }
 }
-impl std::error::Error for ParseError{}
+impl std::error::Error for ParseError {}

-impl std::str::FromStr for Attention{
+impl std::str::FromStr for Attention {
     type Err = ParseError;
-    fn from_str(s: &str) -> Result<Self, Self::Err>{
-        match s{
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
             "paged" => Ok(Attention::Paged),
             "flashdecoding" => Ok(Attention::FlashDecoding),
             "flashinfer" => Ok(Attention::FlashInfer),
-            _ => Err(ParseError)
+            _ => Err(ParseError),
         }
     }
 }
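With the `FromStr` impl above, an attention kind can be selected with `str::parse`. A quick usage check, assuming the `Attention` and `ParseError` definitions from this hunk are in scope:

fn main() {
    // The three supported values map to the corresponding variants.
    assert!("paged".parse::<Attention>().unwrap() == Attention::Paged);
    assert!("flashdecoding".parse::<Attention>().unwrap() == Attention::FlashDecoding);
    assert!("flashinfer".parse::<Attention>().unwrap() == Attention::FlashInfer);

    // Anything else is rejected with ParseError ("Cannot parse attention value").
    assert!("flash".parse::<Attention>().is_err());

    // Matching is exact, so uppercase spellings are rejected too.
    assert!("PAGED".parse::<Attention>().is_err());
}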
@@ -6,8 +6,10 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master

 ATTENTION = os.getenv("ATTENTION", "paged")
 _expected = {"paged", "flashdecoding", "flashinfer"}
-assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
+assert (
+    ATTENTION in _expected
+), f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None