Using flash decoding

Conditional flashdecoding.

Fix max_q.

Working kvcache

Working version with flash decoding.

Make it work for mistral.

Fix after rebase.

Less intrusive.

Revert changes in modeling.

Speedup flashdecoding.

Hack to make other models work.

Fixing non flash decoding llama path.

Router logic knows about page size.

Missing 2 models.

Missing cohere.

Fixing cohere flash decoding.

Revamped all this architecture.

Fix cohere.

Fixing falcon.

Enabling custom block size schedule.

Update router/src/infer.rs

Not sending preallocated output.
Nicolas Patry 2024-05-17 08:43:33 +00:00
parent d0225b1015
commit 4293a12863
23 changed files with 1010 additions and 64 deletions

View File

@@ -0,0 +1,647 @@
// This file is @generated by prost-build.
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthResponse {}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoResponse {
#[prost(bool, tag = "1")]
pub requires_padding: bool,
#[prost(string, tag = "2")]
pub dtype: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
pub device_type: ::prost::alloc::string::String,
#[prost(uint32, optional, tag = "4")]
pub window_size: ::core::option::Option<u32>,
#[prost(uint32, tag = "5")]
pub speculate: u32,
}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryResponse {
/// / Other shards urls
#[prost(string, repeated, tag = "1")]
pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheRequest {
/// / Optional batch id
#[prost(uint64, optional, tag = "1")]
pub id: ::core::option::Option<u64>,
}
/// / Empty response
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheResponse {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct NextTokenChooserParameters {
/// / exponential scaling output probability distribution
#[prost(float, tag = "1")]
pub temperature: f32,
/// / restricting to the k highest probability elements
#[prost(uint32, tag = "2")]
pub top_k: u32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "3")]
pub top_p: f32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "4")]
pub typical_p: f32,
/// / apply sampling on the logits
#[prost(bool, tag = "5")]
pub do_sample: bool,
/// / random seed for sampling
#[prost(uint64, tag = "6")]
pub seed: u64,
/// / repetition penalty
#[prost(float, tag = "7")]
pub repetition_penalty: f32,
/// / frequency penalty
#[prost(float, tag = "9")]
pub frequency_penalty: f32,
/// / token watermarking using "A Watermark for Large Language Models"
#[prost(bool, tag = "8")]
pub watermark: bool,
/// / grammar (applied if not empty)
#[prost(string, tag = "10")]
pub grammar: ::prost::alloc::string::String,
/// / grammar type
#[prost(enumeration = "GrammarType", tag = "11")]
pub grammar_type: i32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StoppingCriteriaParameters {
/// / Maximum number of generated tokens
#[prost(uint32, tag = "1")]
pub max_new_tokens: u32,
/// / Optional stopping sequences
#[prost(string, repeated, tag = "2")]
pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / Ignore end of sequence token
/// / used for benchmarking
#[prost(bool, tag = "3")]
pub ignore_eos_token: bool,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Request {
/// / Request ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / The generation context
#[prost(string, tag = "2")]
pub inputs: ::prost::alloc::string::String,
/// / Context truncation
#[prost(uint32, tag = "3")]
pub truncate: u32,
/// / Next Token Chooser Parameters
#[prost(message, optional, tag = "4")]
pub parameters: ::core::option::Option<NextTokenChooserParameters>,
/// / Stopping Criteria Parameters
#[prost(message, optional, tag = "5")]
pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
/// / Return prefill logprobs
#[prost(bool, tag = "6")]
pub prefill_logprobs: bool,
/// / Return most likely n tokens
#[prost(uint32, tag = "7")]
pub top_n_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Batch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests
#[prost(message, repeated, tag = "2")]
pub requests: ::prost::alloc::vec::Vec<Request>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct CachedBatch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests ids
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GeneratedText {
/// / Output
#[prost(string, tag = "1")]
pub text: ::prost::alloc::string::String,
/// / Number of generated tokens
#[prost(uint32, tag = "2")]
pub generated_tokens: u32,
/// / Finish reason
#[prost(enumeration = "FinishReason", tag = "3")]
pub finish_reason: i32,
/// / Seed
#[prost(uint64, optional, tag = "4")]
pub seed: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Tokens {
/// / Token IDs
#[prost(uint32, repeated, tag = "1")]
pub ids: ::prost::alloc::vec::Vec<u32>,
/// / Logprobs
#[prost(float, repeated, tag = "2")]
pub logprobs: ::prost::alloc::vec::Vec<f32>,
/// / tokens
#[prost(string, repeated, tag = "3")]
pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / special
#[prost(bool, repeated, tag = "4")]
pub is_special: ::prost::alloc::vec::Vec<bool>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Generation {
/// / Request ID
#[prost(uint64, tag = "1")]
pub request_id: u64,
/// / Prefill tokens (optional)
#[prost(message, optional, tag = "2")]
pub prefill_tokens: ::core::option::Option<Tokens>,
#[prost(message, optional, tag = "3")]
pub tokens: ::core::option::Option<Tokens>,
/// / Complete generated text
#[prost(message, optional, tag = "4")]
pub generated_text: ::core::option::Option<GeneratedText>,
/// / Top tokens
#[prost(message, repeated, tag = "5")]
pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchRequest {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub batch_id: u64,
/// / Requests to keep
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchResponse {
/// / Filtered Batch (cached)
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillRequest {
/// / Batch
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillResponse {
/// / Generation
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeRequest {
/// / Cached batches
#[prost(message, repeated, tag = "1")]
pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeResponse {
/// / Decodes
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
/// / Concatenate elapsed time in nanoseconds
#[prost(uint64, optional, tag = "6")]
pub concat_ns: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupRequest {
/// / Batch to warmup on
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
#[prost(uint32, tag = "2")]
pub max_input_length: u32,
#[prost(uint32, tag = "3")]
pub max_prefill_tokens: u32,
#[prost(uint32, tag = "4")]
pub max_total_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupResponse {
/// / Maximum number of tokens supported by the model
#[prost(uint32, optional, tag = "1")]
pub max_supported_total_tokens: ::core::option::Option<u32>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum GrammarType {
None = 0,
Json = 1,
Regex = 2,
}
impl GrammarType {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
GrammarType::None => "GRAMMAR_TYPE_NONE",
GrammarType::Json => "GRAMMAR_TYPE_JSON",
GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"GRAMMAR_TYPE_NONE" => Some(Self::None),
"GRAMMAR_TYPE_JSON" => Some(Self::Json),
"GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
_ => None,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum FinishReason {
Length = 0,
EosToken = 1,
StopSequence = 2,
}
impl FinishReason {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
FinishReason::Length => "FINISH_REASON_LENGTH",
FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"FINISH_REASON_LENGTH" => Some(Self::Length),
"FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
"FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
_ => None,
}
}
}
/// Generated client implementations.
pub mod text_generation_service_client {
#![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
use tonic::codegen::*;
use tonic::codegen::http::Uri;
#[derive(Debug, Clone)]
pub struct TextGenerationServiceClient<T> {
inner: tonic::client::Grpc<T>,
}
impl TextGenerationServiceClient<tonic::transport::Channel> {
/// Attempt to create a new client by connecting to a given endpoint.
pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
where
D: TryInto<tonic::transport::Endpoint>,
D::Error: Into<StdError>,
{
let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
Ok(Self::new(conn))
}
}
impl<T> TextGenerationServiceClient<T>
where
T: tonic::client::GrpcService<tonic::body::BoxBody>,
T::Error: Into<StdError>,
T::ResponseBody: Body<Data = Bytes> + Send + 'static,
<T::ResponseBody as Body>::Error: Into<StdError> + Send,
{
pub fn new(inner: T) -> Self {
let inner = tonic::client::Grpc::new(inner);
Self { inner }
}
pub fn with_origin(inner: T, origin: Uri) -> Self {
let inner = tonic::client::Grpc::with_origin(inner, origin);
Self { inner }
}
pub fn with_interceptor<F>(
inner: T,
interceptor: F,
) -> TextGenerationServiceClient<InterceptedService<T, F>>
where
F: tonic::service::Interceptor,
T::ResponseBody: Default,
T: tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
Response = http::Response<
<T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
>,
>,
<T as tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
>>::Error: Into<StdError> + Send + Sync,
{
TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
}
/// Compress requests with the given encoding.
///
/// This requires the server to support it otherwise it might respond with an
/// error.
#[must_use]
pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.send_compressed(encoding);
self
}
/// Enable decompressing responses.
#[must_use]
pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.accept_compressed(encoding);
self
}
/// Limits the maximum size of a decoded message.
///
/// Default: `4MB`
#[must_use]
pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_decoding_message_size(limit);
self
}
/// Limits the maximum size of an encoded message.
///
/// Default: `usize::MAX`
#[must_use]
pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
/// / Model Info
pub async fn info(
&mut self,
request: impl tonic::IntoRequest<super::InfoRequest>,
) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Info",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info"));
self.inner.unary(req, path, codec).await
}
/// / Service discovery
pub async fn service_discovery(
&mut self,
request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
) -> std::result::Result<
tonic::Response<super::ServiceDiscoveryResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ServiceDiscovery",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new(
"generate.v2.TextGenerationService",
"ServiceDiscovery",
),
);
self.inner.unary(req, path, codec).await
}
/// / Empties batch cache
pub async fn clear_cache(
&mut self,
request: impl tonic::IntoRequest<super::ClearCacheRequest>,
) -> std::result::Result<
tonic::Response<super::ClearCacheResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ClearCache",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "ClearCache"),
);
self.inner.unary(req, path, codec).await
}
/// / Remove requests from a cached batch
pub async fn filter_batch(
&mut self,
request: impl tonic::IntoRequest<super::FilterBatchRequest>,
) -> std::result::Result<
tonic::Response<super::FilterBatchResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/FilterBatch",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "FilterBatch"),
);
self.inner.unary(req, path, codec).await
}
/// / Warmup the model and compute max cache size
pub async fn warmup(
&mut self,
request: impl tonic::IntoRequest<super::WarmupRequest>,
) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Warmup",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Warmup"));
self.inner.unary(req, path, codec).await
}
/// / Prefill batch and decode first token
pub async fn prefill(
&mut self,
request: impl tonic::IntoRequest<super::PrefillRequest>,
) -> std::result::Result<
tonic::Response<super::PrefillResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Prefill",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Prefill"));
self.inner.unary(req, path, codec).await
}
/// / Decode token for a list of prefilled batches
pub async fn decode(
&mut self,
request: impl tonic::IntoRequest<super::DecodeRequest>,
) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Decode",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Decode"));
self.inner.unary(req, path, codec).await
}
/// / Health check
pub async fn health(
&mut self,
request: impl tonic::IntoRequest<super::HealthRequest>,
) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Health",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Health"));
self.inner.unary(req, path, codec).await
}
}
}

View File

@@ -0,0 +1,6 @@
// This file is @generated by prost-build.
pub mod generate {
pub mod v2 {
include!("generate.v2.rs");
}
}

View File

@@ -39,8 +39,25 @@ impl SchedulerV2 {
         speculate: u32,
         generation_health: Arc<AtomicBool>,
     ) -> Self {
+<<<<<<< HEAD:router/src/infer/v2/scheduler.rs
         let queue = Queue::new(requires_padding, 16, window_size, speculate);
         let batching_task_notifier = Arc::new(Notify::new());
+=======
+        // Infer shared state
+        let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
+            matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
+        } else {
+            false
+        };
+        let block_size = if flashdecoding { 256 } else { 16 };
+        let block_size = std::env::var("BLOCK_SIZE")
+            .map(|b| b.parse().unwrap_or(block_size))
+            .unwrap_or(block_size);
+        let queue = Queue::new(requires_padding, block_size, window_size, speculate);
+        let shared = Arc::new(Shared {
+            batching_task: Notify::new(),
+        });
+>>>>>>> Using flash decoding:router/src/infer.rs
         // Spawn batching background task that contains all the inference logic
         tokio::spawn(batching_task(

View File

@@ -1,5 +1,6 @@
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
 
 major, minor = torch.cuda.get_device_capability()
 is_sm75 = major == 7 and minor == 5
@@ -21,7 +22,14 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
+    if FLASH_DECODING:
+        shape = key_cache.shape
+        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+    else:
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
 
 
 def paged_attention(
@@ -32,7 +40,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
@@ -56,64 +65,100 @@ def paged_attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    input_lengths = cu_seqlen_k
 
     # NOTE(woosuk): We use a simple heuristic to decide whether to use
     # PagedAttention V1 or V2. If the number of partitions is 1, we use
     # V1 to avoid the overhead of reduction. Also, if the number of
     # sequences or heads is large, we use V1 since there is enough work
     # to parallelize.
-    from vllm._C import ops
-
-    use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
-    if use_v1:
-        ops.paged_attention_v1(
-            out,
+    if FLASH_DECODING:
+        max_q = 1
+        max_k = max_s
+        import flash_attn_2_cuda
+
+        # TODO fixme when flash contains the fix.
+        # Number of splits is not correctly handled
+        # by the current path
+        # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577
+        # This fails becuase we're using causal, therefore window_right is set to 0 and the split logic is never applied.
+        out2 = flash_attn_2_cuda.varlen_fwd(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
-            softmax_scale,
-            block_tables,
-            input_lengths,
-            block_size,
-            max_s,
             None,
-            "auto",
-            1.0,
+            cu_seqlen_k,
+            cu_seqlen_k,
+            None,
+            block_tables,
+            None,
+            max_q,
+            max_k,
+            0.0,  # dropout
+            softmax_scale,
+            False,  # zero_tensors
+            True,  # causal
+            -1,  # Window_left
+            -1,  # Window right
+            False,  # return softmax
+            None,  # generator
         )
+        return out2[0]
     else:
-        # Run PagedAttention V2.
-        assert _PARTITION_SIZE % block_size == 0
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions, head_size),
-            dtype=out.dtype,
-            device=out.device,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions),
-            dtype=torch.float32,
-            device=out.device,
-        )
-        max_logits = torch.empty_like(exp_sums)
-        ops.paged_attention_v2(
-            out,
-            exp_sums,
-            max_logits,
-            tmp_output,
-            query,
-            key_cache,
-            value_cache,
-            kv_head_mapping,
-            softmax_scale,
-            block_tables,
-            input_lengths,
-            block_size,
-            max_s,
-            None,
-            "auto",
-            1.0,
-        )
+        from vllm._C import ops
+
+        use_v1 = max_s <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512
+        )
+        if use_v1:
+            ops.paged_attention_v1(
+                out,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
+        else:
+            # Run PagedAttention V2.
+            assert _PARTITION_SIZE % block_size == 0
+            tmp_output = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions, head_size),
+                dtype=out.dtype,
+                device=out.device,
+            )
+            exp_sums = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions),
+                dtype=torch.float32,
+                device=out.device,
+            )
+            max_logits = torch.empty_like(exp_sums)
+            ops.paged_attention_v2(
+                out,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
 
 
 try:
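A minimal sketch (not part of the diff) of the new FLASH_DECODING write path in reshape_and_cache above: the (num_blocks, block_size, num_heads, head_size) cache is viewed as one flat slot dimension and indexed with global slot ids. Shapes are illustrative assumptions, runnable on CPU:

import torch

num_blocks, block_size, num_heads, head_size = 4, 256, 8, 64
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size)

# Three new tokens land in global slots 0, 1 and 256 (block 1, offset 0).
slots = torch.tensor([0, 1, 256])
key = torch.randn(len(slots), num_heads, head_size)

# Same indexing trick as the FLASH_DECODING branch of reshape_and_cache.
shape = key_cache.shape
key_cache.view(-1, shape[-2], shape[-1])[slots] = key

assert torch.equal(key_cache[1, 0], key[2])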

View File

@@ -55,7 +55,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
@@ -66,7 +67,7 @@ def paged_attention(
         kv_head_mapping,
         softmax_scale,
         block_tables,
-        input_lengths,
+        cu_seqlen_q,
         BLOCK_SIZE,
         max_s,
         None,

View File

@@ -1,6 +1,7 @@
 import os
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
 from loguru import logger
 
 major, minor = torch.cuda.get_device_capability()
@@ -26,7 +27,14 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
+    if FLASH_DECODING:
+        shape = key_cache.shape
+        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+    else:
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
 
 
 def paged_attention(
@@ -37,7 +45,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
@@ -61,6 +70,7 @@ def paged_attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    input_lengths = cu_seqlen_k
 
     # NOTE(woosuk): We use a simple heuristic to decide whether to use
     # PagedAttention V1 or V2. If the number of partitions is 1, we use

View File

@@ -0,0 +1,158 @@
import math

import torch

from typing import Optional, List, Tuple
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.models.globals import FLASH_DECODING

BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
# Will be set in warmup
CACHE_MANAGER: Optional["CacheManager"] = None


class CacheManager:
    def __init__(
        self,
        num_blocks: int,
        num_layers: int,
        num_heads: int,
        head_size: int,
        repeat_slots: bool,
        dtype: torch.dtype,
        device: torch.device,
    ):
        self.block_size = BLOCK_SIZE
        self.num_blocks = num_blocks
        self.repeat_slots = repeat_slots

        element_size = torch.tensor([], dtype=dtype).element_size()
        if SYSTEM == "xpu":
            x = 1
        else:
            x = self.block_size // element_size

        if FLASH_DECODING:
            self.kv_cache = [
                (
                    torch.empty(
                        (num_blocks, self.block_size, num_heads, head_size),
                        dtype=dtype,
                        device=device,
                    ),
                    torch.empty(
                        (num_blocks, self.block_size, num_heads, head_size),
                        dtype=dtype,
                        device=device,
                    ),
                )
                for _ in range(num_layers)
            ]
        else:
            self.kv_cache = [
                (
                    torch.empty(
                        (num_blocks, num_heads, head_size // x, self.block_size, x),
                        dtype=dtype,
                        device=device,
                    ),
                    torch.empty(
                        (num_blocks, num_heads, head_size, self.block_size),
                        dtype=dtype,
                        device=device,
                    ),
                )
                for _ in range(num_layers)
            ]
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
        self.slots = torch.arange(
            0, num_blocks * self.block_size, dtype=torch.int64
        ).view(num_blocks, self.block_size)

    def allocate(
        self,
        needed_blocks_slots: List[Tuple[int, int]],
        blocks: int,
        max_blocks: int,
        device: torch.device,
    ):
        # Get free blocks indices by finding values in mask that are not set to 0
        free_block_indices = self.free_block_mask.nonzero()
        if blocks > len(free_block_indices):
            raise RuntimeError(
                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
            )

        # Slice by the number of required blocks
        block_indices = free_block_indices[:blocks]
        block_indices = block_indices.flatten()

        # Padded block tables
        block_tables_tensor = torch.zeros(
            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
        )

        # Allocate paged attention blocks
        cumulative_blocks = 0
        slots = []
        block_tables = []
        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
            # Get allocated blocks for this sequence
            allocated_blocks = block_indices[
                cumulative_blocks : cumulative_blocks + needed_blocks
            ]
            # Get slots for the allocated blocks
            all_slots = self.slots[allocated_blocks].flatten()

            # Repeat slots in the case of context sliding window
            if needed_slots > len(all_slots) and self.repeat_slots:
                repeats = math.ceil(needed_slots / len(all_slots))
                all_slots = all_slots.repeat(repeats)

            allocated_slots = all_slots[:needed_slots]

            slots.append(allocated_slots)
            block_tables.append(allocated_blocks.tolist())
            block_tables_tensor[i, :needed_blocks] = allocated_blocks
            cumulative_blocks += needed_blocks

        block_tables = block_tables
        block_tables_tensor = block_tables_tensor.to(device)
        slots = torch.concat(slots).to(device)

        # Allocate the required number of blocks by setting the mask to 0
        self.free_block_mask[block_indices] = 0

        return block_tables, block_tables_tensor, slots

    def free(self, block_indices: Optional[List[int]]):
        if block_indices is not None and block_indices:
            # Reset mask
            self.free_block_mask[block_indices] = 1


def set_cache_manager(
    num_blocks: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    repeat_slots: bool,
    dtype: torch.dtype,
    device: torch.device,
) -> CacheManager:
    global CACHE_MANAGER
    if CACHE_MANAGER is not None:
        del CACHE_MANAGER
        torch.cuda.empty_cache()

    CACHE_MANAGER = CacheManager(
        num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
    )
    return CACHE_MANAGER


def get_cache_manager() -> CacheManager:
    global CACHE_MANAGER
    if CACHE_MANAGER is None:
        raise RuntimeError("cache manager was not initialized")

    return CACHE_MANAGER
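A usage sketch (not part of the diff) of the cache manager above: warmup installs a global instance, prefill asks for (blocks, slots) per sequence, and finished sequences hand their blocks back. The module path and the sizes are assumptions for illustration:

import torch
from text_generation_server.models.cache_manager import (  # assumed path
    set_cache_manager,
    get_cache_manager,
)

manager = set_cache_manager(
    num_blocks=128,
    num_layers=2,
    num_heads=8,
    head_size=64,
    repeat_slots=False,
    dtype=torch.float16,
    device=torch.device("cpu"),
)

# Two sequences: 1 block / 10 slots and 2 blocks / 20 slots.
block_tables, block_tables_tensor, slots = get_cache_manager().allocate(
    [(1, 10), (2, 20)], blocks=3, max_blocks=2, device=torch.device("cpu")
)

# Returning the second sequence's blocks marks them free again.
manager.free(block_tables[1])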

View File

@@ -30,6 +30,7 @@ from text_generation_server.layers.attention import (
     attention,
     reshape_and_cache,
 )
+from text_generation_server.models.globals import FLASH_DECODING
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.layers import (
     TensorParallelRowLinear,
@@ -259,8 +260,9 @@ class FlashCohereAttention(torch.nn.Module):
         cu_seqlen_prefill,
         kv_cache,
         block_tables,
+        cu_seqlen_q,
+        cu_seqlen_k,
         slots,
-        input_lengths,
         max_s,
     ):
         qkv = self.query_key_value(hidden_states)
@@ -312,7 +314,8 @@ class FlashCohereAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
             )
@@ -386,8 +389,9 @@ class FlashCohereLayer(nn.Module):
         cu_seqlen_prefill,
         kv_cache,
         block_tables,
+        cu_seqlen_q,
+        cu_seqlen_k,
         slots,
-        input_lengths,
         max_s,
     ):
         normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
@@ -400,8 +404,9 @@ class FlashCohereLayer(nn.Module):
             cu_seqlen_prefill,
             kv_cache,
             block_tables,
+            cu_seqlen_q,
+            cu_seqlen_k,
             slots,
-            input_lengths,
             max_s,
         )
@@ -464,6 +469,24 @@ class FlashCohereModel(torch.nn.Module):
         )
 
         residual = None
+        if cu_seqlen_prefill is None and FLASH_DECODING:
+            cu_seqlen_q = torch.arange(
+                input_lengths.shape[0] + 1,
+                device=input_ids.device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.cat(
+                [
+                    torch.zeros(
+                        (1,), device=input_lengths.device, dtype=input_lengths.dtype
+                    ),
+                    input_lengths.cumsum(dim=-1),
+                ]
+            ).to(dtype=torch.int32)
+        else:
+            cu_seqlen_q = None
+            cu_seqlen_k = input_lengths
+
         for i, layer in enumerate(self.layers):
             hidden_states, residual = layer(
                 hidden_states,
@@ -473,8 +496,9 @@ class FlashCohereModel(torch.nn.Module):
                 cu_seqlen_prefill,
                 kv_cache[i],
                 block_tables,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 slots,
-                input_lengths,
                 max_s,
             )
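A minimal sketch (not part of the diff) of what the new cu_seqlen_q / cu_seqlen_k tensors contain on the decode path: each sequence contributes exactly one query token, so cu_seqlen_q is simply 0..batch_size, while cu_seqlen_k is the cumulative sum of the per-sequence key lengths. When FLASH_DECODING is off, the model keeps passing input_lengths through unchanged.

import torch

input_lengths = torch.tensor([5, 3, 8], dtype=torch.int32)  # keys per sequence

cu_seqlen_q = torch.arange(input_lengths.shape[0] + 1, dtype=torch.int32)
cu_seqlen_k = torch.cat(
    [torch.zeros((1,), dtype=input_lengths.dtype), input_lengths.cumsum(dim=-1)]
).to(dtype=torch.int32)

print(cu_seqlen_q.tolist())  # [0, 1, 2, 3]
print(cu_seqlen_k.tolist())  # [0, 5, 8, 16]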

View File

@@ -344,6 +344,7 @@ class DbrxAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -253,6 +253,7 @@ class FlashGemmaAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -253,6 +253,7 @@ class FlashGPT2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -33,6 +33,7 @@ from text_generation_server.layers.attention import (
     attention,
     reshape_and_cache,
 )
+from text_generation_server.models.globals import FLASH_DECODING
 from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
@@ -172,7 +173,8 @@ class FlashLlamaAttention(torch.nn.Module):
         kv_cache,
         block_tables,
         slots,
-        input_lengths,
+        cu_seqlen_q,
+        cu_seqlen_k,
         max_s,
         adapter_data,
     ):
@@ -192,10 +194,10 @@ class FlashLlamaAttention(torch.nn.Module):
         reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
 
-        # output tensor
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if cu_seqlen_prefill is not None:
+            attn_output = torch.empty_like(query)
             # flash attention
             attention(
                 query,
@@ -208,15 +210,16 @@ class FlashLlamaAttention(torch.nn.Module):
             )
         # Decode
         else:
-            paged_attention(
-                attn_output,
+            attn_output = paged_attention(
+                None,
                 query,
                 kv_cache[0],
                 kv_cache[1],
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
             )
@@ -353,7 +356,8 @@ class FlashLlamaLayer(nn.Module):
         kv_cache,
         block_tables,
         slots,
-        input_lengths,
+        cu_seqlen_q,
+        cu_seqlen_k,
         max_s,
         adapter_data,
     ):
@@ -368,7 +372,8 @@ class FlashLlamaLayer(nn.Module):
             kv_cache,
             block_tables,
             slots,
-            input_lengths,
+            cu_seqlen_q,
+            cu_seqlen_k,
             max_s,
             adapter_data,
         )
@@ -438,6 +443,23 @@ class FlashLlamaModel(torch.nn.Module):
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
             position_ids, max_s, hidden_states.dtype
         )
+        if cu_seqlen_prefill is None and FLASH_DECODING:
+            cu_seqlen_q = torch.arange(
+                input_lengths.shape[0] + 1,
+                device=inputs_embeds.device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.cat(
+                [
+                    torch.zeros(
+                        (1,), device=input_lengths.device, dtype=input_lengths.dtype
+                    ),
+                    input_lengths.cumsum(dim=-1),
+                ]
+            ).to(dtype=torch.int32)
+        else:
+            cu_seqlen_q = None
+            cu_seqlen_k = input_lengths
 
         residual = None
         for i, layer in enumerate(self.layers):
@@ -450,7 +472,8 @@ class FlashLlamaModel(torch.nn.Module):
                 kv_cache[i],
                 block_tables,
                 slots,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
                 adapter_data,
             )
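The decode branch above now passes None for the output buffer and uses the tensor returned by paged_attention, matching the "Not sending preallocated output" commit: the flash-decoding kernel returns its own result instead of filling a caller-allocated tensor. A hedged illustration of that calling convention with a stand-in function, not the real kernel:

import torch
from typing import Optional

def paged_attention_stub(out: Optional[torch.Tensor], query: torch.Tensor) -> torch.Tensor:
    # Stand-in: either fill the caller's buffer or allocate and return a new one.
    result = query * 2.0
    if out is None:
        return result
    out.copy_(result)
    return out

query = torch.randn(4, 8, 64)
attn_output = paged_attention_stub(None, query)  # new style: no preallocated buffer
assert attn_output.shape == query.shape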

View File

@@ -237,6 +237,7 @@ class MistralAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -299,6 +299,7 @@ class MixtralAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -176,6 +176,7 @@ class FlashNeoxAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -215,6 +215,7 @@ class FlashPhiAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -157,6 +157,7 @@ class Qwen2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -225,6 +225,7 @@ class FlashRWAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )
@@ -348,6 +349,7 @@ class FlashRWLargeAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -309,6 +309,7 @@ class FlashMQAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -263,6 +263,7 @@ class Starcoder2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -46,7 +46,7 @@ from text_generation_server.utils.import_utils import (
 tracer = trace.get_tracer(__name__)
 
-BLOCK_SIZE: int = 16
+BLOCK_SIZE: int = 256 if os.getenv("FLASH_DECODING", "").lower() in {"1", "true"} else 16
 
 # Will be set in init
 SLIDING_WINDOW: Optional[int] = None

View File

@@ -6,6 +6,9 @@ from typing import Dict
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 # This is overridden by the cli
 cuda_graphs = os.getenv("CUDA_GRAPHS")
+FLASH_DECODING = os.getenv("FLASH_DECODING") in {"1", "true", "True"}
+if FLASH_DECODING:
+    logger.info("Using FLASH_DECODING")
 if cuda_graphs is not None:
     try:
         cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
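A small sketch (not part of the diff) of how the flags above interact with the block size used elsewhere in this commit: flash decoding switches the paged KV cache from 16-token to 256-token blocks, and the router hunk earlier lets BLOCK_SIZE override that. The consolidation below is an assumed simplification, not the exact code:

import os

def resolve_block_size() -> int:
    flash_decoding = os.getenv("FLASH_DECODING", "").lower() in {"1", "true"}
    default = 256 if flash_decoding else 16
    try:
        return int(os.environ["BLOCK_SIZE"])
    except (KeyError, ValueError):
        return default

os.environ["FLASH_DECODING"] = "1"
print(resolve_block_size())  # 256 unless BLOCK_SIZE is set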

View File

@@ -67,7 +67,7 @@ elif is_ipex_available():
     synchronize = noop
     get_free_memory = get_cpu_free_memory
 else:
-    SYSTEM = "cpu"
+    SYSTEM = "ipex"
     empty_cache = noop
     synchronize = noop