Using flash decoding

Conditional flashdecoding.

Fix max_q.

Working kvcache

Working version with flash decoding.

Make it work for mistral.

Fix after rebase.

Less intrusive.

Revert changes in modeling.

Speedup flashdecoding.

Hack to make other models work.

Fixing non flash decoding llama path.

Router logic knows about page size.

Missing 2 models.

Missing cohere.

Fixing cohere flash decoding.

Revamped all this architecture.

Fix cohere.

Fixing falcon.

Enabling custom block size schedule.

Update router/src/infer.rs

Not sending preallocated output.
Nicolas Patry 2024-05-17 08:43:33 +00:00
parent d0225b1015
commit 4293a12863
23 changed files with 1010 additions and 64 deletions

View File

@@ -0,0 +1,647 @@
// This file is @generated by prost-build.
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthResponse {}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoResponse {
#[prost(bool, tag = "1")]
pub requires_padding: bool,
#[prost(string, tag = "2")]
pub dtype: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
pub device_type: ::prost::alloc::string::String,
#[prost(uint32, optional, tag = "4")]
pub window_size: ::core::option::Option<u32>,
#[prost(uint32, tag = "5")]
pub speculate: u32,
}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryResponse {
/// / Other shards urls
#[prost(string, repeated, tag = "1")]
pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheRequest {
/// / Optional batch id
#[prost(uint64, optional, tag = "1")]
pub id: ::core::option::Option<u64>,
}
/// / Empty response
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheResponse {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct NextTokenChooserParameters {
/// / exponential scaling output probability distribution
#[prost(float, tag = "1")]
pub temperature: f32,
/// / restricting to the k highest probability elements
#[prost(uint32, tag = "2")]
pub top_k: u32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "3")]
pub top_p: f32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "4")]
pub typical_p: f32,
/// / apply sampling on the logits
#[prost(bool, tag = "5")]
pub do_sample: bool,
/// / random seed for sampling
#[prost(uint64, tag = "6")]
pub seed: u64,
/// / repetition penalty
#[prost(float, tag = "7")]
pub repetition_penalty: f32,
/// / frequency penalty
#[prost(float, tag = "9")]
pub frequency_penalty: f32,
/// / token watermarking using "A Watermark for Large Language Models"
#[prost(bool, tag = "8")]
pub watermark: bool,
/// / grammar (applied if not empty)
#[prost(string, tag = "10")]
pub grammar: ::prost::alloc::string::String,
/// / grammar type
#[prost(enumeration = "GrammarType", tag = "11")]
pub grammar_type: i32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StoppingCriteriaParameters {
/// / Maximum number of generated tokens
#[prost(uint32, tag = "1")]
pub max_new_tokens: u32,
/// / Optional stopping sequences
#[prost(string, repeated, tag = "2")]
pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / Ignore end of sequence token
/// / used for benchmarking
#[prost(bool, tag = "3")]
pub ignore_eos_token: bool,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Request {
/// / Request ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / The generation context
#[prost(string, tag = "2")]
pub inputs: ::prost::alloc::string::String,
/// / Context truncation
#[prost(uint32, tag = "3")]
pub truncate: u32,
/// / Next Token Chooser Parameters
#[prost(message, optional, tag = "4")]
pub parameters: ::core::option::Option<NextTokenChooserParameters>,
/// / Stopping Criteria Parameters
#[prost(message, optional, tag = "5")]
pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
/// / Return prefill logprobs
#[prost(bool, tag = "6")]
pub prefill_logprobs: bool,
/// / Return most likely n tokens
#[prost(uint32, tag = "7")]
pub top_n_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Batch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests
#[prost(message, repeated, tag = "2")]
pub requests: ::prost::alloc::vec::Vec<Request>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct CachedBatch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests ids
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GeneratedText {
/// / Output
#[prost(string, tag = "1")]
pub text: ::prost::alloc::string::String,
/// / Number of generated tokens
#[prost(uint32, tag = "2")]
pub generated_tokens: u32,
/// / Finish reason
#[prost(enumeration = "FinishReason", tag = "3")]
pub finish_reason: i32,
/// / Seed
#[prost(uint64, optional, tag = "4")]
pub seed: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Tokens {
/// / Token IDs
#[prost(uint32, repeated, tag = "1")]
pub ids: ::prost::alloc::vec::Vec<u32>,
/// / Logprobs
#[prost(float, repeated, tag = "2")]
pub logprobs: ::prost::alloc::vec::Vec<f32>,
/// / tokens
#[prost(string, repeated, tag = "3")]
pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / special
#[prost(bool, repeated, tag = "4")]
pub is_special: ::prost::alloc::vec::Vec<bool>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Generation {
/// / Request ID
#[prost(uint64, tag = "1")]
pub request_id: u64,
/// / Prefill tokens (optional)
#[prost(message, optional, tag = "2")]
pub prefill_tokens: ::core::option::Option<Tokens>,
#[prost(message, optional, tag = "3")]
pub tokens: ::core::option::Option<Tokens>,
/// / Complete generated text
#[prost(message, optional, tag = "4")]
pub generated_text: ::core::option::Option<GeneratedText>,
/// / Top tokens
#[prost(message, repeated, tag = "5")]
pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchRequest {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub batch_id: u64,
/// / Requests to keep
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchResponse {
/// / Filtered Batch (cached)
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillRequest {
/// / Batch
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillResponse {
/// / Generation
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeRequest {
/// / Cached batches
#[prost(message, repeated, tag = "1")]
pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeResponse {
/// / Decodes
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
/// / Concatenate elapsed time in nanoseconds
#[prost(uint64, optional, tag = "6")]
pub concat_ns: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupRequest {
/// / Batch to warmup on
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
#[prost(uint32, tag = "2")]
pub max_input_length: u32,
#[prost(uint32, tag = "3")]
pub max_prefill_tokens: u32,
#[prost(uint32, tag = "4")]
pub max_total_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupResponse {
/// / Maximum number of tokens supported by the model
#[prost(uint32, optional, tag = "1")]
pub max_supported_total_tokens: ::core::option::Option<u32>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum GrammarType {
None = 0,
Json = 1,
Regex = 2,
}
impl GrammarType {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
GrammarType::None => "GRAMMAR_TYPE_NONE",
GrammarType::Json => "GRAMMAR_TYPE_JSON",
GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"GRAMMAR_TYPE_NONE" => Some(Self::None),
"GRAMMAR_TYPE_JSON" => Some(Self::Json),
"GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
_ => None,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum FinishReason {
Length = 0,
EosToken = 1,
StopSequence = 2,
}
impl FinishReason {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
FinishReason::Length => "FINISH_REASON_LENGTH",
FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"FINISH_REASON_LENGTH" => Some(Self::Length),
"FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
"FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
_ => None,
}
}
}
/// Generated client implementations.
pub mod text_generation_service_client {
#![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
use tonic::codegen::*;
use tonic::codegen::http::Uri;
#[derive(Debug, Clone)]
pub struct TextGenerationServiceClient<T> {
inner: tonic::client::Grpc<T>,
}
impl TextGenerationServiceClient<tonic::transport::Channel> {
/// Attempt to create a new client by connecting to a given endpoint.
pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
where
D: TryInto<tonic::transport::Endpoint>,
D::Error: Into<StdError>,
{
let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
Ok(Self::new(conn))
}
}
impl<T> TextGenerationServiceClient<T>
where
T: tonic::client::GrpcService<tonic::body::BoxBody>,
T::Error: Into<StdError>,
T::ResponseBody: Body<Data = Bytes> + Send + 'static,
<T::ResponseBody as Body>::Error: Into<StdError> + Send,
{
pub fn new(inner: T) -> Self {
let inner = tonic::client::Grpc::new(inner);
Self { inner }
}
pub fn with_origin(inner: T, origin: Uri) -> Self {
let inner = tonic::client::Grpc::with_origin(inner, origin);
Self { inner }
}
pub fn with_interceptor<F>(
inner: T,
interceptor: F,
) -> TextGenerationServiceClient<InterceptedService<T, F>>
where
F: tonic::service::Interceptor,
T::ResponseBody: Default,
T: tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
Response = http::Response<
<T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
>,
>,
<T as tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
>>::Error: Into<StdError> + Send + Sync,
{
TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
}
/// Compress requests with the given encoding.
///
/// This requires the server to support it otherwise it might respond with an
/// error.
#[must_use]
pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.send_compressed(encoding);
self
}
/// Enable decompressing responses.
#[must_use]
pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.accept_compressed(encoding);
self
}
/// Limits the maximum size of a decoded message.
///
/// Default: `4MB`
#[must_use]
pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_decoding_message_size(limit);
self
}
/// Limits the maximum size of an encoded message.
///
/// Default: `usize::MAX`
#[must_use]
pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
/// / Model Info
pub async fn info(
&mut self,
request: impl tonic::IntoRequest<super::InfoRequest>,
) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Info",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info"));
self.inner.unary(req, path, codec).await
}
/// / Service discovery
pub async fn service_discovery(
&mut self,
request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
) -> std::result::Result<
tonic::Response<super::ServiceDiscoveryResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ServiceDiscovery",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new(
"generate.v2.TextGenerationService",
"ServiceDiscovery",
),
);
self.inner.unary(req, path, codec).await
}
/// / Empties batch cache
pub async fn clear_cache(
&mut self,
request: impl tonic::IntoRequest<super::ClearCacheRequest>,
) -> std::result::Result<
tonic::Response<super::ClearCacheResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ClearCache",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "ClearCache"),
);
self.inner.unary(req, path, codec).await
}
/// / Remove requests from a cached batch
pub async fn filter_batch(
&mut self,
request: impl tonic::IntoRequest<super::FilterBatchRequest>,
) -> std::result::Result<
tonic::Response<super::FilterBatchResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/FilterBatch",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "FilterBatch"),
);
self.inner.unary(req, path, codec).await
}
/// / Warmup the model and compute max cache size
pub async fn warmup(
&mut self,
request: impl tonic::IntoRequest<super::WarmupRequest>,
) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Warmup",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Warmup"));
self.inner.unary(req, path, codec).await
}
/// / Prefill batch and decode first token
pub async fn prefill(
&mut self,
request: impl tonic::IntoRequest<super::PrefillRequest>,
) -> std::result::Result<
tonic::Response<super::PrefillResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Prefill",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Prefill"));
self.inner.unary(req, path, codec).await
}
/// / Decode token for a list of prefilled batches
pub async fn decode(
&mut self,
request: impl tonic::IntoRequest<super::DecodeRequest>,
) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Decode",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Decode"));
self.inner.unary(req, path, codec).await
}
/// / Health check
pub async fn health(
&mut self,
request: impl tonic::IntoRequest<super::HealthRequest>,
) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Health",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Health"));
self.inner.unary(req, path, codec).await
}
}
}

View File

@@ -0,0 +1,6 @@
// This file is @generated by prost-build.
pub mod generate {
pub mod v2 {
include!("generate.v2.rs");
}
}

View File

@@ -39,8 +39,25 @@ impl SchedulerV2 {
         speculate: u32,
         generation_health: Arc<AtomicBool>,
     ) -> Self {
+<<<<<<< HEAD:router/src/infer/v2/scheduler.rs
         let queue = Queue::new(requires_padding, 16, window_size, speculate);
         let batching_task_notifier = Arc::new(Notify::new());
+=======
+        // Infer shared state
+        let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
+            matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
+        } else {
+            false
+        };
+        let block_size = if flashdecoding { 256 } else { 16 };
+        let block_size = std::env::var("BLOCK_SIZE")
+            .map(|b| b.parse().unwrap_or(block_size))
+            .unwrap_or(block_size);
+        let queue = Queue::new(requires_padding, block_size, window_size, speculate);
+        let shared = Arc::new(Shared {
+            batching_task: Notify::new(),
+        });
+>>>>>>> Using flash decoding:router/src/infer.rs
         // Spawn batching background task that contains all the inference logic
         tokio::spawn(batching_task(

View File

@@ -1,5 +1,6 @@
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
 
 major, minor = torch.cuda.get_device_capability()
 is_sm75 = major == 7 and minor == 5
@@ -21,7 +22,14 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
+    if FLASH_DECODING:
+        shape = key_cache.shape
+        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+    else:
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
 
 
 def paged_attention(
@@ -32,7 +40,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
@@ -56,64 +65,100 @@ def paged_attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    input_lengths = cu_seqlen_k
 
     # NOTE(woosuk): We use a simple heuristic to decide whether to use
     # PagedAttention V1 or V2. If the number of partitions is 1, we use
     # V1 to avoid the overhead of reduction. Also, if the number of
     # sequences or heads is large, we use V1 since there is enough work
     # to parallelize.
-    from vllm._C import ops
-
-    use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
-    if use_v1:
-        ops.paged_attention_v1(
-            out,
+    if FLASH_DECODING:
+        max_q = 1
+        max_k = max_s
+        import flash_attn_2_cuda
+
+        # TODO fixme when flash contains the fix.
+        # Number of splits is not correctly handled
+        # by the current path
+        # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577
+        # This fails becuase we're using causal, therefore window_right is set to 0 and the split logic is never applied.
+        out2 = flash_attn_2_cuda.varlen_fwd(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
-            softmax_scale,
-            block_tables,
-            input_lengths,
-            block_size,
-            max_s,
             None,
-            "auto",
-            1.0,
+            cu_seqlen_k,
+            cu_seqlen_k,
+            None,
+            block_tables,
+            None,
+            max_q,
+            max_k,
+            0.0,  # dropout
+            softmax_scale,
+            False,  # zero_tensors
+            True,  # causal
+            -1,  # Window_left
+            -1,  # Window right
+            False,  # return softmax
+            None,  # generator
         )
+        return out2[0]
     else:
-        # Run PagedAttention V2.
-        assert _PARTITION_SIZE % block_size == 0
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions, head_size),
-            dtype=out.dtype,
-            device=out.device,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, max_num_partitions),
-            dtype=torch.float32,
-            device=out.device,
-        )
-        max_logits = torch.empty_like(exp_sums)
-        ops.paged_attention_v2(
-            out,
-            exp_sums,
-            max_logits,
-            tmp_output,
-            query,
-            key_cache,
-            value_cache,
-            kv_head_mapping,
-            softmax_scale,
-            block_tables,
-            input_lengths,
-            block_size,
-            max_s,
-            None,
-            "auto",
-            1.0,
-        )
+        from vllm._C import ops
+
+        use_v1 = max_s <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512
+        )
+        if use_v1:
+            ops.paged_attention_v1(
+                out,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
+        else:
+            # Run PagedAttention V2.
+            assert _PARTITION_SIZE % block_size == 0
+            tmp_output = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions, head_size),
+                dtype=out.dtype,
+                device=out.device,
+            )
+            exp_sums = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions),
+                dtype=torch.float32,
+                device=out.device,
+            )
+            max_logits = torch.empty_like(exp_sums)
+            ops.paged_attention_v2(
+                out,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
 
 
 try:
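A minimal sketch (not part of the diff) of the new FLASH_DECODING write path in reshape_and_cache above: the (num_blocks, block_size, num_heads, head_size) cache is viewed as one flat slot dimension and indexed with global slot ids. Shapes are illustrative assumptions, runnable on CPU:

import torch

num_blocks, block_size, num_heads, head_size = 4, 256, 8, 64
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size)

# Three new tokens land in global slots 0, 1 and 256 (block 1, offset 0).
slots = torch.tensor([0, 1, 256])
key = torch.randn(len(slots), num_heads, head_size)

# Same indexing trick as the FLASH_DECODING branch of reshape_and_cache.
shape = key_cache.shape
key_cache.view(-1, shape[-2], shape[-1])[slots] = key

assert torch.equal(key_cache[1, 0], key[2])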

View File

@@ -55,7 +55,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
@@ -66,7 +67,7 @@ def paged_attention(
         kv_head_mapping,
         softmax_scale,
         block_tables,
-        input_lengths,
+        cu_seqlen_q,
         BLOCK_SIZE,
         max_s,
         None,

View File

@@ -1,6 +1,7 @@
 import os
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
 from loguru import logger
 
 major, minor = torch.cuda.get_device_capability()
@@ -26,7 +27,14 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
+    if FLASH_DECODING:
+        shape = key_cache.shape
+        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+    else:
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
 
 
 def paged_attention(
@@ -37,7 +45,8 @@ def paged_attention(
     kv_head_mapping: torch.Tensor,
     softmax_scale: float,
     block_tables: torch.Tensor,
-    input_lengths: torch.Tensor,
+    cu_seqlen_q: torch.Tensor,
+    cu_seqlen_k: torch.Tensor,
     max_s: int,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
@@ -61,6 +70,7 @@ def paged_attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    input_lengths = cu_seqlen_k
 
     # NOTE(woosuk): We use a simple heuristic to decide whether to use
     # PagedAttention V1 or V2. If the number of partitions is 1, we use

View File

@@ -0,0 +1,158 @@
import math

import torch

from typing import Optional, List, Tuple
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.models.globals import FLASH_DECODING

BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
# Will be set in warmup
CACHE_MANAGER: Optional["CacheManager"] = None


class CacheManager:
    def __init__(
        self,
        num_blocks: int,
        num_layers: int,
        num_heads: int,
        head_size: int,
        repeat_slots: bool,
        dtype: torch.dtype,
        device: torch.device,
    ):
        self.block_size = BLOCK_SIZE
        self.num_blocks = num_blocks
        self.repeat_slots = repeat_slots

        element_size = torch.tensor([], dtype=dtype).element_size()
        if SYSTEM == "xpu":
            x = 1
        else:
            x = self.block_size // element_size

        if FLASH_DECODING:
            self.kv_cache = [
                (
                    torch.empty(
                        (num_blocks, self.block_size, num_heads, head_size),
                        dtype=dtype,
                        device=device,
                    ),
                    torch.empty(
                        (num_blocks, self.block_size, num_heads, head_size),
                        dtype=dtype,
                        device=device,
                    ),
                )
                for _ in range(num_layers)
            ]
        else:
            self.kv_cache = [
                (
                    torch.empty(
                        (num_blocks, num_heads, head_size // x, self.block_size, x),
                        dtype=dtype,
                        device=device,
                    ),
                    torch.empty(
                        (num_blocks, num_heads, head_size, self.block_size),
                        dtype=dtype,
                        device=device,
                    ),
                )
                for _ in range(num_layers)
            ]
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
        self.slots = torch.arange(
            0, num_blocks * self.block_size, dtype=torch.int64
        ).view(num_blocks, self.block_size)

    def allocate(
        self,
        needed_blocks_slots: List[Tuple[int, int]],
        blocks: int,
        max_blocks: int,
        device: torch.device,
    ):
        # Get free blocks indices by finding values in mask that are not set to 0
        free_block_indices = self.free_block_mask.nonzero()
        if blocks > len(free_block_indices):
            raise RuntimeError(
                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
            )

        # Slice by the number of required blocks
        block_indices = free_block_indices[:blocks]
        block_indices = block_indices.flatten()

        # Padded block tables
        block_tables_tensor = torch.zeros(
            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
        )

        # Allocate paged attention blocks
        cumulative_blocks = 0
        slots = []
        block_tables = []
        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
            # Get allocated blocks for this sequence
            allocated_blocks = block_indices[
                cumulative_blocks : cumulative_blocks + needed_blocks
            ]
            # Get slots for the allocated blocks
            all_slots = self.slots[allocated_blocks].flatten()

            # Repeat slots in the case of context sliding window
            if needed_slots > len(all_slots) and self.repeat_slots:
                repeats = math.ceil(needed_slots / len(all_slots))
                all_slots = all_slots.repeat(repeats)

            allocated_slots = all_slots[:needed_slots]

            slots.append(allocated_slots)
            block_tables.append(allocated_blocks.tolist())
            block_tables_tensor[i, :needed_blocks] = allocated_blocks
            cumulative_blocks += needed_blocks

        block_tables = block_tables
        block_tables_tensor = block_tables_tensor.to(device)
        slots = torch.concat(slots).to(device)

        # Allocate the required number of blocks by setting the mask to 0
        self.free_block_mask[block_indices] = 0

        return block_tables, block_tables_tensor, slots

    def free(self, block_indices: Optional[List[int]]):
        if block_indices is not None and block_indices:
            # Reset mask
            self.free_block_mask[block_indices] = 1


def set_cache_manager(
    num_blocks: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    repeat_slots: bool,
    dtype: torch.dtype,
    device: torch.device,
) -> CacheManager:
    global CACHE_MANAGER
    if CACHE_MANAGER is not None:
        del CACHE_MANAGER
        torch.cuda.empty_cache()

    CACHE_MANAGER = CacheManager(
        num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device
    )
    return CACHE_MANAGER


def get_cache_manager() -> CacheManager:
    global CACHE_MANAGER
    if CACHE_MANAGER is None:
        raise RuntimeError("cache manager was not initialized")

    return CACHE_MANAGER
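A usage sketch (not part of the diff) of the cache manager above: warmup installs a global instance, prefill asks for (blocks, slots) per sequence, and finished sequences hand their blocks back. The module path and the sizes are assumptions for illustration:

import torch
from text_generation_server.models.cache_manager import (  # assumed path
    set_cache_manager,
    get_cache_manager,
)

manager = set_cache_manager(
    num_blocks=128,
    num_layers=2,
    num_heads=8,
    head_size=64,
    repeat_slots=False,
    dtype=torch.float16,
    device=torch.device("cpu"),
)

# Two sequences: 1 block / 10 slots and 2 blocks / 20 slots.
block_tables, block_tables_tensor, slots = get_cache_manager().allocate(
    [(1, 10), (2, 20)], blocks=3, max_blocks=2, device=torch.device("cpu")
)

# Returning the second sequence's blocks marks them free again.
manager.free(block_tables[1])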

View File

@@ -30,6 +30,7 @@ from text_generation_server.layers.attention import (
     attention,
     reshape_and_cache,
 )
+from text_generation_server.models.globals import FLASH_DECODING
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.layers import (
     TensorParallelRowLinear,
@@ -259,8 +260,9 @@ class FlashCohereAttention(torch.nn.Module):
         cu_seqlen_prefill,
         kv_cache,
         block_tables,
+        cu_seqlen_q,
+        cu_seqlen_k,
         slots,
-        input_lengths,
         max_s,
     ):
         qkv = self.query_key_value(hidden_states)
@@ -312,7 +314,8 @@ class FlashCohereAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
             )
@@ -386,8 +389,9 @@ class FlashCohereLayer(nn.Module):
         cu_seqlen_prefill,
         kv_cache,
         block_tables,
+        cu_seqlen_q,
+        cu_seqlen_k,
         slots,
-        input_lengths,
         max_s,
     ):
         normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
@@ -400,8 +404,9 @@ class FlashCohereLayer(nn.Module):
             cu_seqlen_prefill,
             kv_cache,
             block_tables,
+            cu_seqlen_q,
+            cu_seqlen_k,
             slots,
-            input_lengths,
             max_s,
         )
@@ -464,6 +469,24 @@ class FlashCohereModel(torch.nn.Module):
         )
 
         residual = None
+        if cu_seqlen_prefill is None and FLASH_DECODING:
+            cu_seqlen_q = torch.arange(
+                input_lengths.shape[0] + 1,
+                device=input_ids.device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.cat(
+                [
+                    torch.zeros(
+                        (1,), device=input_lengths.device, dtype=input_lengths.dtype
+                    ),
+                    input_lengths.cumsum(dim=-1),
+                ]
+            ).to(dtype=torch.int32)
+        else:
+            cu_seqlen_q = None
+            cu_seqlen_k = input_lengths
+
         for i, layer in enumerate(self.layers):
             hidden_states, residual = layer(
                 hidden_states,
@@ -473,8 +496,9 @@ class FlashCohereModel(torch.nn.Module):
                 cu_seqlen_prefill,
                 kv_cache[i],
                 block_tables,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 slots,
-                input_lengths,
                 max_s,
             )
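A minimal sketch (not part of the diff) of what the new cu_seqlen_q / cu_seqlen_k tensors contain on the decode path: each sequence contributes exactly one query token, so cu_seqlen_q is simply 0..batch_size, while cu_seqlen_k is the cumulative sum of the per-sequence key lengths. When FLASH_DECODING is off, the model keeps passing input_lengths through unchanged.

import torch

input_lengths = torch.tensor([5, 3, 8], dtype=torch.int32)  # keys per sequence

cu_seqlen_q = torch.arange(input_lengths.shape[0] + 1, dtype=torch.int32)
cu_seqlen_k = torch.cat(
    [torch.zeros((1,), dtype=input_lengths.dtype), input_lengths.cumsum(dim=-1)]
).to(dtype=torch.int32)

print(cu_seqlen_q.tolist())  # [0, 1, 2, 3]
print(cu_seqlen_k.tolist())  # [0, 5, 8, 16]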

View File

@@ -344,6 +344,7 @@ class DbrxAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -253,6 +253,7 @@ class FlashGemmaAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -253,6 +253,7 @@ class FlashGPT2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -33,6 +33,7 @@ from text_generation_server.layers.attention import (
     attention,
     reshape_and_cache,
 )
+from text_generation_server.models.globals import FLASH_DECODING
 from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
@@ -172,7 +173,8 @@ class FlashLlamaAttention(torch.nn.Module):
         kv_cache,
         block_tables,
         slots,
-        input_lengths,
+        cu_seqlen_q,
+        cu_seqlen_k,
         max_s,
         adapter_data,
     ):
@@ -192,10 +194,10 @@ class FlashLlamaAttention(torch.nn.Module):
         reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
 
-        # output tensor
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if cu_seqlen_prefill is not None:
+            attn_output = torch.empty_like(query)
             # flash attention
             attention(
                 query,
@@ -208,15 +210,16 @@ class FlashLlamaAttention(torch.nn.Module):
             )
         # Decode
         else:
-            paged_attention(
-                attn_output,
+            attn_output = paged_attention(
+                None,
                 query,
                 kv_cache[0],
                 kv_cache[1],
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
             )
@@ -353,7 +356,8 @@ class FlashLlamaLayer(nn.Module):
         kv_cache,
         block_tables,
         slots,
-        input_lengths,
+        cu_seqlen_q,
+        cu_seqlen_k,
         max_s,
         adapter_data,
     ):
@@ -368,7 +372,8 @@ class FlashLlamaLayer(nn.Module):
             kv_cache,
             block_tables,
             slots,
-            input_lengths,
+            cu_seqlen_q,
+            cu_seqlen_k,
             max_s,
             adapter_data,
         )
@@ -438,6 +443,23 @@ class FlashLlamaModel(torch.nn.Module):
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
             position_ids, max_s, hidden_states.dtype
         )
+        if cu_seqlen_prefill is None and FLASH_DECODING:
+            cu_seqlen_q = torch.arange(
+                input_lengths.shape[0] + 1,
+                device=inputs_embeds.device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.cat(
+                [
+                    torch.zeros(
+                        (1,), device=input_lengths.device, dtype=input_lengths.dtype
+                    ),
+                    input_lengths.cumsum(dim=-1),
+                ]
+            ).to(dtype=torch.int32)
+        else:
+            cu_seqlen_q = None
+            cu_seqlen_k = input_lengths
 
         residual = None
         for i, layer in enumerate(self.layers):
@@ -450,7 +472,8 @@ class FlashLlamaModel(torch.nn.Module):
                 kv_cache[i],
                 block_tables,
                 slots,
-                input_lengths,
+                cu_seqlen_q,
+                cu_seqlen_k,
                 max_s,
                 adapter_data,
             )
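The decode branch above now passes None for the output buffer and uses the tensor returned by paged_attention, matching the "Not sending preallocated output" commit: the flash-decoding kernel returns its own result instead of filling a caller-allocated tensor. A hedged illustration of that calling convention with a stand-in function, not the real kernel:

import torch
from typing import Optional

def paged_attention_stub(out: Optional[torch.Tensor], query: torch.Tensor) -> torch.Tensor:
    # Stand-in: either fill the caller's buffer or allocate and return a new one.
    result = query * 2.0
    if out is None:
        return result
    out.copy_(result)
    return out

query = torch.randn(4, 8, 64)
attn_output = paged_attention_stub(None, query)  # new style: no preallocated buffer
assert attn_output.shape == query.shape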

View File

@@ -237,6 +237,7 @@ class MistralAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -299,6 +299,7 @@ class MixtralAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -176,6 +176,7 @@ class FlashNeoxAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -215,6 +215,7 @@ class FlashPhiAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -157,6 +157,7 @@ class Qwen2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -225,6 +225,7 @@ class FlashRWAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )
@@ -348,6 +349,7 @@ class FlashRWLargeAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -309,6 +309,7 @@ class FlashMQAttention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -263,6 +263,7 @@ class Starcoder2Attention(torch.nn.Module):
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
+                None,
                 input_lengths,
                 max_s,
             )

View File

@@ -46,7 +46,7 @@ from text_generation_server.utils.import_utils import (
 tracer = trace.get_tracer(__name__)
 
-BLOCK_SIZE: int = 16
+BLOCK_SIZE: int = 256 if os.getenv("FLASH_DECODING", "").lower() in {"1", "true"} else 16
 
 # Will be set in init
 SLIDING_WINDOW: Optional[int] = None

View File

@@ -6,6 +6,9 @@ from typing import Dict
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
 # This is overridden by the cli
 cuda_graphs = os.getenv("CUDA_GRAPHS")
+FLASH_DECODING = os.getenv("FLASH_DECODING") in {"1", "true", "True"}
+if FLASH_DECODING:
+    logger.info("Using FLASH_DECODING")
 if cuda_graphs is not None:
     try:
         cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
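A small sketch (not part of the diff) of how the flags above interact with the block size used elsewhere in this commit: flash decoding switches the paged KV cache from 16-token to 256-token blocks, and the router hunk earlier lets BLOCK_SIZE override that. The consolidation below is an assumed simplification, not the exact code:

import os

def resolve_block_size() -> int:
    flash_decoding = os.getenv("FLASH_DECODING", "").lower() in {"1", "true"}
    default = 256 if flash_decoding else 16
    try:
        return int(os.environ["BLOCK_SIZE"])
    except (KeyError, ValueError):
        return default

os.environ["FLASH_DECODING"] = "1"
print(resolve_block_size())  # 256 unless BLOCK_SIZE is set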

View File

@@ -67,7 +67,7 @@ elif is_ipex_available():
     synchronize = noop
     get_free_memory = get_cpu_free_memory
 else:
-    SYSTEM = "cpu"
+    SYSTEM = "ipex"
     empty_cache = noop
     synchronize = noop