Remove generated files.

2025-09-10 11:54:52 +00:00 · 2024-10-21 15:24:38 +02:00 · 2024-10-21 15:24:38 +02:00 · a31db04709
commit a31db04709
parent 79469f5f39
5 changed files with 2 additions and 1324 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,8 @@ router/tokenizer.json
 backends/v2/src/client/pb
 backends/v3/src/client/pb
 backends/client/src/v2/pb
 backends/client/src/v3/pb
 # ROCm auto-generated files
 *.hip
--- a/backends/client/src/v2/pb/generate.v2.rs
+++ b/backends/client/src/v2/pb/generate.v2.rs
@ -1,613 +0,0 @@
 // This file is @generated by prost-build.
 /// Request message of the `Health` RPC; it declares no fields.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HealthRequest {}
 /// Response message of the `Health` RPC; it declares no fields.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HealthResponse {}
 /// Empty request.
 ///
 /// Request message of the `Info` RPC; it declares no fields.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct InfoRequest {}
 /// Response message of the `Info` RPC (documented on the client as "Model Info").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct InfoResponse {
    /// Boolean flag (field tag 1).
    /// NOTE(review): presumably tells the caller whether inputs must be padded — confirm.
    #[prost(bool, tag = "1")]
    pub requires_padding: bool,
    /// Data type, carried as a string (field tag 2).
    #[prost(string, tag = "2")]
    pub dtype: ::prost::alloc::string::String,
    /// Device type, carried as a string (field tag 3).
    #[prost(string, tag = "3")]
    pub device_type: ::prost::alloc::string::String,
    /// Optional window size (field tag 4); `None` when the field is not set.
    #[prost(uint32, optional, tag = "4")]
    pub window_size: ::core::option::Option<u32>,
    /// Unsigned counter (field tag 5).
    /// NOTE(review): presumably the number of speculated tokens — confirm against the .proto.
    #[prost(uint32, tag = "5")]
    pub speculate: u32,
 }
 /// Empty request.
 ///
 /// Request message of the `ServiceDiscovery` RPC; it declares no fields.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ServiceDiscoveryRequest {}
 /// Response message of the `ServiceDiscovery` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ServiceDiscoveryResponse {
    /// Other shards urls (repeated string, field tag 1).
    #[prost(string, repeated, tag = "1")]
    pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
 /// Request message of the `ClearCache` RPC (documented on the client as "Empties batch cache").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ClearCacheRequest {
    /// Optional batch id; `None` when the field is not set.
    /// NOTE(review): presumably `None` means "clear everything" — confirm against the server.
    #[prost(uint64, optional, tag = "1")]
    pub id: ::core::option::Option<u64>,
 }
 /// Empty response.
 ///
 /// Response message of the `ClearCache` RPC; it declares no fields.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ClearCacheResponse {}
 /// Sampling parameters attached to a `Request` through its `parameters` field.
 ///
 /// Note that the field tags are not in declaration order: `frequency_penalty`
 /// uses tag 9 and is declared before `watermark`, which uses tag 8.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct NextTokenChooserParameters {
    /// Exponential scaling of the output probability distribution.
    #[prost(float, tag = "1")]
    pub temperature: f32,
    /// Restricting to the k highest probability elements.
    #[prost(uint32, tag = "2")]
    pub top_k: u32,
    /// Restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    #[prost(float, tag = "3")]
    pub top_p: f32,
    /// Restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    /// NOTE(review): this description is identical to the one on `top_p`; it
    /// presumably should describe typical-p sampling — confirm against the .proto.
    #[prost(float, tag = "4")]
    pub typical_p: f32,
    /// Apply sampling on the logits.
    #[prost(bool, tag = "5")]
    pub do_sample: bool,
    /// Random seed for sampling.
    #[prost(uint64, tag = "6")]
    pub seed: u64,
    /// Repetition penalty.
    #[prost(float, tag = "7")]
    pub repetition_penalty: f32,
    /// Frequency penalty.
    #[prost(float, tag = "9")]
    pub frequency_penalty: f32,
    /// Token watermarking using "A Watermark for Large Language Models".
    #[prost(bool, tag = "8")]
    pub watermark: bool,
    /// Grammar (applied if not empty).
    #[prost(string, tag = "10")]
    pub grammar: ::prost::alloc::string::String,
    /// Grammar type, stored as the raw `i32` value of the `GrammarType` enumeration.
    #[prost(enumeration = "GrammarType", tag = "11")]
    pub grammar_type: i32,
 }
 /// Stopping criteria attached to a `Request` through its `stopping_parameters` field.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct StoppingCriteriaParameters {
    /// Maximum number of generated tokens.
    #[prost(uint32, tag = "1")]
    pub max_new_tokens: u32,
    /// Optional stopping sequences (repeated; may be empty).
    #[prost(string, repeated, tag = "2")]
    pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
    /// Ignore the end-of-sequence token; used for benchmarking.
    #[prost(bool, tag = "3")]
    pub ignore_eos_token: bool,
 }
 /// A single generation request; `Batch` carries a list of these in `requests`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Request {
    /// Request ID.
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// The generation context.
    #[prost(string, tag = "2")]
    pub inputs: ::prost::alloc::string::String,
    /// Context truncation.
    /// NOTE(review): unit and direction of truncation are not visible here — confirm.
    #[prost(uint32, tag = "3")]
    pub truncate: u32,
    /// Next Token Chooser Parameters; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "4")]
    pub parameters: ::core::option::Option<NextTokenChooserParameters>,
    /// Stopping Criteria Parameters; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "5")]
    pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
    /// Return prefill logprobs.
    #[prost(bool, tag = "6")]
    pub prefill_logprobs: bool,
    /// Return most likely n tokens.
    #[prost(uint32, tag = "7")]
    pub top_n_tokens: u32,
 }
 /// A batch of full `Request` messages; carried by `PrefillRequest` and `WarmupRequest`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Batch {
    /// Batch ID.
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// Individual requests.
    #[prost(message, repeated, tag = "2")]
    pub requests: ::prost::alloc::vec::Vec<Request>,
    /// Batch size (==len(requests)).
    #[prost(uint32, tag = "3")]
    pub size: u32,
    /// Maximum number of tokens this batch will grow to.
    #[prost(uint32, tag = "4")]
    pub max_tokens: u32,
 }
 /// A batch referenced by request ids only; returned by the `Prefill`, `Decode` and
 /// `FilterBatch` responses and sent back in `DecodeRequest`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct CachedBatch {
    /// Batch ID.
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// Individual requests ids.
    #[prost(uint64, repeated, tag = "2")]
    pub request_ids: ::prost::alloc::vec::Vec<u64>,
    /// Batch size (==len(requests)).
    /// NOTE(review): this message holds `request_ids`, not `requests`; presumably the
    /// length of `request_ids` is meant — confirm against the .proto.
    #[prost(uint32, tag = "3")]
    pub size: u32,
    /// Maximum number of tokens this batch will grow to.
    #[prost(uint32, tag = "4")]
    pub max_tokens: u32,
 }
 /// Complete generated text of a request; carried by `Generation::generated_text`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct GeneratedText {
    /// Output.
    #[prost(string, tag = "1")]
    pub text: ::prost::alloc::string::String,
    /// Number of generated tokens.
    #[prost(uint32, tag = "2")]
    pub generated_tokens: u32,
    /// Finish reason, stored as the raw `i32` value of the `FinishReason` enumeration.
    #[prost(enumeration = "FinishReason", tag = "3")]
    pub finish_reason: i32,
    /// Seed; `None` when the field is not set.
    #[prost(uint64, optional, tag = "4")]
    pub seed: ::core::option::Option<u64>,
 }
 /// A set of tokens described by four repeated fields.
 ///
 /// NOTE(review): the four vectors are presumably index-aligned (entry `i` of each
 /// describes the same token) — nothing in this file enforces that; confirm.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Tokens {
    /// Token IDs.
    #[prost(uint32, repeated, tag = "1")]
    pub ids: ::prost::alloc::vec::Vec<u32>,
    /// Logprobs.
    #[prost(float, repeated, tag = "2")]
    pub logprobs: ::prost::alloc::vec::Vec<f32>,
    /// Token texts.
    #[prost(string, repeated, tag = "3")]
    pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
    /// Special-token flags.
    #[prost(bool, repeated, tag = "4")]
    pub is_special: ::prost::alloc::vec::Vec<bool>,
 }
 /// Per-request output; `PrefillResponse` and `DecodeResponse` each carry a list of these.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Generation {
    /// Request ID.
    #[prost(uint64, tag = "1")]
    pub request_id: u64,
    /// Prefill tokens (optional).
    #[prost(message, optional, tag = "2")]
    pub prefill_tokens: ::core::option::Option<Tokens>,
    /// Tokens sub-message (field tag 3); undocumented upstream.
    /// NOTE(review): presumably the tokens produced by this step — confirm.
    #[prost(message, optional, tag = "3")]
    pub tokens: ::core::option::Option<Tokens>,
    /// Complete generated text; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "4")]
    pub generated_text: ::core::option::Option<GeneratedText>,
    /// Top tokens.
    #[prost(message, repeated, tag = "5")]
    pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
 }
 /// Request message of the `FilterBatch` RPC (documented on the client as
 /// "Remove requests from a cached batch").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FilterBatchRequest {
    /// Batch ID.
    #[prost(uint64, tag = "1")]
    pub batch_id: u64,
    /// Requests to keep.
    #[prost(uint64, repeated, tag = "2")]
    pub request_ids: ::prost::alloc::vec::Vec<u64>,
 }
 /// Response message of the `FilterBatch` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FilterBatchResponse {
    /// Filtered batch (cached); `None` when the sub-message is not set.
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<CachedBatch>,
 }
 /// Request message of the `Prefill` RPC (documented on the client as
 /// "Prefill batch and decode first token").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PrefillRequest {
    /// Batch; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<Batch>,
 }
 /// Response message of the `Prefill` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PrefillResponse {
    /// Generations.
    #[prost(message, repeated, tag = "1")]
    pub generations: ::prost::alloc::vec::Vec<Generation>,
    /// Next batch (cached); `None` when the sub-message is not set.
    #[prost(message, optional, tag = "2")]
    pub batch: ::core::option::Option<CachedBatch>,
    /// Forward elapsed time in nanoseconds.
    #[prost(uint64, tag = "3")]
    pub forward_ns: u64,
    /// Decode elapsed time in nanoseconds.
    #[prost(uint64, tag = "4")]
    pub decode_ns: u64,
    /// Total elapsed time in nanoseconds.
    #[prost(uint64, tag = "5")]
    pub total_ns: u64,
 }
 /// Request message of the `Decode` RPC (documented on the client as
 /// "Decode token for a list of prefilled batches").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DecodeRequest {
    /// Cached batches.
    #[prost(message, repeated, tag = "1")]
    pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
 }
 /// Response message of the `Decode` RPC.
 ///
 /// Same fields as `PrefillResponse`, plus the optional `concat_ns` timing.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DecodeResponse {
    /// Decodes.
    #[prost(message, repeated, tag = "1")]
    pub generations: ::prost::alloc::vec::Vec<Generation>,
    /// Next batch (cached); `None` when the sub-message is not set.
    #[prost(message, optional, tag = "2")]
    pub batch: ::core::option::Option<CachedBatch>,
    /// Forward elapsed time in nanoseconds.
    #[prost(uint64, tag = "3")]
    pub forward_ns: u64,
    /// Decode elapsed time in nanoseconds.
    #[prost(uint64, tag = "4")]
    pub decode_ns: u64,
    /// Total elapsed time in nanoseconds.
    #[prost(uint64, tag = "5")]
    pub total_ns: u64,
    /// Concatenate elapsed time in nanoseconds; `None` when the field is not set.
    #[prost(uint64, optional, tag = "6")]
    pub concat_ns: ::core::option::Option<u64>,
 }
 /// Request message of the `Warmup` RPC (documented on the client as
 /// "Warmup the model and compute max cache size").
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct WarmupRequest {
    /// Batch to warmup on; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<Batch>,
    /// Undocumented upstream (field tag 2).
    /// NOTE(review): presumably the maximum input length in tokens — confirm.
    #[prost(uint32, tag = "2")]
    pub max_input_length: u32,
    /// Undocumented upstream (field tag 3).
    /// NOTE(review): presumably the maximum number of prefill tokens — confirm.
    #[prost(uint32, tag = "3")]
    pub max_prefill_tokens: u32,
    /// Undocumented upstream (field tag 4).
    /// NOTE(review): presumably the maximum number of total tokens — confirm.
    #[prost(uint32, tag = "4")]
    pub max_total_tokens: u32,
 }
 /// Response message of the `Warmup` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct WarmupResponse {
    /// Maximum number of tokens supported by the model; `None` when the field is not set.
    #[prost(uint32, optional, tag = "1")]
    pub max_supported_total_tokens: ::core::option::Option<u32>,
 }
 /// Grammar kinds referenced by `NextTokenChooserParameters::grammar_type`.
 ///
 /// The discriminants are the protobuf enum numbers (`#[repr(i32)]`).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum GrammarType {
    /// ProtoBuf name `GRAMMAR_TYPE_NONE`.
    None = 0,
    /// ProtoBuf name `GRAMMAR_TYPE_JSON`.
    Json = 1,
    /// ProtoBuf name `GRAMMAR_TYPE_REGEX`.
    Regex = 2,
 }
 impl GrammarType {
    /// String value of the enum field names used in the ProtoBuf definition.
    ///
    /// The values are not transformed in any way and thus are considered stable
    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
    pub fn as_str_name(&self) -> &'static str {
        match self {
            GrammarType::None => "GRAMMAR_TYPE_NONE",
            GrammarType::Json => "GRAMMAR_TYPE_JSON",
            GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
        }
    }
    /// Creates an enum from field names used in the ProtoBuf definition.
    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
        match value {
            "GRAMMAR_TYPE_NONE" => Some(Self::None),
            "GRAMMAR_TYPE_JSON" => Some(Self::Json),
            "GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
            _ => None,
        }
    }
 }
 /// Finish reasons referenced by `GeneratedText::finish_reason`.
 ///
 /// The discriminants are the protobuf enum numbers (`#[repr(i32)]`).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum FinishReason {
    /// ProtoBuf name `FINISH_REASON_LENGTH`.
    Length = 0,
    /// ProtoBuf name `FINISH_REASON_EOS_TOKEN`.
    EosToken = 1,
    /// ProtoBuf name `FINISH_REASON_STOP_SEQUENCE`.
    StopSequence = 2,
 }
 impl FinishReason {
    /// String value of the enum field names used in the ProtoBuf definition.
    ///
    /// The values are not transformed in any way and thus are considered stable
    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
    pub fn as_str_name(&self) -> &'static str {
        match self {
            FinishReason::Length => "FINISH_REASON_LENGTH",
            FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
            FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
        }
    }
    /// Creates an enum from field names used in the ProtoBuf definition.
    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
        match value {
            "FINISH_REASON_LENGTH" => Some(Self::Length),
            "FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
            "FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
            _ => None,
        }
    }
 }
 /// Generated gRPC client for the `generate.v2.TextGenerationService` service.
 pub mod text_generation_service_client {
    #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
    use tonic::codegen::http::Uri;
    use tonic::codegen::*;
    /// Thin wrapper around `tonic::client::Grpc<T>` exposing one method per RPC.
    #[derive(Debug, Clone)]
    pub struct TextGenerationServiceClient<T> {
        inner: tonic::client::Grpc<T>,
    }
    impl TextGenerationServiceClient<tonic::transport::Channel> {
        /// Builds an endpoint from `dst`, connects to it and wraps the
        /// resulting channel in a new client.
        pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
        where
            D: TryInto<tonic::transport::Endpoint>,
            D::Error: Into<StdError>,
        {
            let endpoint = tonic::transport::Endpoint::new(dst)?;
            let channel = endpoint.connect().await?;
            Ok(Self::new(channel))
        }
    }
    impl<T> TextGenerationServiceClient<T>
    where
        T: tonic::client::GrpcService<tonic::body::BoxBody>,
        T::Error: Into<StdError>,
        T::ResponseBody: Body<Data = Bytes> + Send + 'static,
        <T::ResponseBody as Body>::Error: Into<StdError> + Send,
    {
        /// Wraps an existing gRPC service in a client.
        pub fn new(inner: T) -> Self {
            Self {
                inner: tonic::client::Grpc::new(inner),
            }
        }
        /// Wraps an existing gRPC service in a client that uses `origin`.
        pub fn with_origin(inner: T, origin: Uri) -> Self {
            Self {
                inner: tonic::client::Grpc::with_origin(inner, origin),
            }
        }
        /// Wraps `inner` in an `InterceptedService` driven by `interceptor`
        /// and builds a client on top of it.
        pub fn with_interceptor<F>(
            inner: T,
            interceptor: F,
        ) -> TextGenerationServiceClient<InterceptedService<T, F>>
        where
            F: tonic::service::Interceptor,
            T::ResponseBody: Default,
            T: tonic::codegen::Service<
                http::Request<tonic::body::BoxBody>,
                Response = http::Response<
                    <T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
                >,
            >,
            <T as tonic::codegen::Service<http::Request<tonic::body::BoxBody>>>::Error:
                Into<StdError> + Send + Sync,
        {
            let intercepted = InterceptedService::new(inner, interceptor);
            TextGenerationServiceClient::new(intercepted)
        }
        /// Compress requests with the given encoding.
        ///
        /// The server has to support the encoding; otherwise it might answer
        /// with an error.
        #[must_use]
        pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
            let configured = self.inner.send_compressed(encoding);
            self.inner = configured;
            self
        }
        /// Enable decompressing responses.
        #[must_use]
        pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
            let configured = self.inner.accept_compressed(encoding);
            self.inner = configured;
            self
        }
        /// Limits the maximum size of a decoded message.
        ///
        /// Default: `4MB`
        #[must_use]
        pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
            let configured = self.inner.max_decoding_message_size(limit);
            self.inner = configured;
            self
        }
        /// Limits the maximum size of an encoded message.
        ///
        /// Default: `usize::MAX`
        #[must_use]
        pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
            let configured = self.inner.max_encoding_message_size(limit);
            self.inner = configured;
            self
        }
        /// Model Info.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/Info`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn info(
            &mut self,
            request: impl tonic::IntoRequest<super::InfoRequest>,
        ) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new("generate.v2.TextGenerationService", "Info");
            grpc_request.extensions_mut().insert(method);
            let route =
                http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Info");
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Service discovery.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/ServiceDiscovery`; fails
        /// with `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn service_discovery(
            &mut self,
            request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
        ) -> std::result::Result<tonic::Response<super::ServiceDiscoveryResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "ServiceDiscovery",
            );
            grpc_request.extensions_mut().insert(method);
            let route = http::uri::PathAndQuery::from_static(
                "/generate.v2.TextGenerationService/ServiceDiscovery",
            );
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Empties batch cache.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/ClearCache`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn clear_cache(
            &mut self,
            request: impl tonic::IntoRequest<super::ClearCacheRequest>,
        ) -> std::result::Result<tonic::Response<super::ClearCacheResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "ClearCache",
            );
            grpc_request.extensions_mut().insert(method);
            let route = http::uri::PathAndQuery::from_static(
                "/generate.v2.TextGenerationService/ClearCache",
            );
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Remove requests from a cached batch.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/FilterBatch`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn filter_batch(
            &mut self,
            request: impl tonic::IntoRequest<super::FilterBatchRequest>,
        ) -> std::result::Result<tonic::Response<super::FilterBatchResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "FilterBatch",
            );
            grpc_request.extensions_mut().insert(method);
            let route = http::uri::PathAndQuery::from_static(
                "/generate.v2.TextGenerationService/FilterBatch",
            );
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Warmup the model and compute max cache size.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/Warmup`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn warmup(
            &mut self,
            request: impl tonic::IntoRequest<super::WarmupRequest>,
        ) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "Warmup",
            );
            grpc_request.extensions_mut().insert(method);
            let route =
                http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Warmup");
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Prefill batch and decode first token.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/Prefill`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn prefill(
            &mut self,
            request: impl tonic::IntoRequest<super::PrefillRequest>,
        ) -> std::result::Result<tonic::Response<super::PrefillResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "Prefill",
            );
            grpc_request.extensions_mut().insert(method);
            let route =
                http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Prefill");
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Decode token for a list of prefilled batches.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/Decode`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn decode(
            &mut self,
            request: impl tonic::IntoRequest<super::DecodeRequest>,
        ) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "Decode",
            );
            grpc_request.extensions_mut().insert(method);
            let route =
                http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Decode");
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
        /// Health check.
        ///
        /// Unary call to `/generate.v2.TextGenerationService/Health`; fails with
        /// `tonic::Code::Unknown` when the underlying service is not ready.
        pub async fn health(
            &mut self,
            request: impl tonic::IntoRequest<super::HealthRequest>,
        ) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|err| {
                let message = format!("Service was not ready: {}", err.into());
                tonic::Status::new(tonic::Code::Unknown, message)
            })?;
            let mut grpc_request = request.into_request();
            let method = GrpcMethod::new(
                "generate.v2.TextGenerationService",
                "Health",
            );
            grpc_request.extensions_mut().insert(method);
            let route =
                http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Health");
            self.inner
                .unary(grpc_request, route, tonic::codec::ProstCodec::default())
                .await
        }
    }
 }
--- a/backends/client/src/v2/pb/mod.rs
+++ b/backends/client/src/v2/pb/mod.rs
@ -1,6 +0,0 @@
 // This file is @generated by prost-build.
 /// Module tree `generate::v2` wrapping the generated protobuf code.
 pub mod generate {
    /// Everything in this module is textually included from `generate.v2.rs`.
    pub mod v2 {
        include!("generate.v2.rs");
    }
 }
--- a/backends/client/src/v3/pb/generate.v3.rs
+++ b/backends/client/src/v3/pb/generate.v3.rs
@ -1,699 +0,0 @@
 // This file is @generated by prost-build.
 /// Empty message (declares no fields).
 /// NOTE(review): presumably the request of a `Health` RPC — confirm against the service definition.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HealthRequest {}
 /// Empty message (declares no fields).
 /// NOTE(review): presumably the response of a `Health` RPC — confirm against the service definition.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HealthResponse {}
 /// Empty request (declares no fields).
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct InfoRequest {}
 /// Information message with nine fields (tags 1 through 9).
 /// NOTE(review): presumably the response of an `Info` RPC — confirm against the service definition.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct InfoResponse {
    /// Boolean flag (field tag 1).
    /// NOTE(review): presumably tells the caller whether inputs must be padded — confirm.
    #[prost(bool, tag = "1")]
    pub requires_padding: bool,
    /// Data type, carried as a string (field tag 2).
    #[prost(string, tag = "2")]
    pub dtype: ::prost::alloc::string::String,
    /// Device type, carried as a string (field tag 3).
    #[prost(string, tag = "3")]
    pub device_type: ::prost::alloc::string::String,
    /// Optional window size (field tag 4); `None` when the field is not set.
    #[prost(uint32, optional, tag = "4")]
    pub window_size: ::core::option::Option<u32>,
    /// Unsigned counter (field tag 5).
    /// NOTE(review): presumably the number of speculated tokens — confirm against the .proto.
    #[prost(uint32, tag = "5")]
    pub speculate: u32,
    /// Boolean flag (field tag 6).
    /// NOTE(review): presumably whether chunked prefill is supported — confirm.
    #[prost(bool, tag = "6")]
    pub support_chunking: bool,
    /// Boolean flag (field tag 7).
    /// NOTE(review): presumably whether prefix caching is in use — confirm.
    #[prost(bool, tag = "7")]
    pub use_prefix_caching: bool,
    /// Attention implementation, carried as a string (field tag 8).
    #[prost(string, tag = "8")]
    pub attention_impl: ::prost::alloc::string::String,
    /// Block size (field tag 9).
    /// NOTE(review): unit is not visible here; presumably tokens per paged-attention block — confirm.
    #[prost(uint32, tag = "9")]
    pub block_size: u32,
 }
 /// Empty request (declares no fields).
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ServiceDiscoveryRequest {}
 /// Message carrying a list of shard urls.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ServiceDiscoveryResponse {
    /// Other shards urls (repeated string, field tag 1).
    #[prost(string, repeated, tag = "1")]
    pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
 /// Message carrying an optional batch id.
 /// NOTE(review): presumably the request of a `ClearCache` RPC — confirm against the service definition.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ClearCacheRequest {
    /// Optional batch id; `None` when the field is not set.
    #[prost(uint64, optional, tag = "1")]
    pub id: ::core::option::Option<u64>,
 }
 /// Empty response (declares no fields).
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ClearCacheResponse {}
 /// An image payload; used as the `Image` variant of `input_chunk::Chunk`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Image {
    /// Binary image data.
    #[prost(bytes = "vec", tag = "1")]
    pub data: ::prost::alloc::vec::Vec<u8>,
    /// Image MIME type.
    #[prost(string, tag = "2")]
    pub mimetype: ::prost::alloc::string::String,
 }
 /// One chunk of an `Input`: a protobuf `oneof` over the variants of `input_chunk::Chunk`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct InputChunk {
    /// The selected variant (tags 1 and 2); `None` when no variant is set.
    #[prost(oneof = "input_chunk::Chunk", tags = "1, 2")]
    pub chunk: ::core::option::Option<input_chunk::Chunk>,
 }
 /// Nested message and enum types in `InputChunk`.
 pub mod input_chunk {
    /// The variants of the `InputChunk::chunk` oneof.
    #[allow(clippy::derive_partial_eq_without_eq)]
    #[derive(Clone, PartialEq, ::prost::Oneof)]
    pub enum Chunk {
        /// Plain text data.
        #[prost(string, tag = "1")]
        Text(::prost::alloc::string::String),
        /// Image data.
        #[prost(message, tag = "2")]
        Image(super::Image),
    }
 }
 /// A list of `InputChunk`s; referenced by `Request::input_chunks`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Input {
    /// The chunks (repeated message, field tag 1).
    #[prost(message, repeated, tag = "1")]
    pub chunks: ::prost::alloc::vec::Vec<InputChunk>,
 }
 /// Sampling parameters attached to a `Request` through its `parameters` field.
 ///
 /// Note that the field tags are not in declaration order: `frequency_penalty`
 /// uses tag 9 and is declared before `watermark`, which uses tag 8.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct NextTokenChooserParameters {
    /// Exponential scaling of the output probability distribution.
    #[prost(float, tag = "1")]
    pub temperature: f32,
    /// Restricting to the k highest probability elements.
    #[prost(uint32, tag = "2")]
    pub top_k: u32,
    /// Restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    #[prost(float, tag = "3")]
    pub top_p: f32,
    /// Restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    /// NOTE(review): this description is identical to the one on `top_p`; it
    /// presumably should describe typical-p sampling — confirm against the .proto.
    #[prost(float, tag = "4")]
    pub typical_p: f32,
    /// Apply sampling on the logits.
    #[prost(bool, tag = "5")]
    pub do_sample: bool,
    /// Random seed for sampling.
    #[prost(uint64, tag = "6")]
    pub seed: u64,
    /// Repetition penalty.
    #[prost(float, tag = "7")]
    pub repetition_penalty: f32,
    /// Frequency penalty.
    #[prost(float, tag = "9")]
    pub frequency_penalty: f32,
    /// Token watermarking using "A Watermark for Large Language Models".
    #[prost(bool, tag = "8")]
    pub watermark: bool,
    /// Grammar (applied if not empty).
    #[prost(string, tag = "10")]
    pub grammar: ::prost::alloc::string::String,
    /// Grammar type, stored as the raw `i32` value of the `GrammarType` enumeration.
    #[prost(enumeration = "GrammarType", tag = "11")]
    pub grammar_type: i32,
 }
 /// Stopping criteria attached to a `Request` through its `stopping_parameters` field.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct StoppingCriteriaParameters {
    /// Maximum number of generated tokens.
    #[prost(uint32, tag = "1")]
    pub max_new_tokens: u32,
    /// Optional stopping sequences (repeated; may be empty).
    #[prost(string, repeated, tag = "2")]
    pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
    /// Ignore the end-of-sequence token; used for benchmarking.
    #[prost(bool, tag = "3")]
    pub ignore_eos_token: bool,
 }
 /// A single generation request; `Batch` carries a list of these in `requests`.
 ///
 /// Fields are declared out of tag order: `input_chunks` (tag 8) comes before
 /// `inputs` (tag 2).
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Request {
    /// Request ID.
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// The generation context as chunks; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "8")]
    pub input_chunks: ::core::option::Option<Input>,
    /// The generation context, stringified input_chunks.
    #[prost(string, tag = "2")]
    pub inputs: ::prost::alloc::string::String,
    /// Context truncation.
    /// NOTE(review): unit and direction of truncation are not visible here — confirm.
    #[prost(uint32, tag = "3")]
    pub truncate: u32,
    /// Next Token Chooser Parameters; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "4")]
    pub parameters: ::core::option::Option<NextTokenChooserParameters>,
    /// Stopping Criteria Parameters; `None` when the sub-message is not set.
    #[prost(message, optional, tag = "5")]
    pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
    /// Return prefill logprobs.
    #[prost(bool, tag = "6")]
    pub prefill_logprobs: bool,
    /// Return most likely n tokens.
    #[prost(uint32, tag = "7")]
    pub top_n_tokens: u32,
    /// Paged attention blocks.
    #[prost(uint32, repeated, tag = "9")]
    pub blocks: ::prost::alloc::vec::Vec<u32>,
    /// Paged attention slots.
    #[prost(uint32, repeated, tag = "10")]
    pub slots: ::prost::alloc::vec::Vec<u32>,
    /// LORA adapter index, carried as an optional string; `None` when the field is not set.
    #[prost(string, optional, tag = "11")]
    pub adapter_id: ::core::option::Option<::prost::alloc::string::String>,
    /// Tokens that can be retrieved from the KV cache.
    /// This value is set for the first prefill and never reset.
    #[prost(uint32, tag = "12")]
    pub cache_len: u32,
    /// Context truncation.
    /// NOTE(review): this description duplicates the one on `truncate`; going by
    /// the field name it presumably controls whether special tokens are added —
    /// confirm against the .proto.
    #[prost(bool, tag = "13")]
    pub add_special_tokens: bool,
    /// Chunk of tokens that must be computed for the first prefill.
    /// This value is set for the first prefill and never reset.
    #[prost(uint32, optional, tag = "14")]
    pub chunk_len: ::core::option::Option<u32>,
 }
 /// A batch of full `Request` messages.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Batch {
    /// Batch ID.
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// Individual requests.
    #[prost(message, repeated, tag = "2")]
    pub requests: ::prost::alloc::vec::Vec<Request>,
    /// Batch size (==len(requests)).
    #[prost(uint32, tag = "3")]
    pub size: u32,
    /// Maximum number of tokens this batch will grow to.
    #[prost(uint32, tag = "4")]
    pub max_tokens: u32,
    /// Maximum number of Paged Attention blocks.
    #[prost(uint32, tag = "5")]
    pub max_blocks: u32,
 }
 /// Reference to a batch that carries request IDs rather than full requests.
 ///
 /// Returned by the filter/prefill/decode responses and passed back in
 /// `DecodeRequest::batches` and `PrefillRequest::cached_batch`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct CachedBatch {
    /// Batch ID
    #[prost(uint64, tag = "1")]
    pub id: u64,
    /// Individual requests ids
    #[prost(uint64, repeated, tag = "2")]
    pub request_ids: ::prost::alloc::vec::Vec<u64>,
    /// Batch size (== len(requests))
    /// NOTE(review): this message has no `requests` field; presumably this
    /// equals `request_ids.len()` -- confirm against the server.
    #[prost(uint32, tag = "3")]
    pub size: u32,
    /// Maximum number of tokens this batch will grow to
    #[prost(uint32, tag = "4")]
    pub max_tokens: u32,
    /// Number of tokens in the next forward
    #[prost(uint32, tag = "5")]
    pub current_tokens: u32,
 }
 /// Complete generated text for a request, carried in `Generation::generated_text`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct GeneratedText {
    /// Output
    #[prost(string, tag = "1")]
    pub text: ::prost::alloc::string::String,
    /// Number of generated tokens
    #[prost(uint32, tag = "2")]
    pub generated_tokens: u32,
    /// Finish reason, stored as the raw `i32` value of the `FinishReason` enumeration
    #[prost(enumeration = "FinishReason", tag = "3")]
    pub finish_reason: i32,
    /// Seed (optional on the wire)
    #[prost(uint64, optional, tag = "4")]
    pub seed: ::core::option::Option<u64>,
 }
 /// A group of tokens described by four repeated fields.
 ///
 /// NOTE(review): the four vectors are presumably parallel (index `i` of each
 /// describes the same token) -- nothing in this file enforces that; confirm.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Tokens {
    /// Token IDs
    #[prost(uint32, repeated, tag = "1")]
    pub ids: ::prost::alloc::vec::Vec<u32>,
    /// Logprobs
    #[prost(float, repeated, tag = "2")]
    pub logprobs: ::prost::alloc::vec::Vec<f32>,
    /// Token texts
    #[prost(string, repeated, tag = "3")]
    pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
    /// Special-token flags
    #[prost(bool, repeated, tag = "4")]
    pub is_special: ::prost::alloc::vec::Vec<bool>,
 }
 /// Per-request generation output, returned in `PrefillResponse::generations`
 /// and `DecodeResponse::generations`.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Generation {
    /// Request ID
    #[prost(uint64, tag = "1")]
    pub request_id: u64,
    /// Prefill tokens (optional)
    #[prost(message, optional, tag = "2")]
    pub prefill_tokens: ::core::option::Option<Tokens>,
    /// Tokens (not documented in the ProtoBuf definition).
    /// NOTE(review): presumably the tokens generated by this step -- confirm.
    #[prost(message, optional, tag = "3")]
    pub tokens: ::core::option::Option<Tokens>,
    /// Complete generated text
    #[prost(message, optional, tag = "4")]
    pub generated_text: ::core::option::Option<GeneratedText>,
    /// Top tokens
    #[prost(message, repeated, tag = "5")]
    pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
 }
 /// Request message for the `FilterBatch` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FilterBatchRequest {
    /// Batch ID
    #[prost(uint64, tag = "1")]
    pub batch_id: u64,
    /// Requests to keep
    #[prost(uint64, repeated, tag = "2")]
    pub request_ids: ::prost::alloc::vec::Vec<u64>,
 }
 /// Response message for the `FilterBatch` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct FilterBatchResponse {
    /// Filtered Batch (cached)
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<CachedBatch>,
 }
 /// Request message for the `Prefill` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PrefillRequest {
    /// Batch
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<Batch>,
    /// Optional cached batch
    #[prost(message, optional, tag = "2")]
    pub cached_batch: ::core::option::Option<CachedBatch>,
 }
 /// Response message for the `Prefill` RPC: generations, the next cached
 /// batch, and timing information in nanoseconds.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct PrefillResponse {
    /// Generation
    #[prost(message, repeated, tag = "1")]
    pub generations: ::prost::alloc::vec::Vec<Generation>,
    /// Next batch (cached)
    #[prost(message, optional, tag = "2")]
    pub batch: ::core::option::Option<CachedBatch>,
    /// Forward elapsed time in nanoseconds
    #[prost(uint64, tag = "3")]
    pub forward_ns: u64,
    /// Decode elapsed time in nanoseconds
    #[prost(uint64, tag = "4")]
    pub decode_ns: u64,
    /// Total elapsed time in nanoseconds
    #[prost(uint64, tag = "5")]
    pub total_ns: u64,
    /// Concatenate elapsed time in nanoseconds (optional on the wire)
    #[prost(uint64, optional, tag = "6")]
    pub concat_ns: ::core::option::Option<u64>,
 }
 /// Request message for the `Decode` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DecodeRequest {
    /// Cached batches
    #[prost(message, repeated, tag = "1")]
    pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
 }
 /// Response message for the `Decode` RPC: generations, the next cached
 /// batch, and timing information in nanoseconds.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct DecodeResponse {
    /// Decodes
    #[prost(message, repeated, tag = "1")]
    pub generations: ::prost::alloc::vec::Vec<Generation>,
    /// Next batch (cached)
    #[prost(message, optional, tag = "2")]
    pub batch: ::core::option::Option<CachedBatch>,
    /// Forward elapsed time in nanoseconds
    #[prost(uint64, tag = "3")]
    pub forward_ns: u64,
    /// Decode elapsed time in nanoseconds
    #[prost(uint64, tag = "4")]
    pub decode_ns: u64,
    /// Total elapsed time in nanoseconds
    #[prost(uint64, tag = "5")]
    pub total_ns: u64,
    /// Concatenate elapsed time in nanoseconds (optional on the wire)
    #[prost(uint64, optional, tag = "6")]
    pub concat_ns: ::core::option::Option<u64>,
 }
 /// Request message for the `Warmup` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct WarmupRequest {
    /// Batch to warmup on
    #[prost(message, optional, tag = "1")]
    pub batch: ::core::option::Option<Batch>,
    /// Requested maximum input tokens (optional on the wire; not documented
    /// in the ProtoBuf definition).
    #[prost(uint32, optional, tag = "2")]
    pub max_input_tokens: ::core::option::Option<u32>,
    /// Maximum prefill tokens (not documented in the ProtoBuf definition).
    #[prost(uint32, tag = "3")]
    pub max_prefill_tokens: u32,
    /// Requested maximum total tokens (optional on the wire; not documented
    /// in the ProtoBuf definition).
    #[prost(uint32, optional, tag = "4")]
    pub max_total_tokens: ::core::option::Option<u32>,
 }
 /// Response message for the `Warmup` RPC.
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct WarmupResponse {
    /// Maximum number of tokens supported by the model
    #[prost(uint32, optional, tag = "1")]
    pub max_supported_total_tokens: ::core::option::Option<u32>,
    /// Maximum input tokens by clients should be equal to request value if it's set
    /// Otherwise warmup automatically allocates a value here
    #[prost(uint32, tag = "2")]
    pub max_input_tokens: u32,
    /// Maximum total tokens by clients should be equal to request value if it's set
    /// Otherwise warmup automatically allocates a value here
    #[prost(uint32, tag = "3")]
    pub max_total_tokens: u32,
 }
 /// Grammar type enumeration, represented as an `i32`.
 ///
 /// The ProtoBuf field name of each variant is available through
 /// `as_str_name` / `from_str_name`.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum GrammarType {
    /// ProtoBuf name `GRAMMAR_TYPE_NONE`.
    None = 0,
    /// ProtoBuf name `GRAMMAR_TYPE_JSON`.
    Json = 1,
    /// ProtoBuf name `GRAMMAR_TYPE_REGEX`.
    Regex = 2,
 }
 impl GrammarType {
    /// String value of the enum field names used in the ProtoBuf definition.
    ///
    /// The values are not transformed in any way and thus are considered stable
    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
    pub fn as_str_name(&self) -> &'static str {
        match self {
            GrammarType::None => "GRAMMAR_TYPE_NONE",
            GrammarType::Json => "GRAMMAR_TYPE_JSON",
            GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
        }
    }
    /// Creates an enum from field names used in the ProtoBuf definition.
    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
        match value {
            "GRAMMAR_TYPE_NONE" => Some(Self::None),
            "GRAMMAR_TYPE_JSON" => Some(Self::Json),
            "GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
            _ => None,
        }
    }
 }
 /// Finish reason enumeration, represented as an `i32`.
 ///
 /// `GeneratedText::finish_reason` stores the raw `i32` value of this enum.
 /// The ProtoBuf field name of each variant is available through
 /// `as_str_name` / `from_str_name`.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
 #[repr(i32)]
 pub enum FinishReason {
    /// ProtoBuf name `FINISH_REASON_LENGTH`.
    Length = 0,
    /// ProtoBuf name `FINISH_REASON_EOS_TOKEN`.
    EosToken = 1,
    /// ProtoBuf name `FINISH_REASON_STOP_SEQUENCE`.
    StopSequence = 2,
 }
 impl FinishReason {
    /// String value of the enum field names used in the ProtoBuf definition.
    ///
    /// The values are not transformed in any way and thus are considered stable
    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
    pub fn as_str_name(&self) -> &'static str {
        match self {
            FinishReason::Length => "FINISH_REASON_LENGTH",
            FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
            FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
        }
    }
    /// Creates an enum from field names used in the ProtoBuf definition.
    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
        match value {
            "FINISH_REASON_LENGTH" => Some(Self::Length),
            "FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
            "FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
            _ => None,
        }
    }
 }
 /// Generated client implementations.
 pub mod text_generation_service_client {
    #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
    use tonic::codegen::http::Uri;
    use tonic::codegen::*;
    #[derive(Debug, Clone)]
    pub struct TextGenerationServiceClient<T> {
        inner: tonic::client::Grpc<T>,
    }
    impl TextGenerationServiceClient<tonic::transport::Channel> {
        /// Attempt to create a new client by connecting to a given endpoint.
        pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
        where
            D: TryInto<tonic::transport::Endpoint>,
            D::Error: Into<StdError>,
        {
            let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
            Ok(Self::new(conn))
        }
    }
    impl<T> TextGenerationServiceClient<T>
    where
        T: tonic::client::GrpcService<tonic::body::BoxBody>,
        T::Error: Into<StdError>,
        T::ResponseBody: Body<Data = Bytes> + Send + 'static,
        <T::ResponseBody as Body>::Error: Into<StdError> + Send,
    {
        pub fn new(inner: T) -> Self {
            let inner = tonic::client::Grpc::new(inner);
            Self { inner }
        }
        pub fn with_origin(inner: T, origin: Uri) -> Self {
            let inner = tonic::client::Grpc::with_origin(inner, origin);
            Self { inner }
        }
        pub fn with_interceptor<F>(
            inner: T,
            interceptor: F,
        ) -> TextGenerationServiceClient<InterceptedService<T, F>>
        where
            F: tonic::service::Interceptor,
            T::ResponseBody: Default,
            T: tonic::codegen::Service<
                http::Request<tonic::body::BoxBody>,
                Response = http::Response<
                    <T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
                >,
            >,
            <T as tonic::codegen::Service<http::Request<tonic::body::BoxBody>>>::Error:
                Into<StdError> + Send + Sync,
        {
            TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
        }
        /// Compress requests with the given encoding.
        ///
        /// This requires the server to support it otherwise it might respond with an
        /// error.
        #[must_use]
        pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
            self.inner = self.inner.send_compressed(encoding);
            self
        }
        /// Enable decompressing responses.
        #[must_use]
        pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
            self.inner = self.inner.accept_compressed(encoding);
            self
        }
        /// Limits the maximum size of a decoded message.
        ///
        /// Default: `4MB`
        #[must_use]
        pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
            self.inner = self.inner.max_decoding_message_size(limit);
            self
        }
        /// Limits the maximum size of an encoded message.
        ///
        /// Default: `usize::MAX`
        #[must_use]
        pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
            self.inner = self.inner.max_encoding_message_size(limit);
            self
        }
        /// / Model Info
        pub async fn info(
            &mut self,
            request: impl tonic::IntoRequest<super::InfoRequest>,
        ) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Info");
            let mut req = request.into_request();
            req.extensions_mut()
                .insert(GrpcMethod::new("generate.v3.TextGenerationService", "Info"));
            self.inner.unary(req, path, codec).await
        }
        /// / Service discovery
        pub async fn service_discovery(
            &mut self,
            request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
        ) -> std::result::Result<tonic::Response<super::ServiceDiscoveryResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path = http::uri::PathAndQuery::from_static(
                "/generate.v3.TextGenerationService/ServiceDiscovery",
            );
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "ServiceDiscovery",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Empties batch cache
        pub async fn clear_cache(
            &mut self,
            request: impl tonic::IntoRequest<super::ClearCacheRequest>,
        ) -> std::result::Result<tonic::Response<super::ClearCacheResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path = http::uri::PathAndQuery::from_static(
                "/generate.v3.TextGenerationService/ClearCache",
            );
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "ClearCache",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Remove requests from a cached batch
        pub async fn filter_batch(
            &mut self,
            request: impl tonic::IntoRequest<super::FilterBatchRequest>,
        ) -> std::result::Result<tonic::Response<super::FilterBatchResponse>, tonic::Status>
        {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path = http::uri::PathAndQuery::from_static(
                "/generate.v3.TextGenerationService/FilterBatch",
            );
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "FilterBatch",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Warmup the model and compute max cache size
        pub async fn warmup(
            &mut self,
            request: impl tonic::IntoRequest<super::WarmupRequest>,
        ) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Warmup");
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "Warmup",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Prefill batch and decode first token
        pub async fn prefill(
            &mut self,
            request: impl tonic::IntoRequest<super::PrefillRequest>,
        ) -> std::result::Result<tonic::Response<super::PrefillResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Prefill");
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "Prefill",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Decode token for a list of prefilled batches
        pub async fn decode(
            &mut self,
            request: impl tonic::IntoRequest<super::DecodeRequest>,
        ) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Decode");
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "Decode",
            ));
            self.inner.unary(req, path, codec).await
        }
        /// / Health check
        pub async fn health(
            &mut self,
            request: impl tonic::IntoRequest<super::HealthRequest>,
        ) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Health");
            let mut req = request.into_request();
            req.extensions_mut().insert(GrpcMethod::new(
                "generate.v3.TextGenerationService",
                "Health",
            ));
            self.inner.unary(req, path, codec).await
        }
    }
 }
--- a/backends/client/src/v3/pb/mod.rs
+++ b/backends/client/src/v3/pb/mod.rs
@ -1,6 +0,0 @@
 // This file is @generated by prost-build.
 /// Module tree `generate::v3` wrapping the generated code.
 pub mod generate {
    /// Contents of `generate.v3.rs`, textually included at compile time.
    pub mod v3 {
        // The path is resolved relative to this source file.
        include!("generate.v3.rs");
    }
 }