diff --git a/router/src/infer.rs b/router/src/infer.rs index e917f68f..224d4c5d 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -223,7 +223,6 @@ impl Infer { (result_generated_text, result_queued, result_start) { Ok(InferResponse { - prompt_token_count: valid_request.input_length, prefill: result_prefill, _input_length, tokens: result_tokens, diff --git a/router/src/lib.rs b/router/src/lib.rs index c756065e..9716bde4 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -5,7 +5,6 @@ mod queue; pub mod server; mod validation; -use crate::validation::ValidGenerateRequest; use infer::{Infer, InferError, InferStreamResponse}; use queue::{Entry, Queue}; use serde::{Deserialize, Serialize}; @@ -17,7 +16,7 @@ use validation::Validation; /// Type alias for generation responses pub(crate) type GenerateStreamResponse = ( OwnedSemaphorePermit, - ValidGenerateRequest, + u32, // input_length UnboundedReceiverStream>, ); @@ -233,9 +232,9 @@ impl ChatCompletion { finish_reason: details.finish_reason.to_string(), }], usage: Usage { - prompt_tokens: details.prompt_token_count, + prompt_tokens: details.input_length, completion_tokens: details.generated_tokens, - total_tokens: details.prompt_token_count + details.generated_tokens, + total_tokens: details.input_length + details.generated_tokens, }, } } @@ -471,7 +470,7 @@ pub(crate) struct Details { #[serde(skip_serializing_if = "Vec::is_empty")] pub top_tokens: Vec>, #[schema(example = 1)] - pub prompt_token_count: u32, + pub input_length: u32, } #[derive(Serialize, ToSchema)] diff --git a/router/src/server.rs b/router/src/server.rs index a51c9033..e1a15c24 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -208,7 +208,7 @@ async fn generate( seed: response.generated_text.seed, best_of_sequences, top_tokens: response.top_tokens, - prompt_token_count: response.prompt_token_count, + input_length: response.input_length, }) } false => None,