Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-09 11:24:53 +00:00

Commit: 9f4f2fc8e3 (parent: 9624d4060f)

add best of sequences to details
@@ -185,28 +185,35 @@ impl Infer {
         &self,
         request: GenerateRequest,
         best_of: usize,
-    ) -> Result<InferResponse, InferError> {
+    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
         // validate best_of parameter separately
         let best_of = self.validation.validate_best_of(best_of)?;

         // create multiple generate requests
-        let infer_responses: Vec<InferResponse> =
+        let mut infer_responses: Vec<InferResponse> =
             try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;

         // get the sequence with the highest log probability per token
+        let mut max_index = 0;
         let mut max_logprob: f32 = f32::MIN;
-        let mut best_response = None;
-        for response in infer_responses {
-            // sum logprobs of the generated tokens
-            let sequence_logprob = response.tokens.iter().map(|token| token.logprob).sum();
+
+        for (i, response) in infer_responses.iter().enumerate() {
+            // mean logprobs of the generated tokens
+            let sequence_logprob = response
+                .tokens
+                .iter()
+                .map(|token| token.logprob)
+                .sum::<f32>()
+                / response.tokens.len() as f32;

             // set best sequence
             if sequence_logprob > max_logprob {
+                max_index = i;
                 max_logprob = sequence_logprob;
-                best_response = Some(response);
             }
         }
-        Ok(best_response.expect("best_response is None. This is a bug."))
+        let best_response = infer_responses.remove(max_index);
+        Ok((best_response, infer_responses))
     }
 }

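Two details of the selection logic above are easy to miss: scoring switched from summed to mean logprobs, and the winner is now removed from the candidate list so the remaining sequences can be returned alongside it. The mean matters because a plain sum adds a negative term for every extra token, which biases selection toward shorter candidates. Below is a standalone sketch of the scoring rule; the Token struct and the best_sequence_index helper are simplified stand-ins for illustration, not the router's actual types.

    /// Simplified stand-in for the router's Token type.
    struct Token {
        logprob: f32,
    }

    /// Index of the candidate with the highest mean per-token logprob.
    /// Dividing by the length keeps long sequences competitive with short ones.
    fn best_sequence_index(candidates: &[Vec<Token>]) -> usize {
        let mut max_index = 0;
        let mut max_logprob = f32::MIN;
        for (i, tokens) in candidates.iter().enumerate() {
            let mean = tokens.iter().map(|t| t.logprob).sum::<f32>() / tokens.len() as f32;
            if mean > max_logprob {
                max_index = i;
                max_logprob = mean;
            }
        }
        max_index
    }

    fn main() {
        // Sum-based scoring would pick the short candidate (-0.2 > -0.3);
        // mean-based scoring picks the longer, per-token-better one (-0.1 > -0.2).
        let candidates = vec![
            vec![Token { logprob: -0.1 }, Token { logprob: -0.1 }, Token { logprob: -0.1 }],
            vec![Token { logprob: -0.2 }],
        ];
        assert_eq!(best_sequence_index(&candidates), 0);
    }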
@@ -168,6 +168,20 @@ pub(crate) enum FinishReason {
     StopSequence,
 }

+#[derive(Serialize, ToSchema)]
+pub(crate) struct BestOfSequence {
+    #[schema(example = "test")]
+    pub generated_text: String,
+    #[schema(example = "length")]
+    pub finish_reason: FinishReason,
+    #[schema(example = 1)]
+    pub generated_tokens: u32,
+    #[schema(example = 42)]
+    pub seed: Option<u64>,
+    pub prefill: Vec<PrefillToken>,
+    pub tokens: Vec<Token>,
+}
+
 #[derive(Serialize, ToSchema)]
 pub(crate) struct Details {
     #[schema(example = "length")]
@@ -178,6 +192,8 @@ pub(crate) struct Details {
     pub seed: Option<u64>,
     pub prefill: Vec<PrefillToken>,
     pub tokens: Vec<Token>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub best_of_sequences: Option<Vec<BestOfSequence>>,
 }

 #[derive(Serialize, ToSchema)]
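The skip_serializing_if attribute is what keeps this change backward compatible: when best_of_sequences is None (no best_of requested), the field is dropped from the JSON entirely and the details object looks exactly as it did before this commit. A minimal sketch of that serde behavior, using simplified hypothetical types rather than the router's structs (assumes the serde crate with the derive feature, plus serde_json):

    use serde::Serialize;

    #[derive(Serialize)]
    struct Details {
        finish_reason: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        best_of_sequences: Option<Vec<String>>,
    }

    fn main() {
        let without = Details {
            finish_reason: "length".to_string(),
            best_of_sequences: None,
        };
        let with = Details {
            finish_reason: "length".to_string(),
            best_of_sequences: Some(vec!["test".to_string()]),
        };
        // Prints {"finish_reason":"length"}; the None field is omitted entirely.
        println!("{}", serde_json::to_string(&without).unwrap());
        // Prints {"finish_reason":"length","best_of_sequences":["test"]}
        println!("{}", serde_json::to_string(&with).unwrap());
    }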
@@ -1,10 +1,10 @@
 /// HTTP Server logic
-use crate::infer::{InferError, InferStreamResponse};
+use crate::infer::{InferError, InferResponse, InferStreamResponse};
 use crate::validation::ValidationError;
 use crate::{
-    CompatGenerateRequest, Details, ErrorResponse, FinishReason, GenerateParameters,
-    GenerateRequest, GenerateResponse, Infer, PrefillToken, StreamDetails, StreamResponse, Token,
-    Validation,
+    BestOfSequence, CompatGenerateRequest, Details, ErrorResponse, FinishReason,
+    GenerateParameters, GenerateRequest, GenerateResponse, Infer, PrefillToken, StreamDetails,
+    StreamResponse, Token, Validation,
 };
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
@@ -87,21 +87,21 @@ async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json<ErrorRe

 /// Generate tokens
 #[utoipa::path(
-    post,
-    tag = "Text Generation Inference",
-    path = "/generate",
-    request_body = GenerateRequest,
-    responses(
-        (status = 200, description = "Generated Text", body = GenerateResponse),
-        (status = 424, description = "Generation Error", body = ErrorResponse,
-            example = json ! ({"error": "Request failed during generation"})),
-        (status = 429, description = "Model is overloaded", body = ErrorResponse,
-            example = json ! ({"error": "Model is overloaded"})),
-        (status = 422, description = "Input validation error", body = ErrorResponse,
-            example = json ! ({"error": "Input validation error"})),
-        (status = 500, description = "Incomplete generation", body = ErrorResponse,
-            example = json ! ({"error": "Incomplete generation"})),
-    )
+post,
+tag = "Text Generation Inference",
+path = "/generate",
+request_body = GenerateRequest,
+responses(
+(status = 200, description = "Generated Text", body = GenerateResponse),
+(status = 424, description = "Generation Error", body = ErrorResponse,
+example = json ! ({"error": "Request failed during generation"})),
+(status = 429, description = "Model is overloaded", body = ErrorResponse,
+example = json ! ({"error": "Model is overloaded"})),
+(status = 422, description = "Input validation error", body = ErrorResponse,
+example = json ! ({"error": "Input validation error"})),
+(status = 500, description = "Incomplete generation", body = ErrorResponse,
+example = json ! ({"error": "Incomplete generation"})),
+)
 )]
 #[instrument(
     skip(infer),
@@ -130,20 +130,51 @@ async fn generate(
     let details = req.0.parameters.details;

     // Inference
-    let response = match req.0.parameters.best_of {
-        Some(best_of) if best_of > 1 => infer.generate_best_of(req.0, best_of).await?,
-        _ => infer.generate(req.0).await?,
+    let (response, best_of_responses) = match req.0.parameters.best_of {
+        Some(best_of) if best_of > 1 => {
+            let (response, best_of_responses) = infer.generate_best_of(req.0, best_of).await?;
+            (response, Some(best_of_responses))
+        }
+        _ => (infer.generate(req.0).await?, None),
     };

     // Token details
     let details = match details {
-        true => Some(Details {
+        true => {
+            // convert best_of_responses
+            let best_of_sequences = best_of_responses.map(|responses: Vec<InferResponse>| {
+                responses
+                    .into_iter()
+                    .map(|response: InferResponse| {
+                        // Add prompt if return_full_text
+                        let mut output_text = response.generated_text.text;
+                        if let Some(prompt) = &add_prompt {
+                            output_text = prompt.clone() + &output_text;
+                        }
+
+                        BestOfSequence {
+                            generated_text: output_text,
+                            finish_reason: FinishReason::from(
+                                response.generated_text.finish_reason,
+                            ),
+                            generated_tokens: response.generated_text.generated_tokens,
+                            prefill: response.prefill,
+                            tokens: response.tokens,
+                            seed: response.generated_text.seed,
+                        }
+                    })
+                    .collect()
+            });
+
+            Some(Details {
             finish_reason: FinishReason::from(response.generated_text.finish_reason),
             generated_tokens: response.generated_text.generated_tokens,
             prefill: response.prefill,
             tokens: response.tokens,
             seed: response.generated_text.seed,
-        }),
+                best_of_sequences,
+            })
+        }
         false => None,
     };

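Putting the handler change in context: a client opts in by sending best_of together with details: true; the winning sequence fills the top-level response fields and, with this commit, the remaining candidates land under details.best_of_sequences. A hedged sketch of a request body for POST /generate follows; the field names mirror the GenerateParameters schema, but treat the exact parameter set (e.g. that best_of requires sampling to be enabled) as an assumption of this sketch, defined by the router's validation code rather than here.

    use serde_json::json; // serde_json = "1"

    fn main() {
        // Request body for POST /generate: two candidate sequences,
        // with details enabled so best_of_sequences is populated.
        let body = json!({
            "inputs": "What is Deep Learning?",
            "parameters": {
                "best_of": 2,
                "do_sample": true,
                "details": true
            }
        });
        println!("{}", serde_json::to_string_pretty(&body).unwrap());
    }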
@@ -222,26 +253,26 @@ async fn generate(

 /// Generate a stream of token using Server-Sent Events
 #[utoipa::path(
-    post,
-    tag = "Text Generation Inference",
-    path = "/generate_stream",
-    request_body = GenerateRequest,
-    responses(
-        (status = 200, description = "Generated Text", body = StreamResponse,
-            content_type = "text/event-stream"),
-        (status = 424, description = "Generation Error", body = ErrorResponse,
-            example = json ! ({"error": "Request failed during generation"}),
-            content_type = "text/event-stream"),
-        (status = 429, description = "Model is overloaded", body = ErrorResponse,
-            example = json ! ({"error": "Model is overloaded"}),
-            content_type = "text/event-stream"),
-        (status = 422, description = "Input validation error", body = ErrorResponse,
-            example = json ! ({"error": "Input validation error"}),
-            content_type = "text/event-stream"),
-        (status = 500, description = "Incomplete generation", body = ErrorResponse,
-            example = json ! ({"error": "Incomplete generation"}),
-            content_type = "text/event-stream"),
-    )
+post,
+tag = "Text Generation Inference",
+path = "/generate_stream",
+request_body = GenerateRequest,
+responses(
+(status = 200, description = "Generated Text", body = StreamResponse,
+content_type = "text/event-stream"),
+(status = 424, description = "Generation Error", body = ErrorResponse,
+example = json ! ({"error": "Request failed during generation"}),
+content_type = "text/event-stream"),
+(status = 429, description = "Model is overloaded", body = ErrorResponse,
+example = json ! ({"error": "Model is overloaded"}),
+content_type = "text/event-stream"),
+(status = 422, description = "Input validation error", body = ErrorResponse,
+example = json ! ({"error": "Input validation error"}),
+content_type = "text/event-stream"),
+(status = 500, description = "Incomplete generation", body = ErrorResponse,
+example = json ! ({"error": "Incomplete generation"}),
+content_type = "text/event-stream"),
+)
 )]
 #[instrument(
     skip(infer),
@@ -403,10 +434,10 @@ async fn generate_stream(

 /// Prometheus metrics scrape endpoint
 #[utoipa::path(
-    get,
-    tag = "Text Generation Inference",
-    path = "/metrics",
-    responses((status = 200, description = "Prometheus Metrics", body = String))
+get,
+tag = "Text Generation Inference",
+path = "/metrics",
+responses((status = 200, description = "Prometheus Metrics", body = String))
 )]
 async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String {
     prom_handle.render()
@@ -444,6 +475,7 @@ pub async fn run(
 PrefillToken,
 Token,
 GenerateResponse,
+BestOfSequence,
 Details,
 FinishReason,
 StreamResponse,
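This last hunk registers the new type with the OpenAPI document: utoipa only emits schemas that are explicitly listed under components, so without this line BestOfSequence would be referenced by Details yet missing from the generated spec. A minimal sketch of that mechanism as a standalone example against the utoipa crate (assuming a utoipa 3 style API; this is not the router's actual ApiDoc):

    use utoipa::{OpenApi, ToSchema};

    #[derive(ToSchema)]
    #[allow(dead_code)]
    struct BestOfSequence {
        generated_text: String,
    }

    // Deriving ToSchema is not enough on its own; the type must also be
    // listed in components(schemas(...)) to appear in the OpenAPI output.
    #[derive(OpenApi)]
    #[openapi(components(schemas(BestOfSequence)))]
    struct ApiDoc;

    fn main() {
        println!("{}", ApiDoc::openapi().to_pretty_json().unwrap());
    }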