add best of sequences to details

This commit is contained in:
OlivierDehaene 2023-03-09 14:27:39 +01:00
parent 9624d4060f
commit 9f4f2fc8e3
3 changed files with 145 additions and 90 deletions

View File

@@ -185,28 +185,35 @@ impl Infer {
&self,
request: GenerateRequest,
best_of: usize,
) -> Result<InferResponse, InferError> {
) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
// validate best_of parameter separately
let best_of = self.validation.validate_best_of(best_of)?;
// create multiple generate requests
let infer_responses: Vec<InferResponse> =
let mut infer_responses: Vec<InferResponse> =
try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
// get the sequence with the highest log probability per token
let mut max_index = 0;
let mut max_logprob: f32 = f32::MIN;
let mut best_response = None;
for response in infer_responses {
// sum logprobs of the generated tokens
let sequence_logprob = response.tokens.iter().map(|token| token.logprob).sum();
for (i, response) in infer_responses.iter().enumerate() {
// mean logprobs of the generated tokens
let sequence_logprob = response
.tokens
.iter()
.map(|token| token.logprob)
.sum::<f32>()
/ response.tokens.len() as f32;
// set best sequence
if sequence_logprob > max_logprob {
max_index = i;
max_logprob = sequence_logprob;
best_response = Some(response);
}
}
Ok(best_response.expect("best_response is None. This is a bug."))
let best_response = infer_responses.remove(max_index);
Ok((best_response, infer_responses))
}
}

View File

@@ -168,6 +168,20 @@ pub(crate) enum FinishReason {
StopSequence,
}
/// One alternative sequence generated for a `best_of > 1` request.
/// Carries the same per-sequence detail fields (text, finish reason,
/// token counts, seed, prefill and generated tokens) as the winning
/// sequence, so clients can inspect every candidate.
#[derive(Serialize, ToSchema)]
pub(crate) struct BestOfSequence {
/// Final decoded text of this candidate sequence.
#[schema(example = "test")]
pub generated_text: String,
/// Why generation stopped for this sequence (e.g. "length").
#[schema(example = "length")]
pub finish_reason: FinishReason,
/// Number of tokens generated for this sequence.
#[schema(example = 1)]
pub generated_tokens: u32,
/// RNG seed used for this sequence; None when sampling was unseeded.
#[schema(example = 42)]
pub seed: Option<u64>,
/// Tokens of the prompt (prefill phase).
pub prefill: Vec<PrefillToken>,
/// Tokens produced during generation.
pub tokens: Vec<Token>,
}
#[derive(Serialize, ToSchema)]
pub(crate) struct Details {
#[schema(example = "length")]
@@ -178,6 +192,8 @@ pub(crate) struct Details {
pub seed: Option<u64>,
pub prefill: Vec<PrefillToken>,
pub tokens: Vec<Token>,
#[serde(skip_serializing_if = "Option::is_none")]
pub best_of_sequences: Option<Vec<BestOfSequence>>,
}
#[derive(Serialize, ToSchema)]

View File

@@ -1,10 +1,10 @@
/// HTTP Server logic
use crate::infer::{InferError, InferStreamResponse};
use crate::infer::{InferError, InferResponse, InferStreamResponse};
use crate::validation::ValidationError;
use crate::{
CompatGenerateRequest, Details, ErrorResponse, FinishReason, GenerateParameters,
GenerateRequest, GenerateResponse, Infer, PrefillToken, StreamDetails, StreamResponse, Token,
Validation,
BestOfSequence, CompatGenerateRequest, Details, ErrorResponse, FinishReason,
GenerateParameters, GenerateRequest, GenerateResponse, Infer, PrefillToken, StreamDetails,
StreamResponse, Token, Validation,
};
use axum::extract::Extension;
use axum::http::{HeaderMap, Method, StatusCode};
@@ -130,20 +130,51 @@ async fn generate(
let details = req.0.parameters.details;
// Inference
let response = match req.0.parameters.best_of {
Some(best_of) if best_of > 1 => infer.generate_best_of(req.0, best_of).await?,
_ => infer.generate(req.0).await?,
let (response, best_of_responses) = match req.0.parameters.best_of {
Some(best_of) if best_of > 1 => {
let (response, best_of_responses) = infer.generate_best_of(req.0, best_of).await?;
(response, Some(best_of_responses))
}
_ => (infer.generate(req.0).await?, None),
};
// Token details
let details = match details {
true => Some(Details {
true => {
// convert best_of_responses
let best_of_sequences = best_of_responses.map(|responses: Vec<InferResponse>| {
responses
.into_iter()
.map(|response: InferResponse| {
// Add prompt if return_full_text
let mut output_text = response.generated_text.text;
if let Some(prompt) = &add_prompt {
output_text = prompt.clone() + &output_text;
}
BestOfSequence {
generated_text: output_text,
finish_reason: FinishReason::from(
response.generated_text.finish_reason,
),
generated_tokens: response.generated_text.generated_tokens,
prefill: response.prefill,
tokens: response.tokens,
seed: response.generated_text.seed,
}
})
.collect()
});
Some(Details {
finish_reason: FinishReason::from(response.generated_text.finish_reason),
generated_tokens: response.generated_text.generated_tokens,
prefill: response.prefill,
tokens: response.tokens,
seed: response.generated_text.seed,
}),
best_of_sequences,
})
}
false => None,
};
@@ -444,6 +475,7 @@ pub async fn run(
PrefillToken,
Token,
GenerateResponse,
BestOfSequence,
Details,
FinishReason,
StreamResponse,