// text-generation-inference/router/src/validation.rs
use crate::validation::ValidationError::EmptyInput;
/// Payload validation logic
use crate::{GenerateParameters, GenerateRequest};
use rand::rngs::ThreadRng;
use rand::Rng;
use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
use thiserror::Error;
use tokenizers::tokenizer::Tokenizer;
use tokio::sync::{mpsc, oneshot};
use tracing::{instrument, Span};
/// Validation
#[derive(Debug, Clone)]
2022-10-17 16:27:33 +00:00
pub struct Validation {
2022-10-18 13:19:03 +00:00
/// Channel to communicate with the background validation task
sender: mpsc::UnboundedSender<ValidationRequest>,
}
impl Validation {
pub(crate) fn new(
workers: usize,
tokenizer: Tokenizer,
max_stop_sequences: usize,
max_input_length: usize,
max_total_tokens: usize,
) -> Self {
// Create channel
let (validation_sender, validation_receiver) = mpsc::unbounded_channel();
2022-10-18 13:19:03 +00:00
// Launch background validation task
tokio::spawn(validation_task(
workers,
tokenizer,
max_stop_sequences,
2022-10-18 13:19:03 +00:00
max_input_length,
max_total_tokens,
2022-10-18 13:19:03 +00:00
validation_receiver,
));
Self {
sender: validation_sender,
}
}
2022-10-18 13:19:03 +00:00
/// Validate a payload and get the number of tokens in the input
2023-02-13 12:02:45 +00:00
#[instrument(skip_all)]
pub(crate) async fn validate(
&self,
request: GenerateRequest,
) -> Result<ValidGenerateRequest, ValidationError> {
2022-10-18 13:19:03 +00:00
// Create response channel
let (sender, receiver) = oneshot::channel();
2022-10-18 13:19:03 +00:00
// Send request to the background validation task
// Unwrap is safe here
2023-02-13 12:02:45 +00:00
self.sender
.send((request, sender, Span::current()))
.unwrap();
2022-10-18 13:19:03 +00:00
// Await on response channel
// Unwrap is safe here
receiver.await.unwrap()
}
}
2022-10-18 13:19:03 +00:00
/// Validation task
/// Load balance the validation requests between multiple validation workers
async fn validation_task(
workers: usize,
tokenizer: Tokenizer,
max_stop_sequences: usize,
2022-10-18 13:19:03 +00:00
max_input_length: usize,
max_total_tokens: usize,
mut receiver: mpsc::UnboundedReceiver<ValidationRequest>,
2022-10-18 13:19:03 +00:00
) {
let mut workers_senders = Vec::with_capacity(workers);
// Create workers
for _ in 0..workers {
let tokenizer_clone: Tokenizer = tokenizer.clone().into();
2022-10-18 13:19:03 +00:00
// Create channel to communicate with worker
let (worker_sender, worker_receiver) = mpsc::channel(workers);
workers_senders.push(worker_sender);
// Spawn worker
tokio::task::spawn_blocking(move || {
validation_worker(
tokenizer_clone,
max_stop_sequences,
max_input_length,
max_total_tokens,
worker_receiver,
)
2022-10-18 13:19:03 +00:00
});
}
loop {
// Load balance requests between workers
for sender in workers_senders.iter() {
if let Some(validation_request) = receiver.recv().await {
sender.send(validation_request).await.unwrap();
} else {
return;
}
}
}
}
/// Check the parameters inside the payload and get the number of tokens inside the input using
/// the tokenizer
fn validation_worker(
tokenizer: Tokenizer,
max_stop_sequences: usize,
2022-10-18 13:19:03 +00:00
max_input_length: usize,
max_total_tokens: usize,
2022-10-18 13:19:03 +00:00
mut receiver: mpsc::Receiver<ValidationRequest>,
) {
// Seed rng
let mut rng = rand::thread_rng();
2022-10-18 13:19:03 +00:00
// Loop over requests
2023-02-13 12:02:45 +00:00
while let Some((request, response_tx, parent_span)) = receiver.blocking_recv() {
parent_span.in_scope(|| {
response_tx
.send(
validate(
request,
&tokenizer,
max_stop_sequences,
max_input_length,
max_total_tokens,
&mut rng,
)
.map_err(|err| {
metrics::increment_counter!("tgi_request_failure", "err" => "validation");
2023-02-13 12:02:45 +00:00
tracing::error!("{err}");
err
}),
)
.unwrap_or(())
})
}
}
fn validate(
request: GenerateRequest,
tokenizer: &Tokenizer,
max_stop_sequences: usize,
max_input_length: usize,
max_total_tokens: usize,
rng: &mut ThreadRng,
) -> Result<ValidGenerateRequest, ValidationError> {
let GenerateParameters {
temperature,
repetition_penalty,
top_k,
top_p,
do_sample,
max_new_tokens,
stop: stop_sequences,
seed,
..
} = request.parameters;
let temperature = temperature.unwrap_or(1.0);
if temperature <= 0.0 {
return Err(ValidationError::Temperature);
}
let repetition_penalty = repetition_penalty.unwrap_or(1.0);
if repetition_penalty <= 0.0 {
return Err(ValidationError::RepetitionPenalty);
}
let top_p = top_p.unwrap_or(1.0);
if top_p <= 0.0 || top_p > 1.0 {
return Err(ValidationError::TopP);
}
// Different because the proto default value is 0 while it is not a valid value
// for the user
let top_k: u32 = match top_k {
None => Ok(0),
Some(top_k) => {
if top_k <= 0 {
return Err(ValidationError::TopK);
}
Ok(top_k as u32)
}
}?;
if max_new_tokens == 0 {
return Err(ValidationError::MaxNewTokens);
}
if stop_sequences.len() > max_stop_sequences {
return Err(ValidationError::StopSequence(
max_stop_sequences,
stop_sequences.len(),
2023-01-20 11:24:39 +00:00
));
}
// If seed is None, assign a random one
let seed = match seed {
None => rng.gen(),
Some(seed) => seed,
};
// Check if inputs is empty
if request.inputs.is_empty() {
return Err(EmptyInput);
}
// Get the number of tokens in the input
match tokenizer.encode(request.inputs.clone(), true) {
Ok(encoding) => {
let input_length = encoding.len();
let total_tokens = input_length + max_new_tokens as usize;
if input_length > max_input_length {
Err(ValidationError::InputLength(max_input_length, input_length))
} else if total_tokens > max_total_tokens {
Err(ValidationError::MaxTotalTokens(
max_total_tokens,
input_length,
max_new_tokens,
))
} else {
// Return ValidGenerateRequest
let parameters = NextTokenChooserParameters {
temperature,
repetition_penalty,
top_k,
top_p,
do_sample,
seed,
};
let stopping_parameters = StoppingCriteriaParameters {
max_new_tokens,
stop_sequences,
};
metrics::histogram!("tgi_request_input_length", input_length as f64);
metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);
Ok(ValidGenerateRequest {
inputs: request.inputs,
input_length: input_length as u32,
parameters,
stopping_parameters,
})
2022-11-14 16:15:19 +00:00
}
2023-01-20 11:24:39 +00:00
}
Err(err) => Err(ValidationError::Tokenizer(err.to_string())),
}
}
2022-10-18 13:19:03 +00:00
type ValidationRequest = (
GenerateRequest,
oneshot::Sender<Result<ValidGenerateRequest, ValidationError>>,
2023-02-13 12:02:45 +00:00
Span,
2022-10-18 13:19:03 +00:00
);
/// A request that passed validation, with resolved parameters and a tokenized
/// input length, ready to be queued for generation.
#[derive(Debug)]
pub(crate) struct ValidGenerateRequest {
    /// Raw input text
    pub inputs: String,
    /// Number of tokens `inputs` encodes to
    pub input_length: u32,
    /// Resolved next-token sampling parameters
    pub parameters: NextTokenChooserParameters,
    /// Stopping criteria (max_new_tokens and stop sequences)
    pub stopping_parameters: StoppingCriteriaParameters,
}
2022-10-18 13:19:03 +00:00
#[derive(Error, Debug)]
pub enum ValidationError {
2022-10-27 12:25:29 +00:00
#[error("temperature must be strictly positive")]
2022-10-18 13:19:03 +00:00
Temperature,
#[error("repetition_penalty must be strictly positive")]
RepetitionPenalty,
#[error("top_p must be > 0.0 and <= 1.0")]
2022-10-18 13:19:03 +00:00
TopP,
2022-10-27 12:25:29 +00:00
#[error("top_k must be strictly positive")]
2022-10-18 13:19:03 +00:00
TopK,
#[error("max_new_tokens must be strictly positive")]
MaxNewTokens,
#[error("input tokens + max_new_tokens must be <= {0}. Given: {1} input tokens and {2} max_new_tokens")]
MaxTotalTokens(usize, usize, u32),
#[error("inputs must have less than {0} tokens. Given: {1}")]
2022-10-21 08:59:15 +00:00
InputLength(usize, usize),
#[error("inputs cannot be empty")]
EmptyInput,
#[error("stop supports up to {0} stop sequences. Given: {1}")]
StopSequence(usize, usize),
2022-11-14 16:15:19 +00:00
#[error("tokenizer error {0}")]
Tokenizer(String),
2022-10-18 13:19:03 +00:00
}