Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-19 15:52:08 +00:00)
Thanks clippy
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
parent e4d5fa7eaf
commit fb81c0d1c4
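The diff applies a batch of clippy suggestions to the llama.cpp backend and its router `main`: `Option::map` instead of a manual `if let`/`else None`, `ptr::add` instead of `offset` with a cast, field init shorthand instead of `field: field`, plus a few smaller cleanups. Short standalone sketches of each lint follow the hunks they apply to.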
@@ -38,7 +38,7 @@ impl FromStr for LlamacppSplitMode {
             "row" => Ok(LlamacppSplitMode::Row),
             _ => match s.parse::<usize>() {
                 Ok(n) => Ok(LlamacppSplitMode::GPU(n)),
-                Err(_) => Err(format!("Choose a GPU number or `layer` or `row`")),
+                Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()),
             }
         }
     }
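This first hunk is the pattern clippy's `useless_format` lint flags: `format!` with a literal and no interpolated arguments goes through the formatting machinery just to produce a `String`, so `.to_string()` on the literal is the preferred spelling. A minimal standalone sketch of the same pattern (not TGI code):

    // Flagged by clippy::useless_format: nothing is being formatted.
    fn parse_err_old() -> Result<usize, String> {
        Err(format!("Choose a GPU number or `layer` or `row`"))
    }

    // Clippy-clean: convert the literal directly.
    fn parse_err_new() -> Result<usize, String> {
        Err("Choose a GPU number or `layer` or `row`".to_string())
    }

    fn main() {
        assert_eq!(parse_err_old(), parse_err_new());
    }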
@@ -176,8 +176,7 @@ impl LlamacppRequest {
         from: &ValidGenerateRequest,
         tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
     ) -> Option<Self>{
-        if let Some(input_ids) = from.input_ids.as_ref() {
-            Some(LlamacppRequest {
+        from.input_ids.as_ref().map(|input_ids| LlamacppRequest {
             input_ids: input_ids.iter().map(|&x| x as i32).collect(),
             top_k: from.parameters.top_k as _,
             top_p: from.parameters.top_p as _,
@@ -190,12 +189,9 @@ impl LlamacppRequest {
             penalty_freq: from.parameters.frequency_penalty as _,
             penalty_present: 0.0, // disabled
             max_new_tokens: from.stopping_parameters.max_new_tokens as _,
-            tx: tx,
+            tx,
             time: Instant::now(),
         })
-        } else {
-            None
-        }
     }
 }
 
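The two `LlamacppRequest` hunks above are clippy's `manual_map` fix: an `if let Some(x) = … { Some(…) } else { None }` block is exactly what `Option::map` expresses, so the branch and the explicit `None` arm go away (the `tx: tx,` line is the separate `redundant_field_names` lint, illustrated after the last hunk). A minimal sketch of the rewrite, using a hypothetical `Req` struct in place of `LlamacppRequest`:

    struct Req {
        input_ids: Vec<i32>,
    }

    // Before: clippy::manual_map flags this if-let/else-None shape.
    fn build_old(input_ids: Option<&Vec<u32>>) -> Option<Req> {
        if let Some(ids) = input_ids {
            Some(Req {
                input_ids: ids.iter().map(|&x| x as i32).collect(),
            })
        } else {
            None
        }
    }

    // After: Option::map says the same thing without the explicit None arm.
    fn build_new(input_ids: Option<&Vec<u32>>) -> Option<Req> {
        input_ids.map(|ids| Req {
            input_ids: ids.iter().map(|&x| x as i32).collect(),
        })
    }

    fn main() {
        let ids = vec![1u32, 2, 3];
        assert_eq!(
            build_old(Some(&ids)).map(|r| r.input_ids),
            build_new(Some(&ids)).map(|r| r.input_ids),
        );
    }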
@@ -404,7 +400,7 @@ impl LlamacppSampler {
         for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() {
             *logprob = llamacpp::llama_token_data {
                 id: token as _,
-                logit: unsafe { *logits.offset(token as _) },
+                logit: unsafe { *logits.add(token) },
                 p: 0.0,
             };
         }
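The `offset` → `add` change matches clippy's `ptr_offset_with_cast` lint: `offset` takes an `isize`, so indexing with the `usize` loop counter forces an `as _` cast, while `add` takes a `usize` directly and states that the step is non-negative. A small sketch with a raw pointer into a local slice (the real code reads llama.cpp's logits buffer):

    /// Caller keeps `token` in bounds of `logits`.
    fn read_logit(logits: &[f32], token: usize) -> f32 {
        assert!(token < logits.len());
        let ptr = logits.as_ptr();
        // Flagged: clippy::ptr_offset_with_cast, because offset() wants isize.
        let old = unsafe { *ptr.offset(token as isize) };
        // Preferred: add() takes usize, same pointer arithmetic, no cast.
        let new = unsafe { *ptr.add(token) };
        debug_assert_eq!(old, new);
        new
    }

    fn main() {
        let logits = [0.1_f32, 0.7, 0.2];
        assert_eq!(read_logit(&logits, 1), 0.7);
    }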
@@ -484,7 +480,7 @@ impl LlamacppBackend {
                 Ok(Some(request)) => {
                     let n_tokens_to_add = request.input_ids.len();

-                    if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens as usize {
+                    if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens {
                         flush(&mut requests, &mut n_tokens);
                     }
                     n_tokens += n_tokens_to_add;
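Dropping `as usize` here is clippy's `unnecessary_cast` lint: the accepted fix implies `conf.max_batch_total_tokens` is already a `usize`, so casting it to its own type is a no-op. The same pattern in isolation, with hypothetical values:

    fn main() {
        let max_batch_total_tokens: usize = 4096;
        let n_tokens: usize = 4000;
        let n_tokens_to_add: usize = 128;

        // Flagged by clippy::unnecessary_cast: the value is already usize.
        let flush_old = n_tokens + n_tokens_to_add > max_batch_total_tokens as usize;
        // Same comparison without the no-op cast.
        let flush_new = n_tokens + n_tokens_to_add > max_batch_total_tokens;

        assert_eq!(flush_old, flush_new);
    }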
@@ -511,7 +507,7 @@ impl LlamacppBackend {
             let _ = status_tx.send(true);

             while let Ok(requests) = sync_rx.recv() {
-                if shutdown_rx.borrow().clone() {
+                if *shutdown_rx.borrow() {
                     break;
                 }
                 let start_time = Instant::now();
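`shutdown_rx.borrow().clone()` → `*shutdown_rx.borrow()` is clippy's `clone_on_copy` lint: the borrow guard derefs to a `bool`, which is `Copy`, so `.clone()` is just a noisier copy. A small sketch assuming, as the surrounding code suggests, that `shutdown_rx` is a `tokio::sync::watch::Receiver<bool>` (requires tokio's `sync` feature):

    use tokio::sync::watch;

    fn should_stop(shutdown_rx: &watch::Receiver<bool>) -> bool {
        // clippy::clone_on_copy would flag `shutdown_rx.borrow().clone()`;
        // dereferencing the borrow guard copies the bool out instead.
        *shutdown_rx.borrow()
    }

    fn main() {
        // watch channels can be created and polled synchronously.
        let (shutdown_tx, shutdown_rx) = watch::channel(false);
        assert!(!should_stop(&shutdown_rx));
        shutdown_tx.send(true).unwrap();
        assert!(should_stop(&shutdown_rx));
    }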
@@ -521,7 +517,7 @@ impl LlamacppBackend {
                 for (seq_id, request) in requests.iter().enumerate() {
                     debug!("Request: {:?}", request);
                     // TODO remove this
-                    let sampler = match LlamacppSampler::new(&request) {
+                    let sampler = match LlamacppSampler::new(request) {
                         Some(sampler) => sampler,
                         _ => {
                             let _ = request.tx.send(Err(InferError::IncompleteGeneration));
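`LlamacppSampler::new(&request)` → `LlamacppSampler::new(request)` is clippy's `needless_borrow` lint: `requests.iter()` already yields references, so `&request` creates a `&&_` that the compiler only has to auto-deref again. In isolation, with hypothetical types:

    struct Request {
        id: u32,
    }

    fn handle(request: &Request) -> u32 {
        request.id
    }

    fn main() {
        let requests = vec![Request { id: 1 }, Request { id: 2 }];
        for (seq_id, request) in requests.iter().enumerate() {
            // `request` is already &Request here; `handle(&request)` would pass
            // a needless &&Request that clippy flags.
            let id = handle(request);
            println!("seq {seq_id}: request id {id}");
        }
    }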
@@ -543,7 +539,7 @@ impl LlamacppBackend {
                         batch_pos: llamacpp.batch.n_tokens as usize - 1,
                         token: llamacpp::LLAMA_TOKEN_NULL,
                         pos: last_pos as llamacpp::llama_pos + 1,
-                        sampler: sampler,
+                        sampler,
                         text: String::with_capacity(1024),
                         n_new_tokens: 0,
                         running: true,
@@ -584,8 +580,8 @@ impl LlamacppBackend {
                     let token = Token {
                         id: next as _,
                         text: piece,
-                        logprob: logprob,
-                        special: special,
+                        logprob,
+                        special,
                     };
                     let finish: Option<FinishReason> = {
                         if unsafe { llamacpp::vocab_is_eog(llamacpp.vocab, next) } {
@@ -598,7 +594,7 @@ impl LlamacppBackend {
                     };
                     if let Some(reason) = finish {
                         let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::End {
-                            token: token,
+                            token,
                             top_tokens: vec![],
                             generated_text: GeneratedText {
                                 text: seq.text.clone(),
@@ -613,7 +609,7 @@ impl LlamacppBackend {
                         continue;
                     }
                     let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::Intermediate {
-                        token: token,
+                        token,
                         top_tokens: vec![],
                     }));
                 }
@@ -215,7 +215,7 @@ async fn main() -> Result<(), RouterError> {
         .ok();
     let params = FromPretrainedParameters {
         revision: args.revision.clone(),
-        token: token,
+        token,
         ..Default::default()
     };
     Tokenizer::from_pretrained(
@@ -227,8 +227,8 @@ async fn main() -> Result<(), RouterError> {
     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
             model_gguf: args.model_gguf,
-            n_threads: n_threads,
-            n_threads_batch: n_threads_batch,
+            n_threads,
+            n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,
             split_mode: args.split_mode,
             defrag_threshold: args.defrag_threshold,
@@ -239,9 +239,9 @@ async fn main() -> Result<(), RouterError> {
             type_k: args.type_k,
             type_v: args.type_v,
             offload_kqv: args.offload_kqv,
-            max_batch_total_tokens: max_batch_total_tokens,
-            max_physical_batch_total_tokens: max_physical_batch_total_tokens,
-            max_batch_size: max_batch_size,
+            max_batch_total_tokens,
+            max_physical_batch_total_tokens,
+            max_batch_size,
             batch_timeout: tokio::time::Duration::from_millis(5),
         },
         tokenizer,
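The remaining hunks (`tx`, `sampler`, `logprob`, `special`, `token`, and the `LlamacppConfig` fields here) are all clippy's `redundant_field_names` lint: when the local variable already has the field's name, Rust's field init shorthand drops the repetition. A minimal sketch with a hypothetical config struct:

    struct Config {
        n_threads: usize,
        max_batch_size: usize,
    }

    fn main() {
        let n_threads = 8;
        let max_batch_size = 4;

        // Flagged by clippy::redundant_field_names.
        let verbose = Config {
            n_threads: n_threads,
            max_batch_size: max_batch_size,
        };

        // Field init shorthand builds the same value.
        let short = Config { n_threads, max_batch_size };

        assert_eq!(verbose.n_threads, short.n_threads);
        assert_eq!(verbose.max_batch_size, short.max_batch_size);
    }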