Remove n_ctx

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Author: Adrien Gallouët <angt@huggingface.co>
Date:   2025-02-05 11:31:58 +00:00
parent 051ff2d5ce
commit 09a745f1b8
2 changed files with 2 additions and 21 deletions
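
In effect, the llama.cpp context is now sized directly from the batching limits rather than from a separate n_ctx setting. Below is a minimal, self-contained sketch of the new sizing rule with placeholder numbers; the comments on what each llama.cpp parameter means (n_ctx, n_batch, n_ubatch, n_seq_max) are an assumption based on llama_context_params and are not part of this diff.

fn main() {
    // Hypothetical router settings, for illustration only.
    let max_batch_size: u32 = 4;
    let max_physical_batch_total_tokens: u32 = 2048;
    let max_batch_total_tokens: u32 = 8192;

    // After this commit every llama.cpp context parameter is derived from
    // the batching limits; there is no separate n_ctx knob anymore.
    let n_ctx = max_batch_total_tokens;             // KV cache / context size in tokens (was conf.n_ctx)
    let n_batch = max_batch_total_tokens;           // max logical batch per decode call
    let n_ubatch = max_physical_batch_total_tokens; // max physical (micro-)batch
    let n_seq_max = max_batch_size;                 // max parallel sequences

    println!("n_ctx={n_ctx} n_batch={n_batch} n_ubatch={n_ubatch} n_seq_max={n_seq_max}");
}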

@@ -130,7 +130,6 @@ impl LlamacppGGMLType {
 pub struct LlamacppConfig {
     pub model_gguf: String,
-    pub n_ctx: usize,
     pub max_batch_total_tokens: usize,
     pub max_physical_batch_total_tokens: usize,
     pub max_batch_size: usize,
@@ -206,7 +205,6 @@ struct Llamacpp {
     vocab: *const llamacpp::llama_vocab,
     logprobs: Vec<llamacpp::llama_token_data>,
     batch: llamacpp::llama_batch,
-    n_ctx: u32,
 }
 
 extern "C" fn llamacpp_log_callback(
@@ -251,7 +249,7 @@ impl Llamacpp {
         }
         let ctx = unsafe {
             let mut params = llamacpp::context_default_params();
-            params.n_ctx = conf.n_ctx as _;
+            params.n_ctx = conf.max_batch_total_tokens as _;
             params.n_batch = conf.max_batch_total_tokens as _;
             params.n_ubatch = conf.max_physical_batch_total_tokens as _;
             params.n_seq_max = conf.max_batch_size as _;
@@ -268,8 +266,6 @@ impl Llamacpp {
         if ctx.is_null() {
             return Err(BackendError::Llamacpp("Failed to init context".to_string()))
         }
-        let n_ctx = unsafe { llamacpp::n_ctx(ctx) };
-
         let vocab = unsafe {
             llamacpp::model_get_vocab(model)
         };
@@ -291,7 +287,7 @@ impl Llamacpp {
         let batch = unsafe {
             llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1)
         };
-        Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch})
+        Ok(Llamacpp{model, ctx, vocab, logprobs, batch})
     }
 
     fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) {
@@ -559,9 +555,6 @@ impl LlamacppBackend {
                     }
                     break;
                 }
-                let kv_cache_used_cells = unsafe {
-                    llamacpp::get_kv_cache_used_cells(llamacpp.ctx)
-                };
                 for seq in seqs.iter_mut() {
                     if !seq.running {
                         continue;
@@ -595,8 +588,6 @@ impl LlamacppBackend {
                         Some(FinishReason::EndOfSequenceToken)
                     } else if seq.n_new_tokens == requests[seq.id].max_new_tokens {
                         Some(FinishReason::Length)
-                    } else if kv_cache_used_cells == llamacpp.n_ctx as i32 {
-                        Some(FinishReason::Length) // TODO: check
                     } else {
                         None
                     }
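
Because the context is allocated with exactly max_batch_total_tokens cells (first hunk above) and the router keeps enforcing max_batch_size * max_total_tokens <= max_batch_total_tokens, the KV cache should never fill up mid-generation, which is presumably why the kv_cache_used_cells check and its extra Length branch were dropped. A self-contained sketch of the simplified finish-reason decision; the enum below only mirrors the variants visible in this diff and is not the backend's actual type:

// Illustrative only; not the backend's exact code.
#[derive(Debug)]
enum FinishReason {
    EndOfSequenceToken,
    Length,
}

fn finish_reason(is_eos: bool, n_new_tokens: usize, max_new_tokens: usize) -> Option<FinishReason> {
    if is_eos {
        Some(FinishReason::EndOfSequenceToken)
    } else if n_new_tokens == max_new_tokens {
        Some(FinishReason::Length)
    } else {
        // No "KV cache full" case anymore: the context holds the whole batch
        // by construction, so generation only stops on EOS or max_new_tokens.
        None
    }
}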

@@ -24,10 +24,6 @@ struct Args {
     #[clap(long, env)]
     model_gguf: String, // TODO Option() with hf->gguf & quantize
 
-    /// Context size for the model.
-    #[clap(default_value = "4096", long, env)]
-    n_ctx: usize,
-
     /// Number of threads to use for generation.
     #[clap(long, env)]
     n_threads: Option<usize>,
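
On the CLI side, only the n_ctx argument (and its "4096" default) disappears; the neighbouring arguments are untouched. A hedged sketch of this slice of the argument struct, assuming the same clap derive/env setup as the real main.rs:

use clap::Parser; // assumption: clap with the "derive" and "env" features

/// Sketch only: the fields adjacent to the removed argument.
#[derive(Parser)]
struct ArgsSketch {
    #[clap(long, env)]
    model_gguf: String,

    // n_ctx is gone; the context size is now derived from the
    // max_batch_total_tokens argument instead.

    /// Number of threads to use for generation.
    #[clap(long, env)]
    n_threads: Option<usize>,
}

fn main() {
    let args = ArgsSketch::parse();
    println!("model_gguf={}, n_threads={:?}", args.model_gguf, args.n_threads);
}
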
@@ -198,11 +194,6 @@ async fn main() -> Result<(), RouterError> {
             "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
         ));
     }
-    if args.max_batch_total_tokens > args.n_ctx {
-        return Err(RouterError::ArgumentValidation(
-            "`max_batch_total_tokens` must be <= `n_ctx`".to_string(),
-        ));
-    }
     // TODO: check if we use the same cache of Server
     // check if llamacpp is faster
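
With n_ctx gone, the max_batch_total_tokens <= n_ctx validation above has nothing left to compare against, so only the batch-size check remains. An illustrative helper showing the surviving check (the error text is copied from the diff; the function itself is not part of the codebase):

// Illustrative only; the real check lives inline in main() as shown above.
fn check_batch_limits(
    max_batch_size: usize,
    max_total_tokens: usize,
    max_batch_total_tokens: usize,
) -> Result<(), String> {
    if max_batch_size * max_total_tokens > max_batch_total_tokens {
        return Err(
            "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
        );
    }
    // No comparison against n_ctx anymore: the llama.cpp context is
    // allocated with max_batch_total_tokens cells, so it fits by construction.
    Ok(())
}
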
@@ -224,7 +215,6 @@ async fn main() -> Result<(), RouterError> {
     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
             model_gguf: args.model_gguf,
-            n_ctx: args.n_ctx,
             n_threads: n_threads,
             n_threads_batch: n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,