Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-19 15:52:08 +00:00)
Remove n_ctx

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

Parent: 051ff2d5ce
Commit: 09a745f1b8
@@ -130,7 +130,6 @@ impl LlamacppGGMLType {
 pub struct LlamacppConfig {
     pub model_gguf: String,
-    pub n_ctx: usize,
     pub max_batch_total_tokens: usize,
     pub max_physical_batch_total_tokens: usize,
     pub max_batch_size: usize,
@@ -206,7 +205,6 @@ struct Llamacpp {
     vocab: *const llamacpp::llama_vocab,
     logprobs: Vec<llamacpp::llama_token_data>,
     batch: llamacpp::llama_batch,
-    n_ctx: u32,
 }

 extern "C" fn llamacpp_log_callback(
@@ -251,7 +249,7 @@ impl Llamacpp {
         }
         let ctx = unsafe {
             let mut params = llamacpp::context_default_params();
-            params.n_ctx = conf.n_ctx as _;
+            params.n_ctx = conf.max_batch_total_tokens as _;
             params.n_batch = conf.max_batch_total_tokens as _;
             params.n_ubatch = conf.max_physical_batch_total_tokens as _;
             params.n_seq_max = conf.max_batch_size as _;
@@ -268,8 +266,6 @@ impl Llamacpp {
         if ctx.is_null() {
             return Err(BackendError::Llamacpp("Failed to init context".to_string()))
         }
-        let n_ctx = unsafe { llamacpp::n_ctx(ctx) };
-
         let vocab = unsafe {
             llamacpp::model_get_vocab(model)
         };
@@ -291,7 +287,7 @@ impl Llamacpp {
         let batch = unsafe {
             llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1)
         };
-        Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch})
+        Ok(Llamacpp{model, ctx, vocab, logprobs, batch})
     }

     fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) {
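The net effect on context setup: the llama.cpp context is now sized directly from the batching limits rather than from a separate n_ctx value. A minimal sketch of that wiring, assuming the crate's llamacpp bindings and the LlamacppConfig fields shown above; the helper name and the binding's params type name are illustrative, not part of this commit:

// Sketch only: build llama.cpp context params from the batching limits.
// `context_default_params()` and the fields mirror the bindings used in
// the diff; `context_params_for` is a hypothetical helper.
fn context_params_for(conf: &LlamacppConfig) -> llamacpp::llama_context_params {
    let mut params = unsafe { llamacpp::context_default_params() };
    params.n_ctx     = conf.max_batch_total_tokens as _; // was conf.n_ctx before this commit
    params.n_batch   = conf.max_batch_total_tokens as _;
    params.n_ubatch  = conf.max_physical_batch_total_tokens as _;
    params.n_seq_max = conf.max_batch_size as _;
    params
}

If the effective context size is needed later, it can still be read back from the context with llamacpp::n_ctx(ctx), the call this commit removes from the constructor, instead of being cached in the struct.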
@@ -559,9 +555,6 @@ impl LlamacppBackend {
                 }
                 break;
             }
-            let kv_cache_used_cells = unsafe {
-                llamacpp::get_kv_cache_used_cells(llamacpp.ctx)
-            };
             for seq in seqs.iter_mut() {
                 if !seq.running {
                     continue;
@@ -595,8 +588,6 @@ impl LlamacppBackend {
                     Some(FinishReason::EndOfSequenceToken)
                 } else if seq.n_new_tokens == requests[seq.id].max_new_tokens {
                     Some(FinishReason::Length)
-                } else if kv_cache_used_cells == llamacpp.n_ctx as i32 {
-                    Some(FinishReason::Length) // TODO: check
                 } else {
                     None
                 }
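With the kv-cache occupancy check gone, the finish-reason decision reduces to end-of-sequence versus length. A hedged sketch of the resulting logic; `reached_eos` stands in for the end-of-sequence check that precedes this hunk, and `seq`/`requests` are shaped as in the diff:

// Sketch of the finish-reason selection after this commit.
// `reached_eos` is a placeholder for the end-of-sequence check above the hunk.
let finish_reason = if reached_eos {
    Some(FinishReason::EndOfSequenceToken)
} else if seq.n_new_tokens == requests[seq.id].max_new_tokens {
    Some(FinishReason::Length)
} else {
    // The `kv_cache_used_cells == n_ctx` branch was dropped along with n_ctx.
    None
};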
@@ -24,10 +24,6 @@ struct Args {
     #[clap(long, env)]
     model_gguf: String, // TODO Option() with hf->gguf & quantize

-    /// Context size for the model.
-    #[clap(default_value = "4096", long, env)]
-    n_ctx: usize,
-
     /// Number of threads to use for generation.
     #[clap(long, env)]
     n_threads: Option<usize>,
@@ -198,11 +194,6 @@ async fn main() -> Result<(), RouterError> {
             "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
         ));
     }
-    if args.max_batch_total_tokens > args.n_ctx {
-        return Err(RouterError::ArgumentValidation(
-            "`max_batch_total_tokens` must be <= `n_ctx`".to_string(),
-        ));
-    }

     // TODO: check if we use the same cache of Server
     // check if llamacpp is faster
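On the router side, the `max_batch_total_tokens <= n_ctx` validation is no longer needed, since max_batch_total_tokens itself now sizes the llama.cpp context. A sketch of the check that remains, with args field names inferred from the quoted error message (they are not shown in this diff and may differ from the real Args struct):

// Sketch only: the remaining batch-size validation; field names are inferred
// from the error message, not taken from this diff.
if args.max_batch_size * args.max_total_tokens > args.max_batch_total_tokens {
    return Err(RouterError::ArgumentValidation(
        "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
    ));
}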
@@ -224,7 +215,6 @@ async fn main() -> Result<(), RouterError> {
     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
             model_gguf: args.model_gguf,
-            n_ctx: args.n_ctx,
             n_threads: n_threads,
             n_threads_batch: n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,