From f5ee062cbd5c37ac38b70f15f01bb004c1e881e7 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 20 Aug 2024 09:14:57 +0200
Subject: [PATCH] Disable prefix caching for lora.

---
 launcher/src/main.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 1f47475b..4eeea02d 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1500,6 +1500,10 @@ fn main() -> Result<(), LauncherError> {
         match config.head_dim {
             Some(h) if h == 64 || h == 128 || h == 256 => {
                 // std::env::set_var("ATTENTION", "flashdecoding");
+                if args.lora_adapters.is_some() {
+                    tracing::info!("Disabling prefix caching because of lora adapters");
+                    std::env::set_var("USE_PREFIX_CACHING", "0");
+                }
             }
             _ => {
                 tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");