Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 06:42:10 +00:00)
commit 3ece76392b (parent cdbf73eef8)

Apply suggestions from code review

Co-authored-by: drbh <david.richard.holtz@gmail.com>
@@ -1498,7 +1498,6 @@ fn main() -> Result<(), LauncherError> {
     let config: Config = config.into();
     match config.head_dim {
         Some(h) if h == 64 || h == 128 || h == 256 => {
-            // std::env::set_var("ATTENTION", "flashdecoding");
             if args.lora_adapters.is_some() {
                 tracing::info!("Disabling prefix caching because of lora adapters");
                 std::env::set_var("USE_PREFIX_CACHING", "0");
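The `USE_PREFIX_CACHING` variable set by the launcher above is consumed by the Python server as a plain environment flag. A minimal sketch of that consumer side, using only the standard-library `os` module (the helper name `prefix_caching_enabled` is illustrative, not TGI's actual API):

import os

def prefix_caching_enabled() -> bool:
    # Hypothetical helper: the launcher exports USE_PREFIX_CACHING=0 when
    # LoRA adapters are active; treat anything but an explicit "0" as on.
    return os.environ.get("USE_PREFIX_CACHING", "1") != "0"

if __name__ == "__main__":
    os.environ["USE_PREFIX_CACHING"] = "0"
    assert not prefix_caching_enabled()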
@@ -233,7 +233,6 @@ if ATTENTION == "flashinfer":
         causal=True,
         softcap=0.0,
     ):
-        # assert window_size_left == -1, "Windowing is not supported with flash infer"
         from text_generation_server.layers.attention.flashinfer import (
            prefill_with_paged_kv_state,
        )
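The Python hunk above sits under an `if ATTENTION == "flashinfer":` branch, with the flashinfer import deferred into the function body so the dependency is only loaded when that backend is selected. A minimal sketch of the same lazy-import dispatch, assuming the real `text_generation_server` package is importable (the `resolve_prefill_state` helper is hypothetical):

import importlib
import os

# Mirrors the module-level backend selector in the hunk above
# (the default value here is an assumption).
ATTENTION = os.environ.get("ATTENTION", "flashinfer")

def resolve_prefill_state():
    # Hypothetical helper: defer the flashinfer import until the backend
    # is actually selected, so other backends need not install it.
    if ATTENTION != "flashinfer":
        raise RuntimeError(f"flashinfer backend not selected: {ATTENTION}")
    mod = importlib.import_module(
        "text_generation_server.layers.attention.flashinfer"
    )
    return mod.prefill_with_paged_kv_state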