From 69e0a87dd5100d57c5c21ce7195badcf98ed0c77 Mon Sep 17 00:00:00 2001
From: Mohit Sharma
Date: Thu, 13 Mar 2025 21:32:38 +0000
Subject: [PATCH] (fix) flashinfer

---
 server/text_generation_server/layers/attention/cuda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index 505fbafa..fb50dda6 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -80,7 +80,7 @@ def paged_attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
-            window_size_left=window_size_left,
+            window_left=window_size_left,
         )
     elif ATTENTION == "flashdecoding":
         max_q = 1
@@ -257,7 +257,7 @@ def attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
-            window_size_left=window_size_left,
+            window_left=window_size_left,
         )
 
     # If we are using flashdecoding or paged, we always use flash-attn for