Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-07-24 17:00:18 +00:00
(fix) flashinfer
commit 69e0a87dd5
parent ff82f0f84c
@@ -80,7 +80,7 @@ def paged_attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
-            window_size_left=window_size_left,
+            window_left=window_size_left,
         )
     elif ATTENTION == "flashdecoding":
         max_q = 1
@@ -257,7 +257,7 @@ def attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
-            window_size_left=window_size_left,
+            window_left=window_size_left,
         )

     # If we are using flashdecoding or paged, we always use flash-attn for
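Both hunks make the same one-line fix: TGI tracks the sliding-window size under the name window_size_left, while the flashinfer call site expects the keyword window_left, so the value has to be re-keyed when it is forwarded. A minimal sketch of the fixed call site, assuming the call goes through a flashinfer wrapper object as in TGI's flashinfer path (the function and parameter names below are hypothetical; only the keyword arguments come from the diff):

# Sketch of the call-site behavior this commit fixes. `decode_wrapper` stands
# in for the flashinfer wrapper state TGI holds; its name and the surrounding
# plumbing are hypothetical, while the keyword arguments match the diff above.
def run_decode_sketch(decode_wrapper, query, softmax_scale, window_size_left,
                      kv_scales, can_scale):
    # TGI names the sliding-window size `window_size_left`, but flashinfer's
    # forward takes it as `window_left`; passing `window_size_left=` would
    # typically fail with a TypeError for an unexpected keyword argument,
    # which is what the rename in this commit resolves.
    return decode_wrapper.forward(
        query,
        sm_scale=softmax_scale,
        k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
        v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
        window_left=window_size_left,  # flashinfer's keyword, not TGI's
    )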
|