Mirror of https://github.com/huggingface/text-generation-inference.git
Update window size rocm flash decoding

parent b30cdabf68
commit 170a12f331
@@ -83,6 +83,8 @@ def paged_attention(
         max_k = max_s
         import flash_attn_2_cuda
 
+        window_size_right = -1 if window_size_left == -1 else 0
+
         if softcap is None:
             softcap = 0.0
         out = flash_attn_2_cuda.varlen_fwd(
@@ -102,8 +104,8 @@ def paged_attention(
             softmax_scale,
             False,  # zero_tensors
             True,  # causal
-            -1,  # Window_left
-            -1,  # Window right
+            window_size_left,  # Window_left
+            window_size_right,  # Window right
             softcap,
             False,  # return softmax
             None,  # generator
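
For context, a minimal sketch (not TGI code) of the sliding-window convention this diff relies on: flash-attn treats a window size of -1 as unbounded on that side, so the old hard-coded -1/-1 pair meant full causal attention, while the new code forwards window_size_left and collapses window_size_right to 0 for causal decoding. The in_window helper below is hypothetical, written only to illustrate the masking semantics.

# Hypothetical helper, assuming the standard flash-attn sliding-window
# convention: a window size of -1 means "unbounded" on that side.
def in_window(q_pos: int, k_pos: int,
              window_size_left: int, window_size_right: int) -> bool:
    """Return True if key position k_pos is visible to query position q_pos."""
    if window_size_left != -1 and q_pos - k_pos > window_size_left:
        return False  # key too far in the past
    if window_size_right != -1 and k_pos - q_pos > window_size_right:
        return False  # key too far in the future
    return True

# Mirroring the diff: with a finite left window, causal decoding needs a
# right window of 0 (no lookahead); -1/-1 would mean full attention.
window_size_left = 4096
window_size_right = -1 if window_size_left == -1 else 0
assert not in_window(5000, 500, window_size_left, window_size_right)   # outside left window
assert in_window(5000, 4999, window_size_left, window_size_right)      # inside window
assert not in_window(5000, 5001, window_size_left, window_size_right)  # no lookahead

Deriving window_size_right from window_size_left keeps the attention causal while letting a finite left window bound how far back into the KV cache each decoding step reads.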