mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 04:44:52 +00:00
fix: warn window_size_left when using flash attn 1
This commit is contained in:
parent 4b1005c7e1
commit 5123925101
@@ -3,6 +3,7 @@ from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models.globals import FLASH_DECODING, BLOCK_SIZE
 from text_generation_server.layers.attention import Seqlen
 from typing import Optional
+import warnings

 major, minor = torch.cuda.get_device_capability()
 is_sm75 = major == 7 and minor == 5
@@ -289,9 +290,11 @@ else:
         softcap=None,
     ):
         if window_size_left != -1:
-            raise NotImplementedError(
-                "window_size_left is only available with flash attn v2"
-            )
+            warnings.warn(
+                "window_size_left is only available with flash attn v2. It will be ignored.",
+                UserWarning,
+            )
+
         if softcap is not None:
             raise NotImplementedError("softcap is only available with flash attn v2")

@@ -338,3 +341,6 @@ else:
             0,
             None,
         )
+
+
+SUPPORTS_WINDOWING = True
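In practice, the change means that on a flash attn v1 setup a caller passing a sliding-window value no longer crashes: it gets a UserWarning and the window is dropped. Below is a minimal standalone sketch of the patched guard; the function name attention_v1 and its reduced signature are illustrative, not the real TGI API.

import warnings

SUPPORTS_WINDOWING = True  # mirrors the module-level flag added by this commit


def attention_v1(window_size_left=-1, softcap=None):
    # Patched behavior: warn and ignore instead of raising NotImplementedError.
    if window_size_left != -1:
        warnings.warn(
            "window_size_left is only available with flash attn v2. It will be ignored.",
            UserWarning,
        )
    # softcap still hard-fails, unchanged by this commit.
    if softcap is not None:
        raise NotImplementedError("softcap is only available with flash attn v2")
    # ... the flash attn v1 kernel call would follow here ...


# Callers now see a warning rather than an exception:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    attention_v1(window_size_left=256)
    assert any(issubclass(w.category, UserWarning) for w in caught)

Downgrading the error to a warning keeps models that request sliding-window attention loadable on flash attn v1 hardware (e.g. sm75), which is presumably why SUPPORTS_WINDOWING is set to True even though the window itself is ignored.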