mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 04:44:52 +00:00
Fixing prefix attention.
This commit is contained in:
parent
44a77dcb9e
commit
0c90550e9d
@ -21,7 +21,6 @@
|
||||
from contextlib import contextmanager
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from loguru import logger
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
|
@ -6,8 +6,7 @@ from typing import Dict, Optional
|
||||
from text_generation_server.utils.log import log_master
|
||||
|
||||
PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", False)
|
||||
log_master(logger.info, f"Using Attention = {PREFIX_CACHING}")
|
||||
|
||||
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
||||
ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
|
||||
_expected = {"paged", "flashdecoding", "flashinfer"}
|
||||
assert (
|
||||
|
Loading…
Reference in New Issue
Block a user