mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 12:54:52 +00:00
Early exit on server too.
This commit is contained in:
parent
6bcad66c6e
commit
d84b98b40f
1
.gitignore
vendored
1
.gitignore
vendored
@ -18,3 +18,4 @@ server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
|
|||||||
|
|
||||||
data/
|
data/
|
||||||
load_tests/*.json
|
load_tests/*.json
|
||||||
|
server/fbgemmm
|
||||||
|
@ -6,6 +6,8 @@ from typing import Dict, Optional
|
|||||||
from text_generation_server.utils.log import log_master
|
from text_generation_server.utils.log import log_master
|
||||||
|
|
||||||
ATTENTION = os.getenv("ATTENTION", "paged")
|
ATTENTION = os.getenv("ATTENTION", "paged")
|
||||||
|
_expected = {"paged", "flashdecoding", "flashinfer"}
|
||||||
|
assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
|
||||||
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
||||||
|
|
||||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||||
|
Loading…
Reference in New Issue
Block a user