mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 12:54:52 +00:00
Early exit on server too.
This commit is contained in:
parent
6bcad66c6e
commit
d84b98b40f
1
.gitignore
vendored
1
.gitignore
vendored
@ -18,3 +18,4 @@ server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
|
||||
|
||||
data/
|
||||
load_tests/*.json
|
||||
server/fbgemmm
|
||||
|
@ -6,6 +6,8 @@ from typing import Dict, Optional
|
||||
from text_generation_server.utils.log import log_master
|
||||
|
||||
ATTENTION = os.getenv("ATTENTION", "paged")
|
||||
_expected = {"paged", "flashdecoding", "flashinfer"}
|
||||
assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
|
||||
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
||||
|
||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||
|
Loading…
Reference in New Issue
Block a user