mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 03:44:54 +00:00
Add more wiggle room (lower the default TGI_WIGGLE_ROOM from 0.93 to 0.90).
This commit is contained in:
parent
375802948d
commit
90989a4a04
@ -15,9 +15,9 @@ PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in {
|
|||||||
PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "1").lower() in {"1", "true"}
|
PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "1").lower() in {"1", "true"}
|
||||||
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
||||||
_expected = {"paged", "flashdecoding", "flashdecoding-ipex", "flashinfer"}
|
_expected = {"paged", "flashdecoding", "flashdecoding-ipex", "flashinfer"}
|
||||||
assert (
|
assert ATTENTION in _expected, (
|
||||||
ATTENTION in _expected
|
f"Attention is not valid {ATTENTION}, expected {_expected}"
|
||||||
), f"Attention is not valid {ATTENTION}, expected {_expected}"
|
)
|
||||||
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
||||||
|
|
||||||
if PREFIX_CACHING and ATTENTION not in {
|
if PREFIX_CACHING and ATTENTION not in {
|
||||||
@ -28,7 +28,8 @@ if PREFIX_CACHING and ATTENTION not in {
|
|||||||
raise RuntimeError("Prefix caching is only supported with flashinfer")
|
raise RuntimeError("Prefix caching is only supported with flashinfer")
|
||||||
|
|
||||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||||
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93"))
|
# Test with a 70B model on 4xA100 under load to reproduce the latest failure.
|
||||||
|
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
|
||||||
assert TGI_WIGGLE_ROOM > 0
|
assert TGI_WIGGLE_ROOM > 0
|
||||||
assert TGI_WIGGLE_ROOM < 1
|
assert TGI_WIGGLE_ROOM < 1
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user