diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 19696372..fbe35828 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -15,9 +15,9 @@ PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in {
 PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "1").lower() in {"1", "true"}
 log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
 _expected = {"paged", "flashdecoding", "flashdecoding-ipex", "flashinfer"}
-assert (
-    ATTENTION in _expected
-), f"Attention is not valid {ATTENTION}, expected {_expected}"
+assert ATTENTION in _expected, (
+    f"Attention is not valid {ATTENTION}, expected {_expected}"
+)
 log_master(logger.info, f"Using Attention = {ATTENTION}")
 
 if PREFIX_CACHING and ATTENTION not in {
@@ -28,7 +28,8 @@ if PREFIX_CACHING and ATTENTION not in {
     raise RuntimeError("Prefix caching is only supported with flashinfer")
 
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93"))
+# Test a 70B model on 4xA100 under load for latest failure
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1
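
Context on the second hunk: `TGI_WIGGLE_ROOM` acts as a safety factor on the free-GPU-memory estimate used when sizing the KV cache, so lowering the default from 0.93 to 0.90 keeps roughly 3% more headroom for fragmentation, CUDA graphs, and temporary buffers under load. Below is a minimal sketch of that pattern, not the exact implementation in TGI; the helper name `kv_cache_blocks` and the `block_bytes` parameter are illustrative assumptions.

```python
import os

import torch

# Same env-var override and new default as in the patch.
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))


def kv_cache_blocks(block_bytes: int, device: torch.device) -> int:
    """Estimate how many KV-cache blocks fit in currently free GPU memory.

    Illustrative sketch only: identifiers here are assumptions, not the
    exact names used elsewhere in text-generation-inference.
    """
    free_memory, _total = torch.cuda.mem_get_info(device)
    # Scale the free-memory estimate down by the wiggle room so allocator
    # fragmentation and activation spikes under load do not cause OOMs.
    usable = int(free_memory * TGI_WIGGLE_ROOM)
    return usable // block_bytes
```

With 0.90 instead of 0.93, about 3% of the measured free memory that was previously handed to the KV cache is now reserved as headroom, which is what the 70B-on-4xA100 load-test failure referenced in the comment motivated.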