Early exit on server too.

2025-09-12 12:54:52 +00:00 · 2024-08-09 12:47:39 +02:00 · 2024-08-09 12:47:39 +02:00 · d84b98b40f
commit d84b98b40f
parent 6bcad66c6e
2 changed files with 3 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,4 @@ server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp

 data/
 load_tests/*.json
+server/fbgemmm
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -6,6 +6,8 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master

 ATTENTION = os.getenv("ATTENTION", "paged")
+_expected  = {"paged", "flashdecoding", "flashinfer"}
+assert ATTENTION in _expected, f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None