import os

from typing import Dict, Optional

from loguru import logger

from text_generation_server.utils.log import log_master

REQUEST_LOGPROBS = os.getenv("REQUEST_LOGPROBS", "0").lower() in {"1", "true"}

ATTENTION = os.getenv("ATTENTION", "default")
# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in {"1", "true"}
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
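
# Note: the boolean flags above accept only "1" or "true" (case-insensitive);
# any other value, e.g. "yes" or "on", parses as False.  A quick sketch of the
# rule (illustrative only, not executed here):
#
#   os.getenv("PREFIX_CACHING", "0").lower() in {"1", "true"}
#   # "1" -> True, "TRUE" -> True, "yes" -> False, unset -> False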

_expected = {"paged", "default"}
assert (
    ATTENTION in _expected
), f"Attention {ATTENTION!r} is not valid, expected one of {_expected}"
log_master(logger.info, f"Using Attention = {ATTENTION}")

TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert 0.0 < TGI_WIGGLE_ROOM < 1.0, "TGI_WIGGLE_ROOM must be in (0, 1)"
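
# TGI_WIGGLE_ROOM is a safety factor for memory budgeting.  A hedged sketch of
# the intended use (`free_memory` is a hypothetical caller-side value; the
# real consumer lives elsewhere in the server, not in this module):
#
#   usable_memory = int(free_memory * TGI_WIGGLE_ROOM)  # keep ~10% headroom
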

# This is overridden by the cli
BLOCK_SIZE: int = 128
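
# A hedged note on semantics: with paged attention, BLOCK_SIZE is the number
# of tokens stored per KV-cache block, so e.g. a 2048-token sequence needs
# ceil(2048 / BLOCK_SIZE) = 16 blocks at the default of 128.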

# This is overridden at model loading.
MODEL_ID: Optional[str] = None


def set_model_id(model_id: str):
    global MODEL_ID
    MODEL_ID = model_id


# NOTE: eventually we should move this into the router and pass back the
# index in all cases.
ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None


def set_adapter_to_index(adapter_to_index: Dict[str, int]):
    global ADAPTER_TO_INDEX
    ADAPTER_TO_INDEX = adapter_to_index


def get_adapter_to_index() -> Optional[Dict[str, int]]:
    # Reading a module-level name needs no `global` declaration.
    return ADAPTER_TO_INDEX
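
# Minimal usage sketch (hypothetical caller; in the real server these setters
# are invoked at startup / model load):
#
#   set_model_id("bigscience/bloom-560m")
#   set_adapter_to_index({"my-adapter": 0})
#   assert get_adapter_to_index() == {"my-adapter": 0}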