mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-22 15:32:08 +00:00
* clean cuda/rocm code in hpu backend, enable flat_hpu Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix TP in pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust block table in hpu to improve performance Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable all the model. not testet yet Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * use tensor cache in hpu graph to avoid replay issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add moe support, fix qwen/mistral/mixtral crash Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix phimoe issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * gpt_bigcode could also go pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable dbrx remove some unused code Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * multi-modality initial PR Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust warmup and enable vlm Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix incorrect output in qwen2 idefics if hpu graph is used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove unused quantization code and enable awq/gptq int4 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix gptq issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable fp8 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup prefill remove model where pageattn is not used, set block table to None since it's not used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add warmup_decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove block_tables and prefill_cache_indices which will lead to dynamic shape Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix comment Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * missing gptj change... 
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix some issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove torch.where to fix incorrect output in hpu graph model Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * match the latest vllm_extension ops Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
53 lines
1.4 KiB
Python
53 lines
1.4 KiB
Python
import os
|
|
from typing import Dict, Optional
|
|
from loguru import logger
|
|
from text_generation_server.utils.log import log_master
|
|
|
|
# Boolean flag parsed once, at import time, from the REQUEST_LOGPROBS
# environment variable: only "1" or "true" (any letter case) turn it on;
# an unset variable or any other value leaves it off.
REQUEST_LOGPROBS = os.environ.get("REQUEST_LOGPROBS", "0").lower() in ("true", "1")
|
|
# Attention implementation selector, read once at import time from the
# ATTENTION environment variable ("default" when unset).
ATTENTION = os.getenv("ATTENTION", "default")

# Disabled logic kept for reference: a prefix-caching default that would
# depend on the selected attention implementation.
# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"

# Prefix caching is opt-in: only "1" or "true" (any letter case) enable it.
PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in {"1", "true"}
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")

# Fail at import time when an unsupported attention implementation is
# requested, before the chosen value is logged.
_expected = {"paged", "default"}
assert (
    ATTENTION in _expected
), f"Attention is not valid {ATTENTION}, expected {_expected}"
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
|
|
|
# Fraction parsed from the TGI_WIGGLE_ROOM environment variable ("0.90" when
# unset). float() raises ValueError at import time for a non-numeric value.
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))

# The value must lie strictly between 0 and 1; NaN fails this check as well
# because every comparison with NaN is false.
assert (
    0 < TGI_WIGGLE_ROOM < 1
), f"TGI_WIGGLE_ROOM must be strictly between 0 and 1, got {TGI_WIGGLE_ROOM}"
|
|
|
|
# This is overridden by the cli; 128 is only the import-time default.
BLOCK_SIZE: int = 128
|
|
|
|
|
|
# This is overridden at model loading (see set_model_id); it stays None until
# then. No `global` declaration is needed here: at module scope every
# assignment already binds a module-level name.
MODEL_ID: Optional[str] = None
|
|
|
|
|
|
def set_model_id(model_id: str) -> None:
    """Store ``model_id`` in the module-level ``MODEL_ID`` global.

    Args:
        model_id: Identifier to record; it replaces any previous value.
    """
    # `global` is required so the assignment rebinds the module attribute
    # instead of creating a function-local name.
    global MODEL_ID
    MODEL_ID = model_id
|
|
|
|
|
|
# NOTE: eventually we should move this into the router and pass back the
# index in all cases.
# Mapping from adapter key (str) to integer index; None until
# set_adapter_to_index() installs one. Read it through
# get_adapter_to_index() so callers always see the current binding.
ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None
|
|
|
|
|
|
def set_adapter_to_index(adapter_to_index: Dict[str, int]) -> None:
    """Install ``adapter_to_index`` as the module-level ``ADAPTER_TO_INDEX``.

    The mapping is stored by reference (no copy is made), so later mutations
    of the passed dict are visible through the global.

    Args:
        adapter_to_index: Mapping from adapter key to integer index.
    """
    # `global` is required so the assignment rebinds the module attribute
    # instead of creating a function-local name.
    global ADAPTER_TO_INDEX
    ADAPTER_TO_INDEX = adapter_to_index
|
|
|
|
|
|
def get_adapter_to_index() -> Optional[Dict[str, int]]:
    """Return the current module-level ``ADAPTER_TO_INDEX`` value.

    Returns:
        The mapping most recently assigned to ``ADAPTER_TO_INDEX``, or
        ``None`` when none has been assigned. The object itself is returned,
        not a copy.
    """
    # Reading a module-level name needs no `global` declaration; the lookup
    # happens at call time, so the latest binding is always returned.
    return ADAPTER_TO_INDEX
|