2025-02-28 11:14:58 +00:00
|
|
|
import os
|
|
|
|
from typing import Dict, Optional
|
|
|
|
from loguru import logger
|
|
|
|
from text_generation_server.utils.log import log_master
|
|
|
|
|
Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113)
* clean cuda/rocm code in hpu backend, enable flat_hpu
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix TP in pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust block table in hpu to improve performance
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable all the model. not testet yet
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* use tensor cache in hpu graph to avoid replay issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add moe support, fix qwen/mistral/mixtral crash
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix phimoe issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* gpt_bigcode could also go pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable dbrx remove some unused code
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* multi-modality initial PR
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust warmup and enable vlm
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix incorrect output in qwen2 idefics if hpu graph is used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove unused quantization code and enable awq/gptq int4
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix gptq issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable fp8
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup prefill
remove model where pageattn is not used, set block table to None since it's not used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add warmup_decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove block_tables and prefill_cache_indices which will lead to dynamic shape
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix comment
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* missing gptj change...
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix some issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove torch.where to fix incorrect output in hpu graph model
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* match the latest vllm_extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---------
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-04-14 13:58:13 +00:00
|
|
|
REQUEST_LOGPROBS = os.getenv("REQUEST_LOGPROBS", "0").lower() in {"1", "true"}
|
2025-02-28 11:14:58 +00:00
|
|
|
ATTENTION = os.getenv("ATTENTION", "default")
|
|
|
|
# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
|
|
|
|
PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in {
|
|
|
|
"1",
|
|
|
|
"true",
|
|
|
|
}
|
|
|
|
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
|
Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113)
* clean cuda/rocm code in hpu backend, enable flat_hpu
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix TP in pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust block table in hpu to improve performance
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable all the model. not testet yet
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* use tensor cache in hpu graph to avoid replay issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add moe support, fix qwen/mistral/mixtral crash
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix phimoe issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* gpt_bigcode could also go pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable dbrx remove some unused code
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* multi-modality initial PR
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust warmup and enable vlm
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix incorrect output in qwen2 idefics if hpu graph is used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove unused quantization code and enable awq/gptq int4
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix gptq issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable fp8
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup prefill
remove model where pageattn is not used, set block table to None since it's not used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add warmup_decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove block_tables and prefill_cache_indices which will lead to dynamic shape
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix comment
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* missing gptj change...
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix some issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove torch.where to fix incorrect output in hpu graph model
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* match the latest vllm_extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---------
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-04-14 13:58:13 +00:00
|
|
|
_expected = {"paged", "default"}
|
2025-02-28 11:14:58 +00:00
|
|
|
assert (
|
|
|
|
ATTENTION in _expected
|
|
|
|
), f"Attention is not valid {ATTENTION}, expected {_expected}"
|
|
|
|
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
|
|
|
|
|
|
|
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
|
|
|
|
assert TGI_WIGGLE_ROOM > 0
|
|
|
|
assert TGI_WIGGLE_ROOM < 1
|
|
|
|
|
|
|
|
# This is overridden by the cli
|
|
|
|
BLOCK_SIZE: int
|
|
|
|
|
Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113)
* clean cuda/rocm code in hpu backend, enable flat_hpu
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix TP in pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust block table in hpu to improve performance
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable all the model. not testet yet
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* use tensor cache in hpu graph to avoid replay issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add moe support, fix qwen/mistral/mixtral crash
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix phimoe issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* gpt_bigcode could also go pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable dbrx remove some unused code
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* multi-modality initial PR
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* adjust warmup and enable vlm
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix incorrect output in qwen2 idefics if hpu graph is used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove unused quantization code and enable awq/gptq int4
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix gptq issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* enable fp8
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup prefill
remove model where pageattn is not used, set block table to None since it's not used
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* add warmup_decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* warmup decode
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove block_tables and prefill_cache_indices which will lead to dynamic shape
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix comment
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* missing gptj change...
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix some issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* remove torch.where to fix incorrect output in hpu graph model
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* match the latest vllm_extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---------
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-04-14 13:58:13 +00:00
|
|
|
BLOCK_SIZE = 128
|
2025-02-28 11:14:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
# This is overridden at model loading.
|
|
|
|
global MODEL_ID
|
|
|
|
MODEL_ID = None
|
|
|
|
|
|
|
|
|
|
|
|
def set_model_id(model_id: str):
|
|
|
|
global MODEL_ID
|
|
|
|
MODEL_ID = model_id
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE: eventually we should move this into the router and pass back the
|
|
|
|
# index in all cases.
|
|
|
|
ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None
|
|
|
|
|
|
|
|
|
|
|
|
def set_adapter_to_index(adapter_to_index: Dict[str, int]):
|
|
|
|
global ADAPTER_TO_INDEX
|
|
|
|
ADAPTER_TO_INDEX = adapter_to_index
|
|
|
|
|
|
|
|
|
|
|
|
def get_adapter_to_index():
|
|
|
|
global ADAPTER_TO_INDEX
|
|
|
|
return ADAPTER_TO_INDEX
|