mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
* clean cuda/rocm code in hpu backend, enable flat_hpu
* fix TP in pageattn
* adjust block table in hpu to improve performance
* enable all the models (not tested yet)
* use tensor cache in hpu graph to avoid replay issue
* add moe support, fix qwen/mistral/mixtral crash
* fix phimoe issue
* gpt_bigcode could also go pageattn
* enable dbrx, remove some unused code
* multi-modality initial PR
* adjust warmup and enable vlm
* fix incorrect output in qwen2 idefics if hpu graph is used
* remove unused quantization code and enable awq/gptq int4
* fix gptq issue
* enable fp8
* warmup prefill: remove models where pageattn is not used; set block table to None since it's not used
* add warmup_decode
* warmup decode
* remove block_tables and prefill_cache_indices, which would lead to dynamic shapes
* fix comment
* missing gptj change...
* fix some issues
* remove torch.where to fix incorrect output in hpu graph model
* match the latest vllm_extension ops

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
67 lines
1.8 KiB
Python
import os
from datetime import timedelta

import torch
from torch.distributed import ProcessGroup
from loguru import logger

# Tensor Parallelism settings
RANK = int(os.getenv("RANK", "0"))
WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))
MEMORY_FRACTION = float(os.getenv("HPU_MEMORY_FRACTION", "0.8"))
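# RANK and WORLD_SIZE follow the standard torch.distributed launcher
# convention (e.g. torchrun exports one value per worker). MEMORY_FRACTION
# is defined here for the rest of the backend; it is not referenced in this
# file.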


class FakeBarrier:
    # Stand-in for the handle returned by torch.distributed collectives:
    # wait() is a no-op because there is nothing to synchronize with.
    def wait(self):
        pass


class FakeGroup(ProcessGroup):
    # Minimal single-process stand-in for a real process group: collectives
    # degenerate to copies or no-ops, so the tensor-parallel code paths run
    # unchanged without a distributed backend.
    def __init__(self, rank, size):
        self._rank = rank
        self._size = size
        super().__init__(rank, size)

    def allreduce(self, *args, **kwargs):
        return FakeBarrier()

    def allgather(self, inputs, local_tensor, **kwargs):
        assert (
            len(inputs[0]) == len(local_tensor) == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        # With a single rank, gathering is just copying the local tensor
        # into the one output slot.
        for input_ in inputs:
            input_[0].data = local_tensor[0].data
        return FakeBarrier()

    def barrier(self, *args, **kwargs):
        return FakeBarrier()

    def size(self):
        return self._size

    def rank(self):
        return self._rank

    def _get_backend_name(self):
        return "fake"
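
# Illustrative sketch of the allgather contract implemented above (the
# variable names below are hypothetical, not from the original module):
#
#     local = [torch.tensor([RANK])]
#     outputs = [[torch.empty(1)]]      # one output slot per rank; here, one
#     FakeGroup(0, 1).allgather(outputs, local).wait()
#     # outputs[0][0] now shares the local tensor's data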

def initialize_torch_distributed():
    if WORLD_SIZE == 1:
        return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE
    else:
        if os.getenv("DEBUG", None) == "1":
            return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE

        if not torch.distributed.is_initialized():
            # Call the init process; "hccl" is the collective-communications
            # backend for Habana (HPU) devices.
            torch.distributed.init_process_group(
                backend="hccl",
                world_size=WORLD_SIZE,
                rank=RANK,
                timeout=timedelta(seconds=120),
            )
        else:
            logger.warning("torch.distributed is already initialized.")

        return torch.distributed.group.WORLD, RANK, WORLD_SIZE
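

# --- Usage sketch (illustrative, not part of the upstream module) ---
# With no launcher environment set, WORLD_SIZE defaults to 1 and the
# FakeGroup path is taken, so this runs on a plain single-process box.
if __name__ == "__main__":
    group, rank, world_size = initialize_torch_distributed()
    logger.info(f"initialized: rank={rank} world_size={world_size}")

    # Collectives return a waitable handle on both the fake and real paths.
    t = torch.ones(2)
    group.allreduce(t).wait()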