mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 07:42:06 +00:00
Remove useless files
Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
parent 07a0e2f7e6
commit c112ef1796
@@ -1,6 +1,4 @@
import os
import psutil
import signal
import sys
import typer

@@ -115,67 +113,6 @@ def serve(
        raise RuntimeError(
            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
        )

    logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))

    if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
        tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
        num_shard = int(os.getenv("WORLD_SIZE", "1"))
        logger.info("CLI SHARDED = {}".format(num_shard))
        import subprocess

        cmd = (
            f"deepspeed --num_nodes 1 --num_gpus {num_shard} --no_local_rank {tgi_file}"
        )
        cmd += f" --model_id {model_id} --revision {revision} --sharded {sharded}"
        cmd += f" --dtype {dtype} --trust_remote_code {trust_remote_code} --uds_path {uds_path}"
        cmd += f" --quantize {quantize} --max_input_tokens {max_input_tokens}"
        if speculate is not None:
            cmd += f"--speculate {speculate}"
        logger.info("CLI server start deepspeed ={} ".format(cmd))
        sys.stdout.flush()
        sys.stderr.flush()
        with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc:
            do_terminate = False
            current_handler = signal.getsignal(signal.SIGTERM)

            def terminate_handler(sig, frame):
                nonlocal do_terminate
                do_terminate = True
                if callable(current_handler):
                    current_handler(sig, frame)

            signal.signal(signal.SIGTERM, terminate_handler)

            finished = False
            while not finished:
                try:
                    if do_terminate:
                        parent = psutil.Process(proc.pid)
                        all_procs = parent.children(recursive=True) + [parent]
                        for p in all_procs:
                            try:
                                p.terminate()
                            except psutil.NoSuchProcess:
                                pass
                        _, alive = psutil.wait_procs(all_procs, timeout=30)
                        for p in alive:
                            p.kill()

                        do_terminate = False

                    proc.wait(timeout=3)
                except subprocess.TimeoutExpired:
                    pass
                else:
                    finished = True

            sys.stdout.flush()
            sys.stderr.flush()
            if proc.returncode != 0:
                logger.error(f"{cmd} exited with status = {proc.returncode}")
                return proc.returncode
    else:
        server.serve(
            model_id,
            lora_adapters,
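Note on the removed block above: it was the multi-card launch path that shelled out to the DeepSpeed launcher and supervised it, forwarding SIGTERM to the whole process tree with psutil before the plain in-process `server.serve(...)` fallback. As removed, `cmd += f"--speculate {speculate}"` also lacks a leading space, so `--speculate` would be glued onto the previous argument. Below is a minimal, TGI-independent sketch of that supervise-and-terminate pattern; `run_supervised` is an illustrative name, not code from this repository, and it uses only the subprocess/psutil/signal calls already visible in the diff.

import signal
import subprocess

import psutil


def run_supervised(cmd: str) -> int:
    """Run `cmd` in a shell and forward SIGTERM to its whole process tree."""
    with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc:
        do_terminate = False

        def terminate_handler(sig, frame):
            nonlocal do_terminate
            do_terminate = True

        signal.signal(signal.SIGTERM, terminate_handler)

        while True:
            if do_terminate:
                # Terminate the launcher and every child it spawned.
                parent = psutil.Process(proc.pid)
                all_procs = parent.children(recursive=True) + [parent]
                for p in all_procs:
                    try:
                        p.terminate()
                    except psutil.NoSuchProcess:
                        pass
                # Give processes 30 s to exit cleanly, then kill stragglers.
                _, alive = psutil.wait_procs(all_procs, timeout=30)
                for p in alive:
                    p.kill()
                do_terminate = False
            try:
                # Poll every 3 s so a pending SIGTERM can be acted on.
                proc.wait(timeout=3)
                break
            except subprocess.TimeoutExpired:
                pass
        return proc.returncode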
@@ -1,53 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import habana_frameworks.torch as htorch

quant_config = os.getenv("QUANT_CONFIG", "")
is_quantization_enabled = quant_config != ""

if is_quantization_enabled:
    os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")
    os.environ.setdefault("USE_DEFAULT_QUANT_PARAM", "true")
    os.environ.setdefault("UPDATE_GRAPH_OUTPUT_MME", "false")
    os.environ.setdefault("ENABLE_CALC_DYNAMIC_RANGE", "false")
    os.environ.setdefault("UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av")
    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")


def patch_scoped_linear_all_reduce(model):
    from deepspeed.module_inject.layers import LinearAllreduce
    from optimum.habana.transformers.models.modeling_all_models import (
        ScopedLinearAllReduce,
    )

    for name, module in model.named_children():
        if type(module) is LinearAllreduce:
            SL = ScopedLinearAllReduce(mod=module)
            setattr(model, name, SL)
        patch_scoped_linear_all_reduce(module)


def setup_quantization(model):
    if is_quantization_enabled:
        htorch.core.quantization._mark_params_as_const(model)
        htorch.core.quantization._check_params_as_const(model)
        htorch.core.hpu_initialize(model)
    return model


def prepare_model_for_quantization(model):
    if is_quantization_enabled:
        if model.config.model_type in [
            "llama",
            "falcon",
            "qwen2",
            "starcoder2",
            "gemma",
        ]:
            patch_scoped_linear_all_reduce(model)
        from neural_compressor.torch.quantization import FP8Config, convert

        config = FP8Config.from_json_file(quant_config)
        model = convert(model, config)
    return model
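Note on the deleted module: everything is gated on the QUANT_CONFIG environment variable. When it names an FP8 config JSON, the Habana env flags are set at import time, `prepare_model_for_quantization` converts the model with neural-compressor's `FP8Config`/`convert`, and `setup_quantization` marks parameters as constants and calls `hpu_initialize`. The part that generalizes is the recursive `patch_scoped_linear_all_reduce` swap; below is a minimal, hardware-independent sketch of the same named_children/setattr replacement pattern using plain torch.nn types, with a made-up `LoggedLinear` wrapper standing in for `ScopedLinearAllReduce`.

import torch.nn as nn


class LoggedLinear(nn.Module):
    """Stand-in wrapper: delegates to the Linear it replaces."""

    def __init__(self, mod: nn.Linear):
        super().__init__()
        self.mod = mod

    def forward(self, x):
        return self.mod(x)


def patch_linear(model: nn.Module):
    # Mirror of the removed helper: recurse over named_children and
    # replace matching submodules in place via setattr.
    for name, module in model.named_children():
        if type(module) is nn.Linear:
            setattr(model, name, LoggedLinear(mod=module))
        patch_linear(module)


model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
patch_linear(model)
print(model)  # both Linear layers are now wrapped in LoggedLinear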