mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 07:42:06 +00:00
Remove useless files
Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
parent 07a0e2f7e6
commit c112ef1796
@@ -1,6 +1,4 @@
import os
import psutil
import signal
import sys
import typer

@@ -115,67 +113,6 @@ def serve(
        raise RuntimeError(
            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
        )

    logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))

    if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
        tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
        num_shard = int(os.getenv("WORLD_SIZE", "1"))
        logger.info("CLI SHARDED = {}".format(num_shard))
        import subprocess

        cmd = (
            f"deepspeed --num_nodes 1 --num_gpus {num_shard} --no_local_rank {tgi_file}"
        )
        cmd += f" --model_id {model_id} --revision {revision} --sharded {sharded}"
        cmd += f" --dtype {dtype} --trust_remote_code {trust_remote_code} --uds_path {uds_path}"
        cmd += f" --quantize {quantize} --max_input_tokens {max_input_tokens}"
        if speculate is not None:
            cmd += f"--speculate {speculate}"
        logger.info("CLI server start deepspeed ={} ".format(cmd))
        sys.stdout.flush()
        sys.stderr.flush()
        with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc:
            do_terminate = False
            current_handler = signal.getsignal(signal.SIGTERM)

            def terminate_handler(sig, frame):
                nonlocal do_terminate
                do_terminate = True
                if callable(current_handler):
                    current_handler(sig, frame)

            signal.signal(signal.SIGTERM, terminate_handler)

            finished = False
            while not finished:
                try:
                    if do_terminate:
                        parent = psutil.Process(proc.pid)
                        all_procs = parent.children(recursive=True) + [parent]
                        for p in all_procs:
                            try:
                                p.terminate()
                            except psutil.NoSuchProcess:
                                pass
                        _, alive = psutil.wait_procs(all_procs, timeout=30)
                        for p in alive:
                            p.kill()

                        do_terminate = False

                    proc.wait(timeout=3)
                except subprocess.TimeoutExpired:
                    pass
                else:
                    finished = True

            sys.stdout.flush()
            sys.stderr.flush()
            if proc.returncode != 0:
                logger.error(f"{cmd} exited with status = {proc.returncode}")
                return proc.returncode
    else:
        server.serve(
            model_id,
            lora_adapters,
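Note on the removed block above: it was the multi-card launch path that shelled out to the DeepSpeed launcher and supervised it, forwarding SIGTERM to the whole process tree with psutil before the plain in-process `server.serve(...)` fallback. As removed, `cmd += f"--speculate {speculate}"` also lacks a leading space, so `--speculate` would be glued onto the previous argument. Below is a minimal, TGI-independent sketch of that supervise-and-terminate pattern; `run_supervised` is an illustrative name, not code from this repository, and it uses only the subprocess/psutil/signal calls already visible in the diff.

import signal
import subprocess

import psutil


def run_supervised(cmd: str) -> int:
    """Run `cmd` in a shell and forward SIGTERM to its whole process tree."""
    with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc:
        do_terminate = False

        def terminate_handler(sig, frame):
            nonlocal do_terminate
            do_terminate = True

        signal.signal(signal.SIGTERM, terminate_handler)

        while True:
            if do_terminate:
                # Terminate the launcher and every child it spawned.
                parent = psutil.Process(proc.pid)
                all_procs = parent.children(recursive=True) + [parent]
                for p in all_procs:
                    try:
                        p.terminate()
                    except psutil.NoSuchProcess:
                        pass
                # Give processes 30 s to exit cleanly, then kill stragglers.
                _, alive = psutil.wait_procs(all_procs, timeout=30)
                for p in alive:
                    p.kill()
                do_terminate = False
            try:
                # Poll every 3 s so a pending SIGTERM can be acted on.
                proc.wait(timeout=3)
                break
            except subprocess.TimeoutExpired:
                pass
        return proc.returncode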
@@ -1,53 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import habana_frameworks.torch as htorch

quant_config = os.getenv("QUANT_CONFIG", "")
is_quantization_enabled = quant_config != ""

if is_quantization_enabled:
    os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")
    os.environ.setdefault("USE_DEFAULT_QUANT_PARAM", "true")
    os.environ.setdefault("UPDATE_GRAPH_OUTPUT_MME", "false")
    os.environ.setdefault("ENABLE_CALC_DYNAMIC_RANGE", "false")
    os.environ.setdefault("UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av")
    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")


def patch_scoped_linear_all_reduce(model):
    from deepspeed.module_inject.layers import LinearAllreduce
    from optimum.habana.transformers.models.modeling_all_models import (
        ScopedLinearAllReduce,
    )

    for name, module in model.named_children():
        if type(module) is LinearAllreduce:
            SL = ScopedLinearAllReduce(mod=module)
            setattr(model, name, SL)
        patch_scoped_linear_all_reduce(module)


def setup_quantization(model):
    if is_quantization_enabled:
        htorch.core.quantization._mark_params_as_const(model)
        htorch.core.quantization._check_params_as_const(model)
        htorch.core.hpu_initialize(model)
    return model


def prepare_model_for_quantization(model):
    if is_quantization_enabled:
        if model.config.model_type in [
            "llama",
            "falcon",
            "qwen2",
            "starcoder2",
            "gemma",
        ]:
            patch_scoped_linear_all_reduce(model)
        from neural_compressor.torch.quantization import FP8Config, convert

        config = FP8Config.from_json_file(quant_config)
        model = convert(model, config)
    return model
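Note on the deleted module: everything is gated on the QUANT_CONFIG environment variable. When it names an FP8 config JSON, the Habana env flags are set at import time, `prepare_model_for_quantization` converts the model with neural-compressor's `FP8Config`/`convert`, and `setup_quantization` marks parameters as constants and calls `hpu_initialize`. The part that generalizes is the recursive `patch_scoped_linear_all_reduce` swap; below is a minimal, hardware-independent sketch of the same named_children/setattr replacement pattern using plain torch.nn types, with a made-up `LoggedLinear` wrapper standing in for `ScopedLinearAllReduce`.

import torch.nn as nn


class LoggedLinear(nn.Module):
    """Stand-in wrapper: delegates to the Linear it replaces."""

    def __init__(self, mod: nn.Linear):
        super().__init__()
        self.mod = mod

    def forward(self, x):
        return self.mod(x)


def patch_linear(model: nn.Module):
    # Mirror of the removed helper: recurse over named_children and
    # replace matching submodules in place via setattr.
    for name, module in model.named_children():
        if type(module) is nn.Linear:
            setattr(model, name, LoggedLinear(mod=module))
        patch_linear(module)


model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
patch_linear(model)
print(model)  # both Linear layers are now wrapped in LoggedLinear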