text-generation-inference/backends/gaudi/server/text_generation_server/utils/quantization.py

import json
import os
from dataclasses import dataclass
from typing import Optional

from huggingface_hub import hf_hub_download
from text_generation_server.utils.weights import (
    WeightsLoader,
)


# TODO: Split this config to have a single config type per quant method
@dataclass
class _QuantizerConfig:
    """Quantization parameters collected from a checkpoint's JSON config files.

    Instances are built by ``_get_quantizer_config`` and read by ``get_loader``
    when it constructs the GPTQ/AWQ weights loader.
    """

    # Bit width, read from "bits" (or "w_bit" in quant_config.json).
    bits: int
    # Optional "checkpoint_format" entry of config.json's quantization_config;
    # None when the entry is absent.
    checkpoint_format: Optional[str]
    # Value of the "desc_act" config entry.
    desc_act: bool
    # Group size, read from "group_size" (or "q_group_size" in quant_config.json).
    groupsize: int
    # Name of the quantization scheme, e.g. "gptq" or "awq".
    quant_method: str
    # Symmetric flag: the "sym" entry, or the negation of "zero_point" when
    # that key is present instead.
    sym: bool


@dataclass
class _FP8QuantizerConfig:
    """Config returned when config.json declares the "fbgemm_fp8" quant method."""

    # Copied verbatim from quantization_config["activation_scale_ub"];
    # presumably an upper bound applied to activation scales — TODO confirm.
    activation_scale_ub: float


# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
    """Build the quantizer config for a model from its JSON config files.

    Three files are probed in order: ``config.json`` (its
    ``quantization_config`` section), ``quantize_config.json`` and
    ``quant_config.json``. Each one is looked up in the local ``model_id``
    directory first and fetched with ``hf_hub_download`` otherwise. The next
    file is only tried when loading or parsing the previous one raised.

    Parsing is deliberately best-effort: every failure is swallowed, values
    assigned before the failure are kept, and anything never assigned keeps
    its default (``bits=4``, ``groupsize=-1``, ``quant_method="gptq"``,
    ``checkpoint_format=None``, ``sym=False``, ``desc_act=False``).

    Args:
        model_id: Local model directory or hub repository id.
        revision: Hub revision forwarded to ``hf_hub_download``; not used
            when the file is found locally.

    Returns:
        ``_FP8QuantizerConfig`` when ``config.json`` declares the
        ``"fbgemm_fp8"`` quant method, otherwise a ``_QuantizerConfig``.
    """
    bits = 4
    groupsize = -1
    quant_method = "gptq"
    checkpoint_format = None
    sym = False
    desc_act = False

    try:
        data = _load_json_config(model_id, revision, "config.json")
        quant_config = data["quantization_config"]

        # FP8 config
        if quant_config["quant_method"] == "fbgemm_fp8":
            return _FP8QuantizerConfig(
                activation_scale_ub=quant_config["activation_scale_ub"]
            )

        if "zero_point" in quant_config:
            sym = not quant_config["zero_point"]
            quant_method = "awq"
        elif "sym" in quant_config:
            sym = quant_config["sym"]

        bits = quant_config["bits"]
        groupsize = quant_config["group_size"]
        # Order is important here, desc_act is missing on some real models:
        # the KeyError it raises must come last so the values above are kept
        # before falling back to the next config file.
        quant_method = quant_config["quant_method"]
        checkpoint_format = quant_config.get("checkpoint_format")
        desc_act = quant_config["desc_act"]
    except Exception:
        try:
            data = _load_json_config(model_id, revision, "quantize_config.json")
            bits = data["bits"]
            groupsize = data["group_size"]

            if "zero_point" in data:
                sym = not data["zero_point"]
                quant_method = "awq"
            elif "sym" in data:
                sym = data["sym"]

            # Detect AWQ before the desc_act lookup: a missing desc_act would
            # otherwise raise first and skip this check.
            if "version" in data and data["version"] == "GEMM":
                quant_method = "awq"
            desc_act = data["desc_act"]
        except Exception:
            try:
                data = _load_json_config(model_id, revision, "quant_config.json")
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                # Same ordering rationale as above: legacy AWQ
                # quant_config.json files carry no desc_act entry.
                if "version" in data and data["version"] == "GEMM":
                    quant_method = "awq"
                desc_act = data["desc_act"]
            except Exception:
                # Best effort: keep whatever was gathered so far.
                pass

    return _QuantizerConfig(
        bits=bits,
        groupsize=groupsize,
        quant_method=quant_method,
        checkpoint_format=checkpoint_format,
        sym=sym,
        desc_act=desc_act,
    )


def _load_json_config(model_id, revision, filename):
    """Load ``filename`` from the local ``model_id`` directory, else from the hub.

    Raises whatever ``hf_hub_download``, ``open`` or ``json.load`` raise; the
    caller treats any exception as "this config file is not usable".
    """
    local_path = os.path.join(model_id, filename)
    if os.path.exists(local_path):
        path = local_path
    else:
        path = hf_hub_download(model_id, filename=filename, revision=revision)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
    """Return the weights loader matching the requested quantization.

    The model's quantizer config is always resolved first, even for an
    unsupported ``quantize`` value.

    Args:
        quantize: ``"awq"``, ``"gptq"``, ``"fp8"`` or ``None`` (unquantized).
        model_id: Local model directory or hub repository id.
        revision: Optional hub revision used when resolving config files.

    Returns:
        A ``HybridFP8UnquantLoader`` for ``"fp8"``/``None``, or a
        ``GPTQWeightsLoader`` for ``"awq"``/``"gptq"``.

    Raises:
        ValueError: If ``quantize`` is not a supported value, or if
            ``"awq"``/``"gptq"`` is requested but the model's config resolved
            to something other than ``_QuantizerConfig``.
    """
    config = _get_quantizer_config(model_id, revision)

    if quantize is None or quantize == "fp8":
        from text_generation_server.layers.fp8 import HybridFP8UnquantLoader

        # Only the FP8 config carries activation_scale_ub; the default
        # _QuantizerConfig has no such attribute, hence the isinstance guard.
        scale_ub = (
            config.activation_scale_ub
            if isinstance(config, _FP8QuantizerConfig)
            else None
        )
        return HybridFP8UnquantLoader(scale_ub, to_fp8=quantize == "fp8")

    if quantize not in {"awq", "gptq"}:
        raise ValueError(f"Unknown quantization method: {quantize}")

    from text_generation_server.layers.gptq import GPTQWeightsLoader

    # TODO: improve check once we have one config type per quantize value
    if not isinstance(config, _QuantizerConfig):
        raise ValueError(
            f"Quantize is set to `{quantize}` but received a `{config.__class__.__name__}` config."
        )

    loader_kwargs = dict(
        bits=config.bits,
        desc_act=config.desc_act,
        groupsize=config.groupsize,
        quant_method=config.quant_method,
        quantize=quantize,
        sym=config.sym,
    )
    return GPTQWeightsLoader(**loader_kwargs)
Add Gaudi Backend (#3055) * wip(gaudi): import server and dockerfile from tgi-gaudi fork * feat(gaudi): new gaudi backend working * fix: fix style * fix prehooks issues * fix(gaudi): refactor server and implement requested changes 2025-02-28 11:14:58 +00:00			`import json`
			`import os`
			`from dataclasses import dataclass`
			`from typing import Optional`

			`from huggingface_hub import hf_hub_download`
			`from text_generation_server.utils.weights import (`
			`WeightsLoader,`
			`)`


			`# TODO: Split this config to have a single config type per quant method`
			`@dataclass`
			`class _QuantizerConfig:`
			`bits: int`
			`checkpoint_format: Optional[str]`
			`desc_act: bool`
			`groupsize: int`
			`quant_method: str`
			`sym: bool`


			`@dataclass`
			`class _FP8QuantizerConfig:`
			`activation_scale_ub: float`


			`# We should probably do this with Pytantic JSON deserialization,`
			`# but for now we'll stay close to the old _set_gptq_params.`
			`def _get_quantizer_config(model_id, revision):`
			`bits = 4`
			`groupsize = -1`
			`quant_method = "gptq"`
			`checkpoint_format = None`
			`sym = False`
			`desc_act = False`

			`filename = "config.json"`
			`try:`
			`if os.path.exists(os.path.join(model_id, filename)):`
			`filename = os.path.join(model_id, filename)`
			`else:`
			`filename = hf_hub_download(model_id, filename=filename, revision=revision)`
			`with open(filename, "r") as f:`
			`data = json.load(f)`

			`# FP8 config`
			`if data["quantization_config"]["quant_method"] == "fbgemm_fp8":`
			`return _FP8QuantizerConfig(`
			`activation_scale_ub=data["quantization_config"]["activation_scale_ub"]`
			`)`

			`if "zero_point" in data["quantization_config"]:`
			`sym = not data["quantization_config"]["zero_point"]`
			`quant_method = "awq"`
			`elif "sym" in data["quantization_config"]:`
			`sym = data["quantization_config"]["sym"]`

			`bits = data["quantization_config"]["bits"]`
			`groupsize = data["quantization_config"]["group_size"]`
			`# Order is important here, desc_act is missing on some real models`
			`quant_method = data["quantization_config"]["quant_method"]`
			`checkpoint_format = data["quantization_config"].get("checkpoint_format")`
			`desc_act = data["quantization_config"]["desc_act"]`
			`except Exception:`
			`filename = "quantize_config.json"`
			`try:`
			`if os.path.exists(os.path.join(model_id, filename)):`
			`filename = os.path.join(model_id, filename)`
			`else:`
			`filename = hf_hub_download(`
			`model_id, filename=filename, revision=revision`
			`)`
			`with open(filename, "r") as f:`
			`data = json.load(f)`
			`bits = data["bits"]`
			`groupsize = data["group_size"]`

			`if "zero_point" in data:`
			`sym = not data["zero_point"]`
			`quant_method = "awq"`
			`elif "sym" in data:`
			`sym = data["sym"]`

			`desc_act = data["desc_act"]`
			`if "version" in data and data["version"] == "GEMM":`
			`quant_method = "awq"`
			`except Exception:`
			`filename = "quant_config.json"`
			`try:`
			`if os.path.exists(os.path.join(model_id, filename)):`
			`filename = os.path.join(model_id, filename)`
			`else:`
			`filename = hf_hub_download(`
			`model_id, filename=filename, revision=revision`
			`)`
			`with open(filename, "r") as f:`
			`data = json.load(f)`
			`bits = data["w_bit"]`
			`groupsize = data["q_group_size"]`
			`desc_act = data["desc_act"]`
			`if "version" in data and data["version"] == "GEMM":`
			`quant_method = "awq"`
			`except Exception:`
			`pass`

			`return _QuantizerConfig(`
			`bits=bits,`
			`groupsize=groupsize,`
			`quant_method=quant_method,`
			`checkpoint_format=checkpoint_format,`
			`sym=sym,`
			`desc_act=desc_act,`
			`)`


			`def get_loader(`
			`quantize: Optional[str], model_id: str, revision: Optional[str]`
			`) -> WeightsLoader:`
			`quantizer_config = _get_quantizer_config(model_id, revision)`
			`if quantize in {"awq", "gptq"}:`
			`from text_generation_server.layers.gptq import GPTQWeightsLoader`

			`# TODO: improve check once we have one config type per quantize value`
			`if not isinstance(quantizer_config, _QuantizerConfig):`
			`raise ValueError(`
			f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config."
			`)`

Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113) * clean cuda/rocm code in hpu backend, enable flat_hpu Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix TP in pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust block table in hpu to improve performance Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable all the model. not testet yet Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * use tensor cache in hpu graph to avoid replay issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add moe support, fix qwen/mistral/mixtral crash Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix phimoe issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * gpt_bigcode could also go pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable dbrx remove some unused code Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * multi-modality initial PR Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust warmup and enable vlm Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix incorrect output in qwen2 idefics if hpu graph is used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove unused quantization code and enable awq/gptq int4 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix gptq issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable fp8 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup prefill remove model where pageattn is not used, set block table to None since it's not used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add warmup_decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove block_tables and prefill_cache_indices which will lead to dynamic shape Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix comment Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * missing gptj change... 
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix some issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove torch.where to fix incorrect output in hpu graph model Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * match the latest vllm_extension ops Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> 2025-04-14 13:58:13 +00:00			`return GPTQWeightsLoader(`
Add Gaudi Backend (#3055) * wip(gaudi): import server and dockerfile from tgi-gaudi fork * feat(gaudi): new gaudi backend working * fix: fix style * fix prehooks issues * fix(gaudi): refactor server and implement requested changes 2025-02-28 11:14:58 +00:00			`bits=quantizer_config.bits,`
Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113) * clean cuda/rocm code in hpu backend, enable flat_hpu Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix TP in pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust block table in hpu to improve performance Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable all the model. not testet yet Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * use tensor cache in hpu graph to avoid replay issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add moe support, fix qwen/mistral/mixtral crash Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix phimoe issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * gpt_bigcode could also go pageattn Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable dbrx remove some unused code Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * multi-modality initial PR Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * adjust warmup and enable vlm Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix incorrect output in qwen2 idefics if hpu graph is used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove unused quantization code and enable awq/gptq int4 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix gptq issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * enable fp8 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup prefill remove model where pageattn is not used, set block table to None since it's not used Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * add warmup_decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * warmup decode Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove block_tables and prefill_cache_indices which will lead to dynamic shape Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix comment Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * missing gptj change... 
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix some issue Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * remove torch.where to fix incorrect output in hpu graph model Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * match the latest vllm_extension ops Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> 2025-04-14 13:58:13 +00:00			`desc_act=quantizer_config.desc_act,`
Add Gaudi Backend (#3055) * wip(gaudi): import server and dockerfile from tgi-gaudi fork * feat(gaudi): new gaudi backend working * fix: fix style * fix prehooks issues * fix(gaudi): refactor server and implement requested changes 2025-02-28 11:14:58 +00:00			`groupsize=quantizer_config.groupsize,`
			`quant_method=quantizer_config.quant_method,`
			`quantize=quantize,`
			`sym=quantizer_config.sym,`
			`)`
			`elif quantize == "fp8" or quantize is None:`
			`from text_generation_server.layers.fp8 import HybridFP8UnquantLoader`

			`# Since the default for the quantize config is _QuantizerConfig,`
			`# we need to add this check to not get an attribute error`
			`activation_scale_ub = None`
			`if isinstance(quantizer_config, _FP8QuantizerConfig):`
			`activation_scale_ub = quantizer_config.activation_scale_ub`

			`return HybridFP8UnquantLoader(activation_scale_ub, to_fp8=quantize == "fp8")`
			`else:`
			`raise ValueError(f"Unknown quantization method: {quantize}")`