Mirror of https://github.com/huggingface/text-generation-inference.git
Enable quantization with INC (#203)
Commit 0c3239e710 (parent ea48ae169a)
@@ -109,7 +109,7 @@ For more information and documentation about Text Generation Inference, checkout

 ## Running TGI with FP8 precision

-TGI supports FP8 precision runs within the limits provided by [Habana Quantization Toolkit](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models can be run with FP8 precision by properly setting the QUANT_CONFIG environment variable. Detailed instructions on how to use that variable can be found in the [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). Summarising those instructions for TGI:
+TGI supports FP8 precision runs within the limits provided by [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models can be run with FP8 precision by properly setting the QUANT_CONFIG environment variable. Detailed instructions on how to use that variable can be found in the [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). From the 2.0.4 release onwards, Intel Neural Compressor (INC) is used by default for measurement and quantization. Habana Quantization Toolkit (HQT) will be removed in future releases; to keep using HQT, disable INC by setting `-e USE_INC=0`. Summarising those instructions for TGI:

 1. Measure the quantization statistics of the requested model by using the [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8:~:text=use_deepspeed%20%2D%2Dworld_size%208-,run_lm_eval.py,-%5C%0A%2Do%20acc_70b_bs1_measure.txt).
 2. Run the requested model in TGI with the proper QUANT_CONFIG setting, e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json` (see the launch sketch below).
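For illustration only, a minimal launch sketch of step 2 above. The image tag, model id, and runtime flags are assumptions rather than values from this commit; it only shows how the QUANT_CONFIG and USE_INC variables described above are passed to the container:

```python
# Hypothetical container launch for the FP8 serving run (step 2 above).
# Image tag, model id, and flags are assumptions, not taken from this commit.
import subprocess

image = "ghcr.io/huggingface/tgi-gaudi:2.0.4"   # assumed image name and tag
model_id = "meta-llama/Llama-2-7b-hf"           # placeholder model

subprocess.run(
    [
        "docker", "run", "-p", "8080:80", "--runtime=habana",
        "-e", "HABANA_VISIBLE_DEVICES=all",
        # Point TGI at the quantization config produced by the measurement run (step 1).
        "-e", "QUANT_CONFIG=./quantization_config/maxabs_quant.json",
        # Uncomment to fall back to the legacy Habana Quantization Toolkit instead of INC:
        # "-e", "USE_INC=0",
        image,
        "--model-id", model_id,
    ],
    check=True,
)
```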
@@ -5,7 +5,8 @@ import sys

 assert "habana_frameworks" not in sys.modules

-is_quantization_enabled = os.getenv("QUANT_CONFIG", "") != ""
+quant_config = os.getenv("QUANT_CONFIG", "")
+is_quantization_enabled = quant_config != ""

 if is_quantization_enabled:
     os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")
@@ -15,3 +16,15 @@ if is_quantization_enabled:
     os.environ.setdefault(
         "UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av")
     os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+
+def prepare_model_for_quantization(model):
+    if is_quantization_enabled:
+        if os.getenv("USE_INC", "1") != "0":
+            from neural_compressor.torch.quantization import FP8Config, convert
+            config = FP8Config.from_json_file(quant_config)
+            model = convert(model, config)
+        else:
+            import habana_quantization_toolkit
+            habana_quantization_toolkit.prep_model(model)
+    return model
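A minimal usage sketch, not part of the commit, showing how the new helper is expected to be driven by the environment. The import path and model id are assumptions; `FP8Config.from_json_file` and `convert` are exactly the INC calls used in the hunk above:

```python
# Hypothetical caller sketch. QUANT_CONFIG must be set before the module is imported,
# because quant_config / is_quantization_enabled are read at import time.
import os

os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_quant.json"  # enables quantization
os.environ.setdefault("USE_INC", "1")  # "1" (default) -> INC path, "0" -> legacy HQT path

import text_generation_server.habana_quantization_env as hq_env  # assumed import path
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder model id
model = hq_env.prepare_model_for_quantization(model)  # FP8 conversion via INC, or HQT prep
```

When QUANT_CONFIG is unset, `is_quantization_enabled` stays False and the helper returns the model unchanged.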
@@ -807,14 +807,7 @@ class CausalLM(Model):
         if hq_env.is_quantization_enabled:
             if model.config.model_type == "llama":
                 self.patch_scoped_linear_all_reduce(model)
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.prep_model(model)
-        return model
-
-    def finish_quantization_measurements(self, model):
-        if hq_env.is_quantization_enabled:
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.finish_measurements(self.model)
+            model = hq_env.prepare_model_for_quantization(model)
         return model

     def patch_scoped_linear_all_reduce(self, model):
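To tie the pieces together, a hedged sketch of the two-phase flow that this refactor serves: a measurement pass first, then the FP8 serving run. The config file names follow the Optimum Habana example configs linked in the README and are assumptions here, as is the helper function:

```python
# Hedged sketch of the two-phase FP8 workflow; config paths are assumptions based on
# the Optimum Habana example configs, not values defined in this commit.
import os

def configure_fp8_phase(phase: str) -> None:
    """Set the environment read by the quantization env module before model load."""
    if phase == "measure":
        # Phase 1: collect calibration statistics while running representative traffic.
        os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_measure.json"
    elif phase == "quantize":
        # Phase 2: serve with FP8 using the statistics collected in phase 1.
        os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_quant.json"
    else:
        raise ValueError(f"unknown phase: {phase}")
    os.environ.setdefault("USE_INC", "1")  # INC is the default backend from release 2.0.4

configure_fp8_phase("measure")   # before the measurement run
# ... launch TGI and run calibration traffic ...
configure_fp8_phase("quantize")  # before the FP8 serving run
```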