From 0c3239e710a675e34b6e4ebdc8104416e1efc8a5 Mon Sep 17 00:00:00 2001
From: Thanaji Rao Thakkalapelli
Date: Mon, 26 Aug 2024 01:55:37 -0700
Subject: [PATCH] Enable quantization with INC (#203)

---
 README.md                                         |  2 +-
 .../habana_quantization_env.py                    | 15 ++++++++++++++-
 server/text_generation_server/models/causal_lm.py | 13 +++----------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 01ebe72b..74ee5f39 100644
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ For more information and documentation about Text Generation Inference, checkout
 
 ## Running TGI with FP8 precision
 
-TGI supports FP8 precision runs within the limits provided by [Habana Quantization Toolkit](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models with FP8 can be ran by properly setting QUANT_CONFIG environment variable. Detailed instruction on how to use that variable can be found in [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). Summarising that instruction in TGI cases:
+TGI supports FP8 precision runs within the limits provided by [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models with FP8 can be run by properly setting the QUANT_CONFIG environment variable. Detailed instructions on how to use that variable can be found in the [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). Starting with the 2.0.4 release, Intel Neural Compressor (INC) is used by default for measurement and quantization. The Habana Quantization Toolkit (HQT) will be removed in future releases; to keep using HQT, disable INC by setting `-e USE_INC=0`. Summarising that guide for TGI:
 
 1. Measure quantization statistics of requested model by using [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8:~:text=use_deepspeed%20%2D%2Dworld_size%208-,run_lm_eval.py,-%5C%0A%2Do%20acc_70b_bs1_measure.txt)
 2. Run requested model in TGI with proper QUANT_CONFIG setting - e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.
diff --git a/server/text_generation_server/habana_quantization_env.py b/server/text_generation_server/habana_quantization_env.py
index d92b6803..a854e45a 100644
--- a/server/text_generation_server/habana_quantization_env.py
+++ b/server/text_generation_server/habana_quantization_env.py
@@ -5,7 +5,8 @@ import sys
 
 assert "habana_frameworks" not in sys.modules
 
-is_quantization_enabled = os.getenv("QUANT_CONFIG", "") != ""
+quant_config = os.getenv("QUANT_CONFIG", "")
+is_quantization_enabled = quant_config != ""
 
 if is_quantization_enabled:
     os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")
@@ -15,3 +16,15 @@ if is_quantization_enabled:
     os.environ.setdefault(
         "UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av")
     os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+
+def prepare_model_for_quantization(model):
+    if is_quantization_enabled:
+        if os.getenv("USE_INC", "1") != "0":
+            from neural_compressor.torch.quantization import FP8Config, convert
+            config = FP8Config.from_json_file(quant_config)
+            model = convert(model, config)
+        else:
+            import habana_quantization_toolkit
+            habana_quantization_toolkit.prep_model(model)
+    return model
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index c6192be0..831b0b5e 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -658,7 +658,7 @@ class CausalLM(Model):
             from habana_frameworks.torch.hpu import wrap_in_hpu_graph
             model = wrap_in_hpu_graph(model, disable_tensor_cache=True)
         else:
-            if LAZY_MODE == 0: 
+            if LAZY_MODE == 0:
                 # It is said that "keep_input_mutations" is safe for inference to be done
                 dbg_trace(
                     "TORCH COMPILE", f'Torch compiling of model')
@@ -807,14 +807,7 @@ class CausalLM(Model):
         if hq_env.is_quantization_enabled:
             if model.config.model_type == "llama":
                 self.patch_scoped_linear_all_reduce(model)
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.prep_model(model)
-        return model
-
-    def finish_quantization_measurements(self, model):
-        if hq_env.is_quantization_enabled:
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.finish_measurements(self.model)
+            model = hq_env.prepare_model_for_quantization(model)
         return model
 
     def patch_scoped_linear_all_reduce(self, model):
@@ -995,7 +988,7 @@ class CausalLM(Model):
                     bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None,
                 )
             elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]):
-                # Don't schedule next forward if max_new_tokens for all requests equals 1 
+                # Don't schedule next forward if max_new_tokens for all requests equals 1
                 # - we've already generated the first and only needed token in the prefill phase
                 pass
             else:
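
Note for reviewers: the snippet below is a minimal sketch (not part of the patch) of what the new prepare_model_for_quantization() helper does on its INC path, i.e. when QUANT_CONFIG is set and USE_INC is left at its default of "1". It reuses the same FP8Config.from_json_file() and convert() calls as the patch; the model name and config path are only illustrative, and a Gaudi environment with neural-compressor installed is assumed.

import os

from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import FP8Config, convert

# Illustrative value; inside TGI this comes from the container environment,
# e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.
os.environ.setdefault("QUANT_CONFIG", "./quantization_config/maxabs_quant.json")

# Load the model as usual, then convert it with INC according to the JSON
# config pointed to by QUANT_CONFIG (the same calls the helper makes).
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
config = FP8Config.from_json_file(os.environ["QUANT_CONFIG"])
model = convert(model, config)

# Setting USE_INC=0 instead makes the helper fall back to the legacy
# habana_quantization_toolkit.prep_model(model) call.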