Mirror of https://github.com/huggingface/text-generation-inference.git
Enable quantization with INC (#203)
Commit 0c3239e710 (parent ea48ae169a)
@@ -109,7 +109,7 @@ For more information and documentation about Text Generation Inference, checkout

 ## Running TGI with FP8 precision

-TGI supports FP8 precision runs within the limits provided by [Habana Quantization Toolkit](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models can be run with FP8 precision by properly setting the QUANT_CONFIG environment variable. Detailed instructions on how to use that variable can be found in the [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). Summarising those instructions for TGI:
+TGI supports FP8 precision runs within the limits provided by [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). Models can be run with FP8 precision by properly setting the QUANT_CONFIG environment variable. Detailed instructions on how to use that variable can be found in the [Optimum Habana FP8 guide](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8). From the 2.0.4 release onwards, Intel Neural Compressor (INC) is used by default for measurement and quantization. Habana Quantization Toolkit (HQT) will be removed in future releases; to keep using HQT, disable INC by setting `-e USE_INC=0`. Summarising those instructions for TGI:

 1. Measure the quantization statistics of the requested model by using the [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8:~:text=use_deepspeed%20%2D%2Dworld_size%208-,run_lm_eval.py,-%5C%0A%2Do%20acc_70b_bs1_measure.txt).
 2. Run the requested model in TGI with the proper QUANT_CONFIG setting, e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json` (see the launch sketch below).
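For illustration only, a minimal launch sketch of step 2 above. The image tag, model id, and runtime flags are assumptions rather than values from this commit; it only shows how the QUANT_CONFIG and USE_INC variables described above are passed to the container:

```python
# Hypothetical container launch for the FP8 serving run (step 2 above).
# Image tag, model id, and flags are assumptions, not taken from this commit.
import subprocess

image = "ghcr.io/huggingface/tgi-gaudi:2.0.4"   # assumed image name and tag
model_id = "meta-llama/Llama-2-7b-hf"           # placeholder model

subprocess.run(
    [
        "docker", "run", "-p", "8080:80", "--runtime=habana",
        "-e", "HABANA_VISIBLE_DEVICES=all",
        # Point TGI at the quantization config produced by the measurement run (step 1).
        "-e", "QUANT_CONFIG=./quantization_config/maxabs_quant.json",
        # Uncomment to fall back to the legacy Habana Quantization Toolkit instead of INC:
        # "-e", "USE_INC=0",
        image,
        "--model-id", model_id,
    ],
    check=True,
)
```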
@@ -5,7 +5,8 @@ import sys

 assert "habana_frameworks" not in sys.modules

-is_quantization_enabled = os.getenv("QUANT_CONFIG", "") != ""
+quant_config = os.getenv("QUANT_CONFIG", "")
+is_quantization_enabled = quant_config != ""

 if is_quantization_enabled:
     os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")
@@ -15,3 +16,15 @@ if is_quantization_enabled:
     os.environ.setdefault(
         "UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av")
     os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+
+def prepare_model_for_quantization(model):
+    if is_quantization_enabled:
+        if os.getenv("USE_INC", "1") != "0":
+            from neural_compressor.torch.quantization import FP8Config, convert
+            config = FP8Config.from_json_file(quant_config)
+            model = convert(model, config)
+        else:
+            import habana_quantization_toolkit
+            habana_quantization_toolkit.prep_model(model)
+    return model
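A minimal usage sketch, not part of the commit, showing how the new helper is expected to be driven by the environment. The import path and model id are assumptions; `FP8Config.from_json_file` and `convert` are exactly the INC calls used in the hunk above:

```python
# Hypothetical caller sketch. QUANT_CONFIG must be set before the module is imported,
# because quant_config / is_quantization_enabled are read at import time.
import os

os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_quant.json"  # enables quantization
os.environ.setdefault("USE_INC", "1")  # "1" (default) -> INC path, "0" -> legacy HQT path

import text_generation_server.habana_quantization_env as hq_env  # assumed import path
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder model id
model = hq_env.prepare_model_for_quantization(model)  # FP8 conversion via INC, or HQT prep
```

When QUANT_CONFIG is unset, `is_quantization_enabled` stays False and the helper returns the model unchanged.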
@@ -807,14 +807,7 @@ class CausalLM(Model):
         if hq_env.is_quantization_enabled:
             if model.config.model_type == "llama":
                 self.patch_scoped_linear_all_reduce(model)
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.prep_model(model)
-        return model
-
-    def finish_quantization_measurements(self, model):
-        if hq_env.is_quantization_enabled:
-            import habana_quantization_toolkit
-            habana_quantization_toolkit.finish_measurements(self.model)
+            model = hq_env.prepare_model_for_quantization(model)
         return model

     def patch_scoped_linear_all_reduce(self, model):
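To tie the pieces together, a hedged sketch of the two-phase flow that this refactor serves: a measurement pass first, then the FP8 serving run. The config file names follow the Optimum Habana example configs linked in the README and are assumptions here, as is the helper function:

```python
# Hedged sketch of the two-phase FP8 workflow; config paths are assumptions based on
# the Optimum Habana example configs, not values defined in this commit.
import os

def configure_fp8_phase(phase: str) -> None:
    """Set the environment read by the quantization env module before model load."""
    if phase == "measure":
        # Phase 1: collect calibration statistics while running representative traffic.
        os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_measure.json"
    elif phase == "quantize":
        # Phase 2: serve with FP8 using the statistics collected in phase 1.
        os.environ["QUANT_CONFIG"] = "./quantization_config/maxabs_quant.json"
    else:
        raise ValueError(f"unknown phase: {phase}")
    os.environ.setdefault("USE_INC", "1")  # INC is the default backend from release 2.0.4

configure_fp8_phase("measure")   # before the measurement run
# ... launch TGI and run calibration traffic ...
configure_fp8_phase("quantize")  # before the FP8 serving run
```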