mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 04:14:52 +00:00)
remove example
This commit is contained in:
parent
3ae62304ab
commit
f4714a8f98
@@ -1,40 +0,0 @@
# FP8 (fp8_e4m3) KV Cache Scaling Factor Extraction Utility

This utility extracts KV cache scaling factors from a quantized `FP8 (fp8_e4m3)` Hugging Face (HF) model and saves them to the corresponding unquantized HF model, which can then be used with Text Generation Inference (TGI).

Note: This tool specifically works with models quantized using the [AutoFP8](https://github.com/neuralmagic/AutoFP8/tree/main) repository.

The KV scales are integrated into the unquantized HF model in the following format: the FP8 KV cache scaling factors are added to the FP16 checkpoint and exposed through the `.kv_scale` parameter within the Attention module, as shown below:

```
model.layers.0.self_attn.kv_scale < F32
model.layers.1.self_attn.kv_scale < F32
...
```
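
For reference, attaching a scale in this way only requires registering a scalar `torch.nn.Parameter` on the attention module; `save_pretrained` then serializes it as an F32 tensor under the key shown above. A minimal sketch of that mechanism (the Llama-style module path and the scale value `0.0135` are illustrative assumptions, not values produced by this utility):

```python
import torch
from transformers import AutoModelForCausalLM

# Any Llama-style checkpoint with model.layers.N.self_attn modules works for this
# sketch; Meta-Llama-3-8B-Instruct is large and gated, so substitute a smaller
# model if needed.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Assigning an nn.Parameter registers it on the module, so it appears in
# state_dict() and is written out by save_pretrained().
attn = model.model.layers[0].self_attn
attn.kv_scale = torch.nn.Parameter(torch.tensor(0.0135), requires_grad=False)

print(model.state_dict()["model.layers.0.self_attn.kv_scale"])  # tensor(0.0135)
```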

## Prerequisites

- text-generation-server
- AutoFP8

## CLI options

```
usage: extract_fp8_kv_scales.py [-h] [--quantized-model QUANTIZED_MODEL] [--model MODEL] [--save-path SAVE_PATH]

Extract FP8 KV cache scales and add them to a FP16 model.

options:
  -h, --help            show this help message and exit
  --quantized-model QUANTIZED_MODEL
                        Path to the FP8 model checkpoint to extract KV cache scales
  --model MODEL         Model ID of the FP16 model to save the KV cache scales
  --save-path SAVE_PATH
                        Path to save the FP16 model with the kv scales
```

## Example usage

To extract KV cache scaling factors from a quantized FP8 model and save them to an unquantized FP16 model, use the following command:

```
python extract_fp8_kv_scales.py --quantized-model neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV --model meta-llama/Meta-Llama-3-8B-Instruct --save-path Meta-Llama-3-8B-Instruct
```
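
One way to confirm that the scales were written is to list the `.kv_scale` entries in the saved safetensors shards. The sketch below uses the same `safetensors` API the script itself relies on (`Meta-Llama-3-8B-Instruct` is the `--save-path` directory from the command above):

```python
from pathlib import Path
from safetensors import safe_open

# The --save-path directory from the example command above.
save_path = Path("Meta-Llama-3-8B-Instruct")

# save_pretrained writes one or more *.safetensors shards; scan each one for
# the kv_scale entries added by the script.
for shard in sorted(save_path.glob("*.safetensors")):
    with safe_open(str(shard), framework="pt") as f:
        for name in f.keys():
            if name.endswith(".kv_scale"):
                tensor = f.get_tensor(name)
                print(name, tensor.dtype, tensor.item())
```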

@@ -1,97 +0,0 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pathlib import Path
from text_generation_server.utils.hub import (
    weight_files,
    download_weights,
    weight_hub_files,
)
from safetensors import safe_open
import argparse


def load_model(ckpt_path):
    # Load the unquantized (FP16) model that will receive the KV cache scales.
    model_args = {"torch_dtype": "auto"}

    model = AutoModelForCausalLM.from_pretrained(
        ckpt_path, device_map="auto", **model_args, trust_remote_code=True
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)

    return model, tokenizer


def set_nested_attribute(obj, attribute_path, value):
    # Walk a dotted path such as "model.layers.0.self_attn.kv_scale" and set
    # the final attribute on the resolved submodule.
    keys = attribute_path.split(".")
    current_obj = obj
    for key in keys[:-1]:
        current_obj = getattr(current_obj, key)
    setattr(current_obj, keys[-1], value)


def apply_kv_scales_to_model(model, layer_scales_map):
    # Register each extracted scale as a frozen scalar parameter so that
    # save_pretrained serializes it alongside the FP16 weights.
    for layer_name, scale_value in layer_scales_map.items():
        scale_param = torch.nn.Parameter(torch.tensor(scale_value), requires_grad=False)
        set_nested_attribute(model, layer_name, scale_param)


def extract_kv_scales(quantized_model):
    def fetch_parameters(filename):
        with safe_open(filename, framework="pt") as f:
            for name in f.keys():
                param_tensor = f.get_tensor(name)
                yield name, param_tensor

    # Download the quantized checkpoint from the Hub if it is not a local
    # directory, then resolve the local .safetensors paths.
    checkpoint_dir = Path(quantized_model)
    if not checkpoint_dir.is_dir():
        hub_filenames = weight_hub_files(quantized_model)
        downloaded_files = download_weights(hub_filenames, quantized_model)
    downloaded_files = weight_files(quantized_model, extension=".safetensors")

    # Collect every ".kv_scale" tensor as a plain Python float keyed by its
    # fully qualified parameter name.
    layer_scales_map = {}
    for tensor_file in downloaded_files:
        for name, param in fetch_parameters(tensor_file):
            if ".kv_scale" in name:
                layer_scales_map[name] = param.item()

    return layer_scales_map


def main(quantized_model, model_id, save_path):
    layer_scales_map = extract_kv_scales(quantized_model)

    model, tokenizer = load_model(model_id)

    apply_kv_scales_to_model(model, layer_scales_map)

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved to {save_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract FP8 KV cache scales and add them to a FP16 model."
    )
    parser.add_argument(
        "--quantized-model",
        type=str,
        help="Path to the FP8 model checkpoint to extract KV cache scales",
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Model ID of the FP16 model to save the KV cache scales",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        help="Path to save the FP16 model with the kv scales",
    )

    args = parser.parse_args()

    main(args.quantized_model, args.model, args.save_path)