diff --git a/Dockerfile_amd b/Dockerfile_amd
index 55da92046..421bf942e 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -213,5 +213,5 @@ FROM base-copy
 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh
 
-ENTRYPOINT ["/tgi-entrypoint.sh"]
-CMD ["--json-output"]
+# ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
diff --git a/Makefile b/Makefile
index a1399b6d7..24dd4d6f3 100644
--- a/Makefile
+++ b/Makefile
@@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize:
 
 clean:
 	rm -rf target aml
+
+interact:
+	docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 64g --net host -v /home/mohit/.cache/huggingface/hub/:/data -v $(PWD):/tgi tgi-mht
diff --git a/docs/source/basic_tutorials/fp_kv_cache.md b/docs/source/basic_tutorials/fp_kv_cache.md
index 70b0a292c..64e4539f0 100644
--- a/docs/source/basic_tutorials/fp_kv_cache.md
+++ b/docs/source/basic_tutorials/fp_kv_cache.md
@@ -58,4 +58,4 @@ Use [AutoFP8](https://github.com/neuralmagic/AutoFP8) with calibration data to g
 
 TGI provides a utility to extract the FP8 KV cache scales from an `AutoFP8` quantized model and save them to the FP16 model for use with TGI. For more information:
 
-Alternatively, you can use other quantizer tools, such as Nvidia AMMO, to obtain these scaling factors.
\ No newline at end of file
+Alternatively, you can use other quantizer tools, such as Nvidia AMMO, to obtain these scaling factors.
diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/basic_tutorials/launcher.md
index db088a395..2d2b70632 100644
--- a/docs/source/basic_tutorials/launcher.md
+++ b/docs/source/basic_tutorials/launcher.md
@@ -234,7 +234,7 @@ Options:
   --hostname
       The IP address to listen on
 
-      [env: HOSTNAME=hf-amd-mi250-dev]
+      [env: HOSTNAME=]
       [default: 0.0.0.0]
 
 ```
@@ -279,7 +279,7 @@ Options:
   --huggingface-hub-cache
       The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
 
-      [env: HUGGINGFACE_HUB_CACHE=/data]
+      [env: HUGGINGFACE_HUB_CACHE=]
 
 ```
 ## WEIGHTS_CACHE_OVERRIDE
diff --git a/examples/fp8_kvcache/README.md b/examples/fp8_kvcache/README.md
index d2d99ab7b..fd476a3a1 100644
--- a/examples/fp8_kvcache/README.md
+++ b/examples/fp8_kvcache/README.md
@@ -37,4 +37,4 @@ To extract KV cache scaling factors from a quantized FP8 model and save them to
 
 ```
 python extract_fp8_kv_scales.py --quantized-model neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV --model meta-llama/Meta-Llama-3-8B-Instruct --save-path Meta-Llama-3-8B-Instruct
-```
\ No newline at end of file
+```
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index a96bc22c1..9ff725696 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -91,10 +91,12 @@ def serve(
         raise RuntimeError(
             "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
         )
-    
+
     if kv_cache_dtype in {"fp8", "fp8_e5m2"}:
         if SYSTEM not in {"cuda", "rocm"}:
-            raise RuntimeError(f"`{kv_cache_dtype}` KV cache is only supported on Nvidia and AMD GPUs.")
+            raise RuntimeError(
+                f"`{kv_cache_dtype}` KV cache is only supported on Nvidia and AMD GPUs."
+            )
 
     if kv_cache_dtype == "fp8_e5m2" and SYSTEM != "cuda":
         raise RuntimeError(f"`fp8_e5m2` KV cache is only supported on Nvidia GPUs.")
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index 242b6fa10..e55a0e771 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -23,7 +23,9 @@ def reshape_and_cache(
     kv_cache_dtype: str = "auto",
     kv_scale: int = 1.0,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale)
+    cache_ops.reshape_and_cache(
+        key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale
+    )
 
 
 def paged_attention(
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
index 9cf3c4f88..e407e1a87 100644
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -28,7 +28,9 @@ def reshape_and_cache(
     kv_cache_dtype: str = "auto",
     kv_scale: int = 1.0,
 ):
-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale)
+    cache_ops.reshape_and_cache(
+        key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale
+    )
 
 
 def paged_attention(
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 17b5504cc..a2c240d98 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -114,6 +114,7 @@ except ImportError as e:
 if MAMBA_AVAILABLE:
     __all__.append(Mamba)
 
+
 class ModelType(enum.Enum):
     IDEFICS2 = {
         "type": "idefics2",
@@ -244,6 +245,7 @@ class ModelType(enum.Enum):
         "multimodal": True,
     }
 
+
 FP8_KVCACHE_SUPPORTED_MODELS = {
     "llama",
     "baichun",
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 75ab49067..180f59332 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -45,6 +45,7 @@ from text_generation_server.layers.layernorm import (
 )
 from loguru import logger
 
+
 if SYSTEM == "rocm":
     try:
         from vllm import _custom_C
@@ -138,7 +139,9 @@ class FlashLlamaAttention(torch.nn.Module):
         self.kv_cache_dtype = config.kv_cache_dtype
 
         if self.kv_cache_dtype == "fp8":
-            self.kv_scale = weights.get_kv_cache_scaling_factor(prefix, self.kv_cache_dtype)
+            self.kv_scale = weights.get_kv_cache_scaling_factor(
+                prefix, self.kv_cache_dtype
+            )
         else:
             self.kv_scale = 1.0
         logger.info(f"kv_cache_dtype: {self.kv_cache_dtype}, kv_scale: {self.kv_scale}")
@@ -168,7 +171,15 @@ class FlashLlamaAttention(torch.nn.Module):
 
         self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
 
-        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots, self.kv_cache_dtype, self.kv_scale)
+        reshape_and_cache(
+            kv[:, 0],
+            kv[:, 1],
+            kv_cache[0],
+            kv_cache[1],
+            slots,
+            self.kv_cache_dtype,
+            self.kv_scale,
+        )
 
         # output tensor
         attn_output = torch.empty_like(query)
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index aca965524..d3fc464c7 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -269,6 +269,13 @@ def serve(
     set_model_id(model_id)
     asyncio.run(
         serve_inner(
-            model_id, revision, sharded, quantize, speculate, dtype, kv_cache_dtype, trust_remote_code
+            model_id,
+            revision,
+            sharded,
+            quantize,
+            speculate,
+            dtype,
+            kv_cache_dtype,
+            trust_remote_code,
         )
     )
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index 8e8efe976..7db325388 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -89,7 +89,11 @@ class Weights:
         # Special case for gptq which shouldn't convert
         # u4 which are disguised as int32. Exl2 uses int16
         # as well.
-        if tensor.dtype not in [torch.int16, torch.int32,torch.int64] and not tensor_name.endswith("kv_scale"):
+        if tensor.dtype not in [
+            torch.int16,
+            torch.int32,
+            torch.int64,
+        ] and not tensor_name.endswith("kv_scale"):
             tensor = tensor.to(dtype=self.dtype)
         if to_device:
             tensor = tensor.to(device=self.device)