fix style

Mohit Sharma 2024-06-24 14:30:26 +00:00
parent 8a0bb53ef3
commit 557e18e08c
12 changed files with 47 additions and 14 deletions

View File

@@ -213,5 +213,5 @@ FROM base-copy
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]
# ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]

View File

@@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize:
clean:
rm -rf target aml
interact:
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 64g --net host -v /home/mohit/.cache/huggingface/hub/:/data -v $(PWD):/tgi tgi-mht

View File

@@ -234,7 +234,7 @@ Options:
--hostname <HOSTNAME>
The IP address to listen on
[env: HOSTNAME=hf-amd-mi250-dev]
[env: HOSTNAME=]
[default: 0.0.0.0]
```
@@ -279,7 +279,7 @@ Options:
--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
[env: HUGGINGFACE_HUB_CACHE=/data]
[env: HUGGINGFACE_HUB_CACHE=]
```
## WEIGHTS_CACHE_OVERRIDE

View File

@@ -94,7 +94,9 @@ def serve(
if kv_cache_dtype in {"fp8", "fp8_e5m2"}:
if SYSTEM not in {"cuda", "rocm"}:
raise RuntimeError(f"`{kv_cache_dtype}` KV cache is only supported on Nvidia and AMD GPUs.")
raise RuntimeError(
f"`{kv_cache_dtype}` KV cache is only supported on Nvidia and AMD GPUs."
)
if kv_cache_dtype == "fp8_e5m2" and SYSTEM != "cuda":
raise RuntimeError(f"`fp8_e5m2` KV cache is only supported on Nvidia GPUs.")
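The two checks in this hunk amount to a small compatibility matrix. A hedged summary, derived only from the RuntimeError branches shown above (the `auto` default comes from the `reshape_and_cache` signatures later in this commit; this mapping does not appear in the source):

```
# Assumed summary of the validation above, not a table from the repository.
SUPPORTED_KV_CACHE_DTYPES = {
    "cuda": {"auto", "fp8", "fp8_e5m2"},
    "rocm": {"auto", "fp8"},
}
```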

View File

@@ -23,7 +23,9 @@ def reshape_and_cache(
kv_cache_dtype: str = "auto",
kv_scale: int = 1.0,
):
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale)
cache_ops.reshape_and_cache(
key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale
)
def paged_attention(
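For readers unfamiliar with the paged KV cache, here is a hedged pure-PyTorch sketch of what a call like `cache_ops.reshape_and_cache` does conceptually. The real kernel is a fused CUDA/ROCm op, uses a blocked cache layout rather than the flat one assumed here, and quantizes the written values when `kv_cache_dtype` is `fp8`; the scale convention below is an assumption.

```
import torch

def reshape_and_cache_reference(key, value, key_cache, value_cache, slots, kv_scale=1.0):
    # key/value:             [num_tokens, num_kv_heads, head_size]
    # key_cache/value_cache: simplified flat layout [num_slots, num_kv_heads, head_size]
    # slots:                 [num_tokens] flat slot index assigned to each token
    # Scatter every token's key and value into its assigned slot; dividing by
    # kv_scale mirrors how a quantizing cache write would apply the scaling
    # factor (an assumption; the fused kernel's exact convention is not shown here).
    key_cache[slots] = key / kv_scale
    value_cache[slots] = value / kv_scale
```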

View File

@@ -28,7 +28,9 @@ def reshape_and_cache(
kv_cache_dtype: str = "auto",
kv_scale: int = 1.0,
):
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale)
cache_ops.reshape_and_cache(
key, value, key_cache, value_cache, slots, kv_cache_dtype, kv_scale
)
def paged_attention(

View File

@@ -114,6 +114,7 @@ except ImportError as e:
if MAMBA_AVAILABLE:
__all__.append(Mamba)
class ModelType(enum.Enum):
IDEFICS2 = {
"type": "idefics2",
@@ -244,6 +245,7 @@ class ModelType(enum.Enum):
"multimodal": True,
}
FP8_KVCACHE_SUPPORTED_MODELS = {
"llama",
"baichun",

View File

@@ -45,6 +45,7 @@ from text_generation_server.layers.layernorm import (
)
from loguru import logger
if SYSTEM == "rocm":
try:
from vllm import _custom_C
@@ -138,7 +139,9 @@ class FlashLlamaAttention(torch.nn.Module):
self.kv_cache_dtype = config.kv_cache_dtype
if self.kv_cache_dtype == "fp8":
self.kv_scale = weights.get_kv_cache_scaling_factor(prefix, self.kv_cache_dtype)
self.kv_scale = weights.get_kv_cache_scaling_factor(
prefix, self.kv_cache_dtype
)
else:
self.kv_scale = 1.0
logger.info(f"kv_cache_dtype: {self.kv_cache_dtype}, kv_scale: {self.kv_scale}")
@@ -168,7 +171,15 @@ class FlashLlamaAttention(torch.nn.Module):
self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots, self.kv_cache_dtype, self.kv_scale)
reshape_and_cache(
kv[:, 0],
kv[:, 1],
kv_cache[0],
kv_cache[1],
slots,
self.kv_cache_dtype,
self.kv_scale,
)
# output tensor
attn_output = torch.empty_like(query)
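The attention module calls `weights.get_kv_cache_scaling_factor`, whose body is not part of this commit's hunks. A hedged guess at what such a helper might do, assuming the checkpoint stores a per-layer `kv_scale` tensor; the `{prefix}.kv_scale` name and the fallback to 1.0 are assumptions, not the repository's implementation:

```
def get_kv_cache_scaling_factor(weights, prefix: str, kv_cache_dtype: str) -> float:
    # Hypothetical lookup: read a per-layer scaling factor stored next to the
    # attention weights, falling back to 1.0 when the checkpoint omits it.
    try:
        return weights.get_tensor(f"{prefix}.kv_scale").item()
    except Exception:
        return 1.0
```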

View File

@@ -269,6 +269,13 @@ def serve(
set_model_id(model_id)
asyncio.run(
serve_inner(
model_id, revision, sharded, quantize, speculate, dtype, kv_cache_dtype, trust_remote_code
model_id,
revision,
sharded,
quantize,
speculate,
dtype,
kv_cache_dtype,
trust_remote_code,
)
)

View File

@@ -89,7 +89,11 @@ class Weights:
# Special case for gptq which shouldn't convert
# u4 which are disguised as int32. Exl2 uses int16
# as well.
if tensor.dtype not in [torch.int16, torch.int32,torch.int64] and not tensor_name.endswith("kv_scale"):
if tensor.dtype not in [
torch.int16,
torch.int32,
torch.int64,
] and not tensor_name.endswith("kv_scale"):
tensor = tensor.to(dtype=self.dtype)
if to_device:
tensor = tensor.to(device=self.device)
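The reformatted condition keeps two kinds of tensors out of the automatic dtype cast: integer-packed quantized weights and anything named `*kv_scale`. A hedged restatement of that intent as a standalone helper, with illustrative names that are not from the repository:

```
import torch

def maybe_cast(tensor: torch.Tensor, tensor_name: str, dtype: torch.dtype) -> torch.Tensor:
    # Keep packed quantized weights (int16/int32/int64) and fp8 KV-cache scaling
    # factors in their stored dtype; cast everything else to the compute dtype.
    keep_original_dtype = (
        tensor.dtype in (torch.int16, torch.int32, torch.int64)
        or tensor_name.endswith("kv_scale")
    )
    return tensor if keep_original_dtype else tensor.to(dtype=dtype)
```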