From 5829b7821ea0fe69d09069da911d65620ab0d396 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 22 Jul 2024 13:49:24 +0000
Subject: [PATCH] Less clutter.

---
 server/text_generation_server/models/__init__.py        | 2 ++
 server/text_generation_server/models/flash_causal_lm.py | 6 +-----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index a43cdfed..4bcb657d 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -757,6 +757,8 @@ def get_model(
                 default_dtype=torch.bfloat16,
                 trust_remote_code=trust_remote_code,
                 lora_adapter_ids=lora_adapter_ids,
+                # hidden_size / num_attention_heads is wrong in `google/gemma-2-9b-it`
+                head_size=config.head_dim,
             )
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 7a01fa7e..cfffafa1 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -925,11 +925,7 @@ class FlashCausalLM(Model):
         assert self.num_kv_heads > 0
 
         if head_size is None:
-            if getattr(config, "head_dim", None):
-                # hidden_size / num_attention_heads is wrong in `google/gemma-2-9b-it`
-                self.head_size = config.head_dim
-            else:
-                self.head_size = config.hidden_size // config.num_attention_heads
+            self.head_size = config.hidden_size // config.num_attention_heads
         else:
             self.head_size = head_size
 
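
Note on the rationale (not part of the patch): the Gemma 2 special case is moved out of the generic `FlashCausalLM` constructor and into `get_model`, which now passes `head_size=config.head_dim` explicitly, because for Gemma 2 the head dimension is not `hidden_size // num_attention_heads` and the generic fallback would compute the wrong value. A minimal sketch of the mismatch, using config values quoted from memory for `google/gemma-2-9b-it` (verify against the model's config.json before relying on them):

# Sketch only: illustrates why the generic fallback is wrong for Gemma 2.
# Values below are assumed from `google/gemma-2-9b-it`'s config, not read
# from the repository in this patch.
hidden_size = 3584
num_attention_heads = 16
head_dim = 256  # declared explicitly in the config

fallback_head_size = hidden_size // num_attention_heads
print(fallback_head_size)         # 224
print(fallback_head_size == head_dim)  # False -> must use config.head_dim

With the explicit `head_size` argument, `FlashCausalLM` no longer needs to probe for a `head_dim` attribute, and models whose config lacks one keep using the `hidden_size // num_attention_heads` default.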