Fixing sharding.

2025-09-12 04:44:52 +00:00 · 2024-07-02 15:37:27 +00:00 · 2024-07-02 15:37:27 +00:00 · b2fb845923
commit b2fb845923
parent 298500a08e
1 changed file with 3 additions and 1 deletion
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -881,7 +881,9 @@ class FlashCausalLM(Model):
        model = model_class(prefix, config, weights)
        torch.distributed.barrier(group=self.process_group)
        self.num_layers = config.num_hidden_layers
-        self.num_kv_heads = config.num_key_value_heads
+
+        # Validation is done in the model itself
+        self.num_kv_heads = config.num_key_value_heads // self.process_group.size()
        self.head_size = config.hidden_size // config.num_attention_heads

        self.cuda_graphs = {}