Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-24 08:22:07 +00:00
feat: adjust attn weight loading logic (#1975)
This PR updates `load_attention` to prefer loading model-specific attention weights based on the model type. Additionally, `TensorParallelColumnLinear.load_multi` was previously called in two places; this change reduces that to a single path.
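For reference, the resulting control flow of `load_attention` is sketched below. This is only a summary of the new shape of the function as it appears in the diff further down, not additional behavior; `TensorParallelColumnLinear` is the existing layer helper used by the modified module.

```python
def load_attention(config, prefix, weights):
    bias = config.attention_bias

    # Model types that ship a fused QKV tensor use the dedicated load_qkv path.
    if config.model_type == "phi3":
        return TensorParallelColumnLinear.load_qkv(
            config, prefix=f"{prefix}.qkv_proj", weights=weights, bias=bias
        )
    elif config.model_type == "baichuan":
        return TensorParallelColumnLinear.load_qkv(
            config, prefix=f"{prefix}.W_pack", weights=weights, bias=bias
        )

    # Everything else falls through to the single load_multi call, which
    # concatenates the separate q/k/v projections along dim 0.
    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
        weights=weights,
        bias=bias,
    )
```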
This commit is contained in:
parent 612bc483b6
commit cbced7f0f9
@@ -49,37 +49,31 @@ if SYSTEM == "rocm":
 
 def load_attention(config, prefix, weights):
     bias = config.attention_bias
-    if config.num_attention_heads != config.num_key_value_heads:
-        return TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-            dim=0,
-            weights=weights,
-            bias=bias,
-        )
-    else:
-        if config.model_type == "baichuan":
-            return TensorParallelColumnLinear.load_qkv(
-                config,
-                prefix=f"{prefix}.W_pack",
-                weights=weights,
-                bias=bias,
-            )
-        elif config.model_type == "phi3":
-            return TensorParallelColumnLinear.load_qkv(
-                config,
-                prefix=f"{prefix}.qkv_proj",
-                weights=weights,
-                bias=bias,
-            )
-        else:
-            return TensorParallelColumnLinear.load_multi(
-                config,
-                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-                dim=0,
-                weights=weights,
-                bias=bias,
-            )
+
+    # if specific model type, load the correct attention
+    if config.model_type == "phi3":
+        return TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=f"{prefix}.qkv_proj",
+            weights=weights,
+            bias=bias,
+        )
+    elif config.model_type == "baichuan":
+        return TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=f"{prefix}.W_pack",
+            weights=weights,
+            bias=bias,
+        )
+
+    # otherwise, load the default attention based on the number of heads
+    return TensorParallelColumnLinear.load_multi(
+        config,
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        dim=0,
+        weights=weights,
+        bias=bias,
+    )
 
 
 class FlashLlamaAttention(torch.nn.Module):