Phi3 support.

2025-09-11 12:24:53 +00:00 · 2024-04-23 08:50:18 +00:00 · 2024-04-23 08:50:18 +00:00 · 7d31cb6e75
commit 7d31cb6e75
parent ed72e92126
2 changed files with 23 additions and 8 deletions
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -327,7 +327,7 @@ def get_model(
                trust_remote_code=trust_remote_code,
            )

-    elif model_type == "llama" or model_type == "baichuan":
+    elif model_type == "llama" or model_type == "baichuan" or model_type == "phi3":
        if FLASH_ATTENTION:
            return FlashLlama(
                model_id,
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -101,6 +101,13 @@ def load_attention(config, prefix, weights):
                weights=weights,
                bias=False,
            )
+        elif config.model_type == "phi3":
+            return TensorParallelColumnLinear.load(
+                config,
+                prefix=f"{prefix}.qkv_proj",
+                weights=weights,
+                bias=False,
+            )
        else:
            return TensorParallelColumnLinear.load_multi(
                config,
@@ -257,13 +264,21 @@ class LlamaMLP(nn.Module):
            )
        )
        # Fuse gate and up proj
-        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
-            weights=weights,
-            dim=0,
-            bias=False,
-        )
+        if config.model_type == "phi3":
+            self.gate_up_proj = TensorParallelColumnLinear.load(
+                config,
+                prefix=f"{prefix}.gate_up_proj",
+                weights=weights,
+                bias=False,
+            )
+        else:
+            self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+                config,
+                prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+                weights=weights,
+                dim=0,
+                bias=False,
+            )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",