Add support for Llama 3 rotary embeddings

Daniël de Kok 2024-07-23 14:34:56 +00:00
parent 9935720c87
commit 2c3b078911
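
For context, the code path added here is selected by the `rope_scaling` section of a model's config. Below is a minimal sketch of the dict that `_get_rope_config(config)` would return for such a model; it is not part of the commit, and the concrete values are those published for Llama 3.1, shown purely for illustration.

# Sketch (not part of this commit): the rope_scaling dict the new "llama3"
# branch expects. Values are those published for Llama 3.1, for illustration.
rope_scaling = {
    "rope_type": "llama3",  # newer configs; older checkpoints may still use "type"
    "factor": 8.0,  # consumed as `scaling_factor`
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,
    "original_max_position_embeddings": 8192,
}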


@@ -1,4 +1,5 @@
 import os
+import math
 import torch
 from torch import nn
 from loguru import logger
@@ -85,9 +86,13 @@ class PositionRotaryEmbedding(nn.Module):
         scaling_factor = None
         rope_scaling = _get_rope_config(config)
         if rope_scaling is not None:
-            if rope_scaling["type"] == "linear":
+            # `rope_type` is now standard in transformers, but some existing models
+            # have `type` instead.
+            rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+
+            if rope_type == "linear":
                 pass
-            elif rope_scaling["type"] == "dynamic":
+            elif rope_type == "dynamic":
                 scaling_factor = rope_scaling["factor"]
                 return DynamicPositionRotaryEmbedding(
                     dim=dim,
@@ -96,7 +101,20 @@ class PositionRotaryEmbedding(nn.Module):
                     device=inv_freq.device,
                     scaling_factor=scaling_factor,
                 )
-            elif rope_scaling["type"] == "yarn":
+            elif rope_type == "llama3":
+                inv_freq = apply_llama3_scaling(
+                    inv_freq,
+                    scaling_factor=rope_scaling["factor"],
+                    low_freq_factor=rope_scaling["low_freq_factor"],
+                    high_freq_factor=rope_scaling["high_freq_factor"],
+                    original_max_position_embeddings=rope_scaling[
+                        "original_max_position_embeddings"
+                    ],
+                )
+
+                return cls(inv_freq, scaling_factor)
+
+            elif rope_type == "yarn":
                 scaling_factor = rope_scaling["factor"]
                 mscale = rope_scaling.get("mscale", 1.0)
                 mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
@@ -115,7 +133,7 @@ class PositionRotaryEmbedding(nn.Module):
                     mscale=mscale,
                     mscale_all_dim=mscale_all_dim,
                 )
-            elif rope_scaling["type"] in ["su", "longrope"]:
+            elif rope_type in ["su", "longrope"]:
                 short_factor = torch.tensor(
                     rope_scaling["short_factor"], dtype=torch.float32, device=device
                 )
@@ -327,10 +345,6 @@ class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
             self._sin_cached = torch.sin(freqs).to(dtype)
 
 
-# Inverse dim formula to find dim based on number of rotations
-import math
-
-
 def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
     return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
         2 * math.log(base)
@@ -434,3 +448,33 @@ class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
             freqs = torch.outer(t, self.inv_freq.to(device=t.device))
             self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
             self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
+
+
+def apply_llama3_scaling(
+    freqs: torch.Tensor,
+    *,
+    scaling_factor: int,
+    low_freq_factor: int,
+    high_freq_factor: int,
+    original_max_position_embeddings: int,
+):
+    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
+    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
+    new_freqs = []
+
+    for freq in freqs:
+        wavelen = 2 * math.pi / freq
+
+        if wavelen < high_freq_wavelen:
+            new_freqs.append(freq)
+        elif wavelen > low_freq_wavelen:
+            new_freqs.append(freq / scaling_factor)
+        else:
+            assert low_freq_wavelen != high_freq_wavelen
+
+            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
+                high_freq_factor - low_freq_factor
+            )
+
+            new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)
+
+    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
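
To make the effect of `apply_llama3_scaling` concrete, the standalone sketch below recomputes the two wavelength thresholds and counts how many rotary dimensions fall into each band: frequencies whose wavelength is shorter than `high_freq_wavelen` are kept as-is, those longer than `low_freq_wavelen` are divided by the scaling factor, and the band in between is smoothly interpolated. The head size of 128, RoPE base of 500000, and the scaling parameters are assumed Llama 3.1 values used only for this example; the sketch is not part of the commit.

# Standalone sketch (not part of the commit) of the three frequency bands that
# apply_llama3_scaling distinguishes. Head size, base, and scaling parameters
# are assumed Llama 3.1 values, used only for illustration.
import math

import torch

scaling_factor = 8.0
low_freq_factor = 1.0
high_freq_factor = 4.0
original_max_position_embeddings = 8192

low_freq_wavelen = original_max_position_embeddings / low_freq_factor  # 8192.0
high_freq_wavelen = original_max_position_embeddings / high_freq_factor  # 2048.0

# Standard RoPE inverse frequencies for a head size of 128.
dim, base = 128, 500000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
wavelen = 2 * math.pi / inv_freq

kept = (wavelen < high_freq_wavelen).sum().item()  # high frequencies: unchanged
scaled = (wavelen > low_freq_wavelen).sum().item()  # low frequencies: divided by the factor
blended = inv_freq.numel() - kept - scaled  # middle band: smoothly interpolated
print(f"kept={kept}, scaled={scaled}, blended={blended} of {inv_freq.numel()} dims")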