Mirror of https://github.com/huggingface/text-generation-inference.git
Hotfixing qwen2 and starcoder2 (which also get clamping).
parent 963b6c6f0f
commit 57541d5e88
@@ -368,7 +368,7 @@ class Qwen2ForCausalLM(torch.nn.Module):
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,
@@ -534,7 +534,7 @@ class FlashStarcoder2ForCausalLM(torch.nn.Module):
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,
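
Note: both hunks make the same one-line change, swapping the functional form torch.clamp(input_lengths, max=self.max_past_tensor) for the equivalent Tensor method input_lengths.clamp(max=self.max_past_tensor); behavior is unchanged. Below is a minimal standalone sketch of the clamping the comment describes, with an assumed sliding-window size of 4096 and made-up lengths (neither value is taken from this commit):

import torch

# Hypothetical sliding-window size; in the models above this comes from
# the config and is held as self.max_past / self.max_past_tensor.
max_past = 4096
max_past_tensor = torch.tensor(max_past, dtype=torch.int32)

# Made-up per-sequence lengths in a decode batch.
input_lengths = torch.tensor([100, 5000, 4096], dtype=torch.int32)

# The two spellings in the diff compute the same result: cap each length
# at the window size, since paged attention only keeps the last max_past
# tokens of KV cache.
a = torch.clamp(input_lengths, max=max_past_tensor)
b = input_lengths.clamp(max=max_past_tensor)

assert torch.equal(a, b)
print(b)  # tensor([ 100, 4096, 4096], dtype=torch.int32)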