hotfix for quantization: skip the ROCm SiLU single-row MLP branch in LlamaMLP and MistralMLP when config.quantize is set

2025-09-11 20:34:54 +00:00 · 2024-05-17 17:18:40 +00:00 · 2024-05-17 17:18:40 +00:00 · 7a5f5d9757
commit 7a5f5d9757
parent f82ae76dff
2 changed files with 8 additions and 0 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -230,11 +230,15 @@ class LlamaMLP(nn.Module):
            config.intermediate_size // weights.process_group.size()
        )

+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
    def forward(self, hidden_states):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.shape[0] == 1
+            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -290,11 +290,15 @@ class MistralMLP(nn.Module):
            config.intermediate_size // weights.process_group.size()
        )

+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
    def forward(self, hidden_states):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.shape[0] == 1
+            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],