Fix __call__ vs forward.

2025-09-10 20:04:52 +00:00 · 2023-09-07 14:02:34 +00:00 · 2023-09-07 14:02:34 +00:00 · 07bc903d6e
commit 07bc903d6e
parent b03d2621a7
1 changed files with 2 additions and 1 deletions
--- a/server/text_generation_server/utils/gptq/exllama.py
+++ b/server/text_generation_server/utils/gptq/exllama.py
@@ -69,10 +69,11 @@ def create_exllama_buffers():
    TEMP_STATE, TEMP_DQ = temp_state, temp_dq
-class Ex4bitLinear:
+class Ex4bitLinear(torch.nn.Module):
    """Linear layer implementation with per-group 4-bit quantization of the weights"""
    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
        super().__init__()
        global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
        assert bits == 4