Fixing exl2 and other quantize tests again.

2025-09-12 04:44:52 +00:00 · 2024-08-14 16:30:46 +02:00 · 2024-08-14 16:30:46 +02:00 · f4ce670eb0
commit f4ce670eb0
parent 9aaa12e7ac
4 changed files with 5 additions and 2 deletions
--- a/server/Makefile-exllamav2
+++ b/server/Makefile-exllamav2
@@ -1,7 +1,7 @@
-exllamav2_commit := v0.1.8
+exllamav2_commit := 872386c89eaebe0bde5b245a890f1da9522768b3

 build-exllamav2:
-	git clone https://github.com/turboderp/exllamav2.git exllamav2 && \
+	git clone https://github.com/Narsil/exllamav2.git exllamav2 && \
 	cd exllamav2 && git fetch && git checkout $(exllamav2_commit)  && \
 	git submodule update --init --recursive && \
 	pip install -r requirements.txt && \
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -652,6 +652,7 @@ class CausalLM(Model):
            dtype=dtype,
            device=device,
        )
+        self.quantize = quantize
        return self

    @property
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@@ -412,6 +412,7 @@ class Mamba(Model):
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
+        self.quantize = quantize
        self.process_group, _rank, world_size = initialize_torch_distributed()
        if world_size > 1:
            raise RuntimeError("Mamba does not support Tensor Parallelism (TP)")
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -676,6 +676,7 @@ class Seq2SeqLM(Model):
            dtype=dtype,
            device=device,
        )
+        self.quantize = quantize
        return self

    @property