fix(server): fix quantization

2025-07-05 15:30:19 +00:00 · 2023-05-30 13:56:03 +02:00 · 2023-05-30 13:56:03 +02:00 · bf7f1d5434
commit bf7f1d5434
parent 49a6c8c1b2
4 changed files with 24 additions and 32 deletions
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -246,9 +246,7 @@ class BLOOMSharded(BLOOM):
                            module.linear = replace_linear(state)
                    elif quantize == "gptq":
-                            raise NotImplementedError(
+                        raise NotImplementedError("`gptq` is not implemented for now")
-                                "`gptq` is not implemented for now"
-                            )
                    elif quantize is None:
                        tensor = tensor.to(device)
                    else:
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -365,9 +365,7 @@ class GalacticaSharded(Galactica):
                            module.linear = replace_linear(state)
                    elif quantize == "gptq":
-                            raise NotImplementedError(
+                        raise NotImplementedError("`gptq` is not implemented for now")
-                                "`gptq` is not implemented for now"
-                            )
                    elif quantize is None:
                        tensor = tensor.to(device)
                    else:
--- a/server/text_generation_server/models/gpt_neox.py
+++ b/server/text_generation_server/models/gpt_neox.py
@@ -211,9 +211,7 @@ class GPTNeoxSharded(CausalLM):
                            module.linear = replace_linear(state)
                    elif quantize == "gptq":
-                            raise NotImplementedError(
+                        raise NotImplementedError("`gptq` is not implemented for now")
-                                "`gptq` is not implemented for now"
-                            )
                    elif quantize is None:
                        tensor = tensor.to(device)
                    else:
--- a/server/text_generation_server/models/t5.py
+++ b/server/text_generation_server/models/t5.py
@@ -224,10 +224,8 @@ class T5Sharded(Seq2SeqLM):
                            module.linear = replace_linear(state)
                    elif quantize == "gptq" and not module_name.endswith("wo"):
-                            raise NotImplementedError(
+                        raise NotImplementedError("`gptq` is not implemented for now")
-                                "`gptq` is not implemented for now"
+                    elif quantize is None or module_name.endswith("wo"):
-                            )
-                        elif quantize is None:
                        tensor = tensor.to(device)
                    else:
                        raise ValueError(f"Unexpected quantize `{quantize}`")