Fixing OOM on non-sharded (single-process) loading.

2025-09-10 20:04:52 +00:00 · 2023-07-12 12:46:02 +00:00 · 2023-07-12 12:46:02 +00:00 · f764bc1b52
commit f764bc1b52
parent 6193512c4b
1 changed file with 10 additions and 6 deletions
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@ -181,6 +181,7 @@ class TensorParallelHead(SuperLayer):
    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            try:
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
@ -189,6 +190,9 @@ class TensorParallelHead(SuperLayer):
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
        # GPTQ doesn't quantize heads (nor embeddings)
        if config.quantize == "gptq":