Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 20:04:52 +00:00)
Fixing OOM on non-sharded runs.
parent 6193512c4b
commit f764bc1b52
@@ -181,12 +181,16 @@ class TensorParallelHead(SuperLayer):
 
     @staticmethod
     def load(config, prefix: str, weights):
-        try:
-            weight = weights.get_sharded(f"{prefix}.weight", dim=0)
-            should_gather = True
-        except AssertionError:
-            # If the vocab size is not divisible by number of shards
-            # just load the entire thing.
+        if weights.process_group.size() > 1:
+            try:
+                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
+                should_gather = True
+            except AssertionError:
+                # If the vocab size is not divisible by number of shards
+                # just load the entire thing.
+                weight = weights.get_tensor(f"{prefix}.weight")
+                should_gather = False
+        else:
             weight = weights.get_tensor(f"{prefix}.weight")
             should_gather = False
 
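For illustration, here is a minimal, self-contained Python sketch of the control flow this patch introduces. The _FakeWeights and _FakeProcessGroup classes below are hypothetical stand-ins, not the real text-generation-inference Weights class; only get_sharded, get_tensor, and process_group.size() are modelled, mirroring the calls that appear in the diff.

# Minimal sketch of the patched load() logic (stand-in stubs, not the real
# text-generation-inference classes; only the control flow mirrors the diff).

class _FakeProcessGroup:
    def __init__(self, size: int):
        self._size = size

    def size(self) -> int:
        return self._size


class _FakeWeights:
    """Stub mimicking the get_sharded/get_tensor calls used in the diff."""

    def __init__(self, vocab_size: int, world_size: int):
        self.vocab_size = vocab_size
        self.process_group = _FakeProcessGroup(world_size)

    def get_sharded(self, name: str, dim: int):
        # Sharded loading is assumed to assert divisibility by the world size.
        assert self.vocab_size % self.process_group.size() == 0, (
            f"{name}: vocab size {self.vocab_size} not divisible by "
            f"{self.process_group.size()} shards"
        )
        return f"shard of {name}"

    def get_tensor(self, name: str):
        return f"full {name}"


def load_head_weight(prefix: str, weights: _FakeWeights):
    # Patched behaviour: only attempt the sharded path when actually sharded.
    # A single-process (non-sharded) run loads the full tensor directly and
    # marks should_gather = False, so no gather buffer is needed.
    if weights.process_group.size() > 1:
        try:
            weight = weights.get_sharded(f"{prefix}.weight", dim=0)
            should_gather = True
        except AssertionError:
            # Vocab size not divisible by the number of shards:
            # fall back to loading the entire tensor on every rank.
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
    else:
        weight = weights.get_tensor(f"{prefix}.weight")
        should_gather = False
    return weight, should_gather


if __name__ == "__main__":
    # Non-sharded run: full tensor, no gather.
    print(load_head_weight("lm_head", _FakeWeights(vocab_size=32000, world_size=1)))
    # Sharded run, divisible vocab: sharded weight, gather needed.
    print(load_head_weight("lm_head", _FakeWeights(vocab_size=32000, world_size=2)))
    # Sharded run, non-divisible vocab: fall back to the full tensor.
    print(load_head_weight("lm_head", _FakeWeights(vocab_size=32001, world_size=2)))

The first case shows the effect of the patch: a single-process run never takes the sharded/gather path and simply loads the full tensor once, which appears to be what the commit title ("Fixing OOM on non sharded") refers to.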