Tiny fixes for falcon.

This commit is contained in:
Nicolas Patry 2023-06-14 09:29:44 +02:00
parent e5e552b496
commit 55cf4d257c
2 changed files with 7 additions and 4 deletions

View File

@ -21,7 +21,8 @@ from text_generation_server.utils.layers import (
def load_row(config, prefix: str, weights, bias: bool):
weight = weights.get_sharded(f"{prefix}.weight", dim=1)
weight = weights.get_multi_weights_col([prefix], quantize=config.quantize, dim=1)
if bias and weights.process_group.rank() == 0:
# Bias is only loaded on the first-rank process (rank 0)
bias = weights.get_tensor(f"{prefix}.bias")

View File

@ -205,7 +205,7 @@ class GPTQ:
def print_loss(self, name, q_weight, weight_error, timecost):
table = Texttable()
length = 30
length = 28
name = (
(name + " " * (length - len(name)))
if len(name) <= length
@ -1165,10 +1165,12 @@ def quantize(
f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
f"index located at {save_index_file}."
)
config = AutoConfig.from_pretrained(model_id)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
config.save_pretrained(output_dir)
logger.info("Saved config")
logger.info("Saving tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(
model_id, trust_remote_code=trust_remote_code
)
tokenizer.save_pretrained(output_dir)
logger.info("Saved tokenizer")