Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-05-21 01:32:08 +00:00

Fix HF_HUB_OFFLINE=1 for Gaudi backend (#3193)

* Fix `HF_HUB_OFFLINE=1` for Gaudi backend
* Fix HF cache default value in server.rs
* Format

This commit is contained in:
parent 7253be349a
commit f208ba6afc
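For orientation, this is the mode the fix targets: with `HF_HUB_OFFLINE=1` the Gaudi backend must resolve every model artifact from the local Hugging Face cache and never call the Hub. A minimal sketch of such an environment (the cache path below is hypothetical):

import os

# Hypothetical offline deployment environment: all weights must already sit in
# the local cache, because Hub requests are disallowed.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/data/hf-cache")  # hypothetical path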
@@ -8,7 +8,7 @@ PYTORCH_VERSION := 2.6.0
 .PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install

 image:
-	docker build -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)
+	docker build --ulimit nofile=4096 -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)

 run-local-dev-container:
 	docker run -it \
@@ -4,6 +4,7 @@ import bisect
 from dataclasses import dataclass
 from functools import wraps
 import itertools
+import json
 import math
 import os
 import tempfile
@@ -17,15 +18,12 @@ from loguru import logger
 from opentelemetry import trace
 import text_generation_server.habana_quantization_env as hq_env
+from text_generation_server.utils import weight_files
 import habana_frameworks.torch as htorch
 from optimum.habana.utils import HabanaProfile
 from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
 from text_generation_server.utils.chunks import concat_text_chunks
-from optimum.habana.checkpoint_utils import (
-    get_repo_root,
-    model_on_meta,
-    write_checkpoints_json,
-)
+from optimum.habana.checkpoint_utils import model_on_meta
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
@@ -708,6 +706,9 @@ class CausalLM(Model):
         if hq_env.is_quantization_enabled:
             htorch.core.hpu_set_env()

+        # Get weight files
+        weight_files(model_id, revision=revision, extension=".safetensors")
+
         if world_size > 1:
             os.environ.setdefault(
                 "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1"
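The new up-front call goes through TGI's own `weight_files` helper, which locates the model's already-downloaded `.safetensors` shards in the local cache, so the later load path no longer needs optimum-habana's repo-root helpers. A minimal usage sketch, with a placeholder model id:

from text_generation_server.utils import weight_files

# Resolve cached .safetensors shards; under HF_HUB_OFFLINE=1 this must succeed
# purely from the local cache. "my-org/my-model" is a placeholder.
files = weight_files("my-org/my-model", revision=None, extension=".safetensors")
print([str(f) for f in files])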
@@ -715,8 +716,6 @@ class CausalLM(Model):
             model = self.get_deepspeed_model(model_id, dtype, revision)
             model = hq_env.prepare_model_for_quantization(model)
         else:
-            get_repo_root(model_id)
-
             # Check support for rope scaling
             model_kwargs = {}
             config = AutoConfig.from_pretrained(model_id)
@@ -868,7 +867,6 @@ class CausalLM(Model):
             with deepspeed.OnDevice(dtype=dtype, device="meta"):
                 model = AutoModelForCausalLM.from_config(config, torch_dtype=dtype)
         else:
-            get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK"))
             # TODO: revisit placement on CPU when auto-injection is possible
             with deepspeed.OnDevice(dtype=dtype, device="cpu"):
                 model = AutoModelForCausalLM.from_pretrained(
@@ -884,7 +882,16 @@ class CausalLM(Model):
         if load_to_meta:
             # model loaded to meta is managed differently
             checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
-            write_checkpoints_json(model_id, local_rank, checkpoints_json)
+            checkpoint_files = [
+                str(f)
+                for f in weight_files(
+                    model_id, revision=revision, extension=".safetensors"
+                )
+            ]
+            data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
+            json.dump(data, checkpoints_json)
+            checkpoints_json.flush()
+
         ds_inference_kwargs["checkpoint"] = checkpoints_json.name
         model = deepspeed.init_inference(model, **ds_inference_kwargs)
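Instead of optimum-habana's `write_checkpoints_json`, the checkpoint descriptor handed to DeepSpeed is now built directly from the paths returned by `weight_files`. A standalone sketch of the descriptor shape, with hypothetical shard paths:

import json
import tempfile

# Hypothetical shard paths; in the server they come from weight_files(...).
checkpoint_files = [
    "/data/hf-cache/models--my-org--my-model/snapshots/abc123/model-00001-of-00002.safetensors",
    "/data/hf-cache/models--my-org--my-model/snapshots/abc123/model-00002-of-00002.safetensors",
]
data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}

with tempfile.NamedTemporaryFile(suffix=".json", mode="+w", delete=False) as checkpoints_json:
    json.dump(data, checkpoints_json)
    # The file name is what gets passed as ds_inference_kwargs["checkpoint"].
    print(checkpoints_json.name)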
@@ -1,3 +1,4 @@
+import json
 import re
 import torch
 import os
@@ -12,6 +13,7 @@ import tempfile
 import copy
 from text_generation_server.models import Model
 from transformers import PreTrainedTokenizerBase
+from text_generation_server.utils import weight_files
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.causal_lm import (
@ -43,11 +45,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoConfig,
|
||||
)
|
||||
from optimum.habana.checkpoint_utils import (
|
||||
get_repo_root,
|
||||
model_on_meta,
|
||||
write_checkpoints_json,
|
||||
)
|
||||
from optimum.habana.checkpoint_utils import model_on_meta
|
||||
|
||||
from text_generation_server.utils.speculate import get_speculate
|
||||
from text_generation_server.models.types import (
|
||||
@@ -840,6 +838,9 @@ class VlmCausalLM(Model):
         if hq_env.is_quantization_enabled:
             htorch.core.hpu_set_env()

+        # Get weight files
+        weight_files(model_id, revision=revision, extension=".safetensors")
+
         if world_size > 1:
             os.environ.setdefault(
                 "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1"
@@ -847,8 +848,6 @@ class VlmCausalLM(Model):
             model = self.get_deepspeed_model(model_class, model_id, dtype, revision)
             model = hq_env.prepare_model_for_quantization(model)
         else:
-            get_repo_root(model_id)
-
             # Check support for rope scaling
             model_kwargs = {}
             config = AutoConfig.from_pretrained(model_id)
@@ -1000,7 +999,6 @@ class VlmCausalLM(Model):
             with deepspeed.OnDevice(dtype=dtype, device="meta"):
                 model = model_class.from_config(config, torch_dtype=dtype)
         else:
-            get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK"))
             # TODO: revisit placement on CPU when auto-injection is possible
             with deepspeed.OnDevice(dtype=dtype, device="cpu"):
                 model = model_class.from_pretrained(
@@ -1019,7 +1017,15 @@ class VlmCausalLM(Model):
         if load_to_meta:
             # model loaded to meta is managed differently
             checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
-            write_checkpoints_json(model_id, local_rank, checkpoints_json)
+            checkpoint_files = [
+                str(f)
+                for f in weight_files(
+                    model_id, revision=revision, extension=".safetensors"
+                )
+            ]
+            data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
+            json.dump(data, checkpoints_json)
+            checkpoints_json.flush()
         ds_inference_kwargs["checkpoint"] = checkpoints_json.name
         model = deepspeed.init_inference(model, **ds_inference_kwargs)
@@ -1578,7 +1578,7 @@ pub async fn run(
         let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
             .map_err(|_| ())
             .map(|cache_dir| Cache::new(cache_dir.into()))
-            .unwrap_or_else(|_| Cache::default());
+            .unwrap_or_else(|_| Cache::from_env());
         tracing::warn!("Offline mode active using cache defaults");
         Type::Cache(cache)
     } else {
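On the router side, the offline cache now comes from `Cache::from_env()` rather than `Cache::default()`, so when `HUGGINGFACE_HUB_CACHE` is unset the fallback presumably still honors the standard Hugging Face environment variables instead of a fixed path. A rough Python sketch of that kind of lookup order; the exact precedence inside hf-hub's `Cache::from_env()` is an assumption here, not confirmed by this diff:

import os
from pathlib import Path

def resolve_hub_cache() -> Path:
    # Assumed precedence: explicit HUGGINGFACE_HUB_CACHE, then HF_HOME/hub,
    # then ~/.cache/huggingface/hub. Illustration only.
    explicit = os.environ.get("HUGGINGFACE_HUB_CACHE")
    if explicit:
        return Path(explicit)
    hf_home = os.environ.get(
        "HF_HOME", os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    )
    return Path(hf_home) / "hub"

print(resolve_hub_cache())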