Mirror of https://github.com/huggingface/text-generation-inference.git
feat(server): have FlashGPTNeoXModel support HF accelerate
It seems to work fine and loads 4-10x faster for me, depending on the storage/page cache (non-sharded 20B-parameter model). However, when loaded this way, inference appears to be 10-15% slower for some reason.
parent b927244eb5
commit 252a086e9b
@@ -505,7 +505,7 @@ class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
     config_class = GPTNeoXConfig
     base_model_prefix = "gpt_neox"
     supports_gradient_checkpointing = False
-    _no_split_modules = None
+    _no_split_modules = ["FlashNeoXLayer"]
 
 
 class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):