Mirror of https://github.com/huggingface/text-generation-inference.git
feat(server): have FlashGPTNeoXModel support HF accelerate
It seems to work fine and loads 4-10x faster for me, depending on the storage/page cache (non-sharded 20B-parameter model). However, when loaded this way, inference appears to be 10-15% slower for some reason.
parent b927244eb5
commit 252a086e9b
@@ -505,7 +505,7 @@ class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
     config_class = GPTNeoXConfig
     base_model_prefix = "gpt_neox"
     supports_gradient_checkpointing = False
-    _no_split_modules = None
+    _no_split_modules = ["FlashNeoXLayer"]
 
 
 class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):