Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-23 16:02:10 +00:00
Revert PR 235 as flash attention is not really enabled for gemma (#239)
parent c5e3881051
commit b126bf4785
@@ -694,13 +694,12 @@ class CausalLM(Model):
             "return_dict": True,
         }
 
-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
 
             if model.config.model_type not in ["falcon"]:
                 kwargs["attn_softmax_bf16"] = True
 
-            if model.config.model_type not in ["gemma"]:
-                kwargs["trim_logits"] = True
+            kwargs["trim_logits"] = True
 
             if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
                 kwargs["use_flash_attention"] = True
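For reference, a minimal sketch of how the kwargs selection reads after the revert. The build_generation_kwargs helper is hypothetical (in the repository this logic sits inline in CausalLM); the model-type list, the attn_softmax_bf16, trim_logits, and use_flash_attention flags, and the USE_FLASH_ATTENTION environment variable are taken from the hunk above.

import os


def build_generation_kwargs(model_type: str) -> dict:
    # Hypothetical helper mirroring the post-revert hunk; the real logic
    # lives inline in CausalLM rather than in a standalone function.
    kwargs = {"return_dict": True}

    # "gemma" is no longer in this list after the revert.
    if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
        # falcon does not get the bf16 softmax flag.
        if model_type not in ["falcon"]:
            kwargs["attn_softmax_bf16"] = True

        # trim_logits is set unconditionally again; the gemma-specific
        # guard introduced by PR 235 is removed.
        kwargs["trim_logits"] = True

        # Flash attention stays opt-in via an environment variable.
        if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
            kwargs["use_flash_attention"] = True

    return kwargs


print(build_generation_kwargs("llama"))
# -> {'return_dict': True, 'attn_softmax_bf16': True, 'trim_logits': True}
#    (plus 'use_flash_attention': True when USE_FLASH_ATTENTION=true)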