Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
force attn to flashdecoding
commit 2a10a28d08
parent a7353c35e8
@@ -158,7 +158,7 @@ fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<String>) ->
             prefix_caching = Some("0".to_string());
         }
         match config.model_type.as_deref() {
-            Some("falcon") | Some("deepseek_v2") => {
+            Some("falcon") | Some("deepseek_v2") | Some("llama4") => {
                 // Required because gemma2 needs bfloat16 which is not supported by
                 // flashinfer ?
                 if attention.is_none() {
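For context, this hunk adds "llama4" to the set of model types whose attention backend is forced to a fallback when the user has not chosen one, matching the commit title "force attn to flashdecoding". Below is a minimal, self-contained sketch of that logic; the function name resolve_attention_sketch, the FALLBACK_ATTENTION constant, and the simplified signature are assumptions for illustration and are not taken from this diff.

// Sketch only: names and the exact fallback string are assumptions,
// modeled on the match arm shown in the hunk above.
const FALLBACK_ATTENTION: &str = "flashdecoding";

fn resolve_attention_sketch(
    model_type: Option<&str>,
    mut attention: Option<String>,
) -> Option<String> {
    match model_type {
        // llama4 joins falcon and deepseek_v2 in being forced onto the fallback.
        Some("falcon") | Some("deepseek_v2") | Some("llama4") => {
            if attention.is_none() {
                // Only override when no backend was explicitly requested.
                attention = Some(FALLBACK_ATTENTION.to_string());
            }
        }
        _ => {}
    }
    attention
}

fn main() {
    // llama4 with no explicit backend now resolves to flashdecoding.
    assert_eq!(
        resolve_attention_sketch(Some("llama4"), None).as_deref(),
        Some("flashdecoding")
    );
    // An explicit user choice is left untouched.
    assert_eq!(
        resolve_attention_sketch(Some("llama4"), Some("flashinfer".to_string())).as_deref(),
        Some("flashinfer")
    );
}

Note the guard on attention.is_none(): the override applies only as a default, so a user-supplied backend still wins.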