force attn to flashdecoding

Mohit Sharma 2025-04-11 15:24:12 +00:00
parent a7353c35e8
commit 2a10a28d08


@@ -158,7 +158,7 @@ fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<String>) ->
             prefix_caching = Some("0".to_string());
         }
         match config.model_type.as_deref() {
-            Some("falcon") | Some("deepseek_v2") => {
+            Some("falcon") | Some("deepseek_v2") | Some("llama4") => {
                 // Required because gemma2 needs bfloat16 which is not supported by
                 // flashinfer ?
                 if attention.is_none() {
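
For context, a minimal sketch of what this kind of match arm does: when the model type is one of the listed architectures and the user has not chosen an attention backend, the launcher forces the fallback backend ("flashdecoding", per the commit title). This is an illustrative standalone function, not the repository's actual code; the function name resolve_attention_sketch, the default "flashinfer", and the signature are assumptions made for the example.

// Minimal sketch, assuming a launcher-style resolver.
// Only the match arm mirrors the diff above; everything else is illustrative.
fn resolve_attention_sketch(model_type: Option<&str>, mut attention: Option<String>) -> String {
    match model_type {
        // Models assumed to need the fallback attention backend.
        Some("falcon") | Some("deepseek_v2") | Some("llama4") => {
            if attention.is_none() {
                // Force the fallback only when the user did not pick a backend.
                attention = Some("flashdecoding".to_string());
            }
        }
        _ => {}
    }
    // Otherwise keep the user's choice, or fall back to the assumed default.
    attention.unwrap_or_else(|| "flashinfer".to_string())
}

fn main() {
    // llama4 with no explicit choice is forced to flashdecoding.
    assert_eq!(resolve_attention_sketch(Some("llama4"), None), "flashdecoding");
    // An explicit user choice is left untouched.
    assert_eq!(
        resolve_attention_sketch(Some("llama4"), Some("paged".to_string())),
        "paged"
    );
}

With this change, llama4 is treated like falcon and deepseek_v2: unless the user explicitly sets an attention backend, it is switched away from the default.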