From a0ab962b6d7a804ad466298af69d3402e7536ed9 Mon Sep 17 00:00:00 2001
From: drbh
Date: Wed, 22 Jan 2025 18:30:03 +0000
Subject: [PATCH] fix: limit vision flop calc to qwen2 vl models and update config typing

---
 launcher/src/main.rs                          | 78 +++++++++++-------
 .../text_generation_server/layers/rotary.py   | 16 ++--
 2 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 8e93b1b2..6cbdb1d6 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -231,12 +231,12 @@ struct QuantizationConfig {
 
 #[derive(Debug, Deserialize)]
 struct VisionConfig {
-    depth: usize,
-    embed_dim: usize,
-    mlp_ratio: usize,
-    in_chans: usize,
-    patch_size: usize,
-    temporal_patch_size: usize,
+    depth: Option<usize>,
+    embed_dim: Option<usize>,
+    mlp_ratio: Option<usize>,
+    in_chans: Option<usize>,
+    patch_size: Option<usize>,
+    temporal_patch_size: Option<usize>,
 }
 
 #[derive(Debug, Deserialize)]
@@ -283,33 +283,45 @@ impl Config {
 
         tracing::debug!("Text flops: {}", human_size(text_flops as usize, "flop"));
 
-        if let Some(vision_config) = self.vision_config.as_ref() {
-            let in_chans = vision_config.in_chans as u64;
-            let patch_size = vision_config.patch_size as u64;
-            let embed_dim = vision_config.embed_dim as u64;
-            let vision_depth = vision_config.depth as u64;
-            let mlp_ratio = vision_config.mlp_ratio as u64;
-            let temporal_patch_size = vision_config.temporal_patch_size as u64;
-            // 1. patch embedding:
-            //    - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2
-            //      where the 2 accounts for multiply-add
-            let patch_flops = 2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans;
-            // 2. self-attention + mlp:
-            //    - qkv projections: 3 * d_model * d_model * 2
-            //    - attention: d_model * d_model * 2
-            //    - mlp: 2 * d_model * (mlp_ratio * d_model) * 2
-            //    simplified to: 2 * d_model * (4 + mlp_ratio * d_model)
-            let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim);
-            // 3. add with layer norm flops for total vision layer flops
-            let layer_flops = patch_flops + attn_flops + 2 * embed_dim;
-            let vision_flops = layer_flops * vision_depth;
-            tracing::debug!(
-                "Vision flops: {}",
-                human_size(vision_flops as usize, "flop")
-            );
-            Some(text_flops + vision_flops)
-        } else {
-            Some(text_flops)
+        // text-only case
+        if self.vision_config.is_none() {
+            return Some(text_flops);
+        }
+
+        let vision_config = self.vision_config.as_ref().unwrap();
+
+        // estimate vision flops for specific model types
+        match self.model_type.as_deref() {
+            Some("qwen2_vl") => {
+                let in_chans = vision_config.in_chans? as u64;
+                let patch_size = vision_config.patch_size? as u64;
+                let embed_dim = vision_config.embed_dim? as u64;
+                let vision_depth = vision_config.depth? as u64;
+                let mlp_ratio = vision_config.mlp_ratio? as u64;
+                let temporal_patch_size = vision_config.temporal_patch_size? as u64;
+                // 1. patch embedding:
+                //    - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2
+                //      where the 2 accounts for multiply-add
+                let patch_flops =
+                    2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans;
+                // 2. self-attention + mlp:
+                //    - qkv projections: 3 * d_model * d_model * 2
+                //    - attention: d_model * d_model * 2
+                //    - mlp: 2 * d_model * (mlp_ratio * d_model) * 2
+                //    simplified to: 2 * d_model * (4 + mlp_ratio * d_model)
+                let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim);
+                // 3. add with layer norm flops for total vision layer flops
+                let layer_flops = patch_flops + attn_flops + 2 * embed_dim;
+                let vision_flops = layer_flops * vision_depth;
+                tracing::debug!(
+                    "Vision flops: {}",
+                    human_size(vision_flops as usize, "flop")
+                );
+                Some(text_flops + vision_flops)
+            }
+            // model has a vision config but is not supported for flops calculation
+            // we return None to avoid overestimating the memory requirements
+            _ => return None,
         }
     }
 
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
index 061bf024..9f1770ff 100644
--- a/server/text_generation_server/layers/rotary.py
+++ b/server/text_generation_server/layers/rotary.py
@@ -86,15 +86,21 @@ class PositionRotaryEmbedding(nn.Module):
             # `rope_type` is now standard in transformers, but some existing models
             # have `type` instead.
             rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+            mrope_section = rope_scaling.get("mrope_section", None)
+
+            # only apply mrope if sections are provided and the rope type is mrope or default
+            if mrope_section is not None and (
+                rope_type == "mrope" or rope_type == "default"
+            ):
+                mrope_section = rope_scaling.get("mrope_section")
+                return RotaryPositionEmbeddingMultimodalSections(
+                    inv_freq, scaling_factor, mrope_section
+                )
 
             if rope_type == "linear":
                 pass
             elif rope_type == "default":
-                if rope_scaling.get("mrope_section", False):
-                    mrope_section = rope_scaling.get("mrope_section")
-                    return RotaryPositionEmbeddingMultimodalSections(
-                        inv_freq, scaling_factor, mrope_section
-                    )
+                pass
             elif rope_type == "dynamic":
                 scaling_factor = rope_scaling["factor"]
                 return DynamicPositionRotaryEmbedding(