Mirror of https://github.com/huggingface/text-generation-inference.git
fix: limit vision flop calc to qwen2 vl models and update config typing
Commit a0ab962b6d (parent d12e075966)
@@ -231,12 +231,12 @@ struct QuantizationConfig {

 #[derive(Debug, Deserialize)]
 struct VisionConfig {
-    depth: usize,
-    embed_dim: usize,
-    mlp_ratio: usize,
-    in_chans: usize,
-    patch_size: usize,
-    temporal_patch_size: usize,
+    depth: Option<usize>,
+    embed_dim: Option<usize>,
+    mlp_ratio: Option<usize>,
+    in_chans: Option<usize>,
+    patch_size: Option<usize>,
+    temporal_patch_size: Option<usize>,
 }

 #[derive(Debug, Deserialize)]
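
A minimal, self-contained sketch of what the new Option<usize> typing buys (it assumes serde and serde_json with the derive feature, and the JSON values are illustrative): serde deserializes a missing key into None for Option fields instead of erroring, so vision configs from model families that lack these Qwen2-VL-specific keys still parse.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct VisionConfig {
    depth: Option<usize>,
    patch_size: Option<usize>,
}

fn main() {
    // Qwen2-VL-style config: keys present, so the fields are Some(..)
    let qwen: VisionConfig =
        serde_json::from_str(r#"{"depth": 32, "patch_size": 14}"#).unwrap();
    assert_eq!(qwen.depth, Some(32));

    // a vision config without these keys no longer fails to deserialize
    let other: VisionConfig = serde_json::from_str("{}").unwrap();
    assert!(other.depth.is_none() && other.patch_size.is_none());
}
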
@@ -283,33 +283,45 @@ impl Config {

         tracing::debug!("Text flops: {}", human_size(text_flops as usize, "flop"));

-        if let Some(vision_config) = self.vision_config.as_ref() {
-            let in_chans = vision_config.in_chans as u64;
-            let patch_size = vision_config.patch_size as u64;
-            let embed_dim = vision_config.embed_dim as u64;
-            let vision_depth = vision_config.depth as u64;
-            let mlp_ratio = vision_config.mlp_ratio as u64;
-            let temporal_patch_size = vision_config.temporal_patch_size as u64;
-            // 1. patch embedding:
-            // - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2
-            // where the 2 accounts for multiply-add
-            let patch_flops = 2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans;
-            // 2. self-attention + mlp:
-            // - qkv projections: 3 * d_model * d_model * 2
-            // - attention: d_model * d_model * 2
-            // - mlp: 2 * d_model * (mlp_ratio * d_model) * 2
-            // simplified to: 2 * d_model * (4 + mlp_ratio * d_model)
-            let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim);
-            // 3. add with layer norm flops for total vision layer flops
-            let layer_flops = patch_flops + attn_flops + 2 * embed_dim;
-            let vision_flops = layer_flops * vision_depth;
-            tracing::debug!(
-                "Vision flops: {}",
-                human_size(vision_flops as usize, "flop")
-            );
-            Some(text_flops + vision_flops)
-        } else {
-            Some(text_flops)
+        // text-only case
+        if self.vision_config.is_none() {
+            return Some(text_flops);
+        }
+
+        let vision_config = self.vision_config.as_ref().unwrap();
+
+        // estimate vision flops for specific model types
+        match self.model_type.as_deref() {
+            Some("qwen2_vl") => {
+                let in_chans = vision_config.in_chans? as u64;
+                let patch_size = vision_config.patch_size? as u64;
+                let embed_dim = vision_config.embed_dim? as u64;
+                let vision_depth = vision_config.depth? as u64;
+                let mlp_ratio = vision_config.mlp_ratio? as u64;
+                let temporal_patch_size = vision_config.temporal_patch_size? as u64;
+                // 1. patch embedding:
+                // - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2
+                // where the 2 accounts for multiply-add
+                let patch_flops =
+                    2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans;
+                // 2. self-attention + mlp:
+                // - qkv projections: 3 * d_model * d_model * 2
+                // - attention: d_model * d_model * 2
+                // - mlp: 2 * d_model * (mlp_ratio * d_model) * 2
+                // simplified to: 2 * d_model * (4 + mlp_ratio * d_model)
+                let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim);
+                // 3. add with layer norm flops for total vision layer flops
+                let layer_flops = patch_flops + attn_flops + 2 * embed_dim;
+                let vision_flops = layer_flops * vision_depth;
+                tracing::debug!(
+                    "Vision flops: {}",
+                    human_size(vision_flops as usize, "flop")
+                );
+                Some(text_flops + vision_flops)
+            }
+            // model has a vision config but is not supported for flops calculation
+            // we return None to avoid overestimating the memory requirements
+            _ => return None,
         }
     }
 }
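
A standalone sketch of the per-layer estimate above and of how the `?` operator propagates a missing field: any None config value makes the whole calculation return None rather than a bogus estimate. Names and numbers here are illustrative, not taken from a real config.

struct VisionConfig {
    depth: Option<u64>,
    embed_dim: Option<u64>,
    mlp_ratio: Option<u64>,
    in_chans: Option<u64>,
    patch_size: Option<u64>,
    temporal_patch_size: Option<u64>,
}

fn vision_flops(cfg: &VisionConfig) -> Option<u64> {
    // each `?` returns None from the function if the field is absent
    let in_chans = cfg.in_chans?;
    let patch_size = cfg.patch_size?;
    let embed_dim = cfg.embed_dim?;
    let depth = cfg.depth?;
    let mlp_ratio = cfg.mlp_ratio?;
    let temporal_patch_size = cfg.temporal_patch_size?;

    // conv3d patch embedding, with multiply-add counted as 2 flops
    let patch_flops = 2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans;
    // attention + mlp, simplified as in the comments above
    let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim);
    // layer norm adds 2 * d_model, then scale by the number of layers
    Some((patch_flops + attn_flops + 2 * embed_dim) * depth)
}

fn main() {
    // assumed Qwen2-VL-like values, for illustration only
    let cfg = VisionConfig {
        depth: Some(32),
        embed_dim: Some(1280),
        mlp_ratio: Some(4),
        in_chans: Some(3),
        patch_size: Some(14),
        temporal_patch_size: Some(2),
    };
    println!("estimated vision flops: {:?}", vision_flops(&cfg));

    // a config missing any field yields None instead of an overestimate
    let incomplete = VisionConfig { depth: None, ..cfg };
    assert_eq!(vision_flops(&incomplete), None);
}
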
@@ -86,15 +86,21 @@ class PositionRotaryEmbedding(nn.Module):
             # `rope_type` is now standard in transformers, but some existing models
             # have `type` instead.
             rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+            mrope_section = rope_scaling.get("mrope_section", None)
+
+            # only apply mrope if sections are provided and the rope type is mrope or default
+            if mrope_section is not None and (
+                rope_type == "mrope" or rope_type == "default"
+            ):
+                mrope_section = rope_scaling.get("mrope_section")
+                return RotaryPositionEmbeddingMultimodalSections(
+                    inv_freq, scaling_factor, mrope_section
+                )

             if rope_type == "linear":
                 pass
             elif rope_type == "default":
-                if rope_scaling.get("mrope_section", False):
-                    mrope_section = rope_scaling.get("mrope_section")
-                    return RotaryPositionEmbeddingMultimodalSections(
-                        inv_freq, scaling_factor, mrope_section
-                    )
+                pass
             elif rope_type == "dynamic":
                 scaling_factor = rope_scaling["factor"]
                 return DynamicPositionRotaryEmbedding(