Fixing the CI.

This commit is contained in:
Nicolas Patry 2025-04-06 10:18:29 +02:00
parent f2be5f3db4
commit 0b1d253c04
No known key found for this signature in database
GPG Key ID: 87B37D879D09DEB4
6 changed files with 18 additions and 30 deletions

View File

@ -193,7 +193,8 @@ RUN cd server && \
pwd && \ pwd && \
text-generation-server --help text-generation-server --help
RUN uv pip install torchvision --no-deps # This shouldn't be necessary.
# RUN uv pip install torchvision --no-deps
# Copy build artifacts from flash attention builder # Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

View File

@ -155,10 +155,7 @@ fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usiz
let divisor = gcd(h, w); let divisor = gcd(h, w);
let key = (h / divisor, w / divisor); // reduced aspect ratio as key let key = (h / divisor, w / divisor); // reduced aspect ratio as key
if !asp_dict.contains_key(&key) { asp_dict.entry(key).or_default().push((h, w));
asp_dict.insert(key, vec![]);
}
asp_dict.get_mut(&key).unwrap().push((h, w));
} }
} }
@ -176,7 +173,7 @@ fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usiz
fn get_best_fit( fn get_best_fit(
original_height: usize, original_height: usize,
original_width: usize, original_width: usize,
possible_resolutions: &Vec<(usize, usize)>, possible_resolutions: &[(usize, usize)],
resize_to_max_canvas: bool, resize_to_max_canvas: bool,
) -> (usize, usize) { ) -> (usize, usize) {
let orig_h = original_height as f32; let orig_h = original_height as f32;
@ -194,20 +191,13 @@ fn get_best_fit(
let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect(); let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect();
let selected_scale = if !upscaling_options.is_empty() { let selected_scale = if !upscaling_options.is_empty() {
if resize_to_max_canvas { if resize_to_max_canvas {
upscaling_options upscaling_options.into_iter().fold(f32::MIN, f32::max)
.into_iter()
.fold(f32::MIN, f32::max)
} else { } else {
upscaling_options upscaling_options.into_iter().fold(f32::MAX, f32::min)
.into_iter()
.fold(f32::MAX, f32::min)
} }
} else { } else {
let downscaling_options: Vec<f32> = let downscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s < 1.0).collect();
scales.iter().copied().filter(|&s| s < 1.0).collect(); downscaling_options.into_iter().fold(f32::MIN, f32::max)
downscaling_options
.into_iter()
.fold(f32::MIN, f32::max)
}; };
let chosen_canvas: Vec<(usize, usize)> = possible_resolutions let chosen_canvas: Vec<(usize, usize)> = possible_resolutions
@ -375,7 +365,6 @@ pub struct Gemma3 {
vision_config: Gemma3VisionConfig, vision_config: Gemma3VisionConfig,
} }
#[derive(Clone, Debug, Serialize, Deserialize)] #[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")] #[serde(tag = "model_type")]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]

View File

@ -207,7 +207,6 @@ pub struct Llama4Processor {
do_image_splitting: bool, do_image_splitting: bool,
} }
#[derive(Debug, Clone, Deserialize, Default)] #[derive(Debug, Clone, Deserialize, Default)]
pub struct HubProcessorConfig { pub struct HubProcessorConfig {
pub chat_template: Option<ChatTemplateVersions>, pub chat_template: Option<ChatTemplateVersions>,

View File

@ -698,7 +698,8 @@ fn image_tokens(
let image_height = config.image_size(); let image_height = config.image_size();
let patch_size = config.patch_size(); let patch_size = config.patch_size();
let pixel_shuffle_ratio = config.pixel_shuffle_ratio(); let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
let downsample_ratio = (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize; let downsample_ratio =
(1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;
let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width); let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width);
let image_width = image_height; // Assuming pixel shape: [H][W][C] let image_width = image_height; // Assuming pixel shape: [H][W][C]
@ -726,7 +727,7 @@ fn image_tokens(
img_string.push_str(IMAGE_END); img_string.push_str(IMAGE_END);
img_string img_string
}, }
Qwen2Vl(config) => format!( Qwen2Vl(config) => format!(
"<|vision_start|>{:?}<|vision_end|>", "<|vision_start|>{:?}<|vision_end|>",
"<|image_pad|>".repeat(config.get_number_of_features(height, width)) "<|image_pad|>".repeat(config.get_number_of_features(height, width))
@ -770,8 +771,8 @@ fn prepare_input<T: TokenizerTrait>(
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config { let (tokenizer_query, input_chunks) = match config {
Some( Some(
config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_) | Paligemma(_) config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_)
| LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)), | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
) => { ) => {
let mut input_chunks = Vec::new(); let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len()); let mut tokenizer_query = String::with_capacity(inputs.len());

View File

@ -395,7 +395,7 @@ class TransformersFlashVlmCausalLM(VlmCausalLM):
image_grid_thw=image_grid_thw, image_grid_thw=image_grid_thw,
attention_mask=inputs.get("attention_mask", None), attention_mask=inputs.get("attention_mask", None),
use_sdpa=inputs.get("use_sdpa", False), use_sdpa=inputs.get("use_sdpa", False),
cache_position=inputs.get("cache_position", None) cache_position=inputs.get("cache_position", None),
).logits ).logits
logits = self.post_process_outputs(logits, lm_head_indices) logits = self.post_process_outputs(logits, lm_head_indices)
@ -560,9 +560,7 @@ class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM):
class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM): class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM):
def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
inputs = super().pre_process_inputs( inputs = super().pre_process_inputs(input_ids, position_ids, cu_seqlen_prefill)
input_ids, position_ids, cu_seqlen_prefill
)
inputs["cache_position"] = position_ids inputs["cache_position"] = position_ids
inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device) inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device)
return inputs return inputs