fix: improve video processing and update unsupported paths

drbh 2025-01-16 17:20:27 +00:00
parent b27749eba7
commit 78cd756caf
3 changed files with 18 additions and 41 deletions


@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                 height: _,
                 frames: _,
             })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
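
Note: the arm removed here embedded the raw frames as a base64 data URI inside an HTML <video> tag, which is exactly the "very large base64 strings" the old TODO worried about. A minimal sketch of that removed behavior, assuming the base64 crate (0.21+) whose STANDARD engine this file uses:

use base64::{engine::general_purpose::STANDARD, Engine as _};

// Build the inline <video> tag the old code emitted. Base64 inflates the
// payload by roughly 4/3, so even a short raw-frame clip becomes a huge string.
fn video_tag(data: &[u8], width: u32, mimetype: &str) -> String {
    let encoded = STANDARD.encode(data);
    format!(
        r#"<video width="{width}"><source src="data:{mimetype};base64,{encoded}" type="{mimetype}"></video>"#
    )
}

fn main() {
    let fake_frames = vec![0u8; 16]; // stand-in for real video bytes
    println!("{}", video_tag(&fake_frames, 360, "video/mp4"));
}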


@@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
                     tokenizer_query.push_str(&inputs[start..chunk_start]);
                 }
                 let processed_video = match config {
-                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
-                        let default_target_width = 224;
-                        let default_target_height = 224;
-                        fetch_video(
-                            &inputs[chunk_start..chunk_end],
-                            default_target_width,
-                            default_target_height,
-                        )?
-                    }
                     Qwen2Vl(_) => {
                         let target_width = 360;
                         let target_height = 420;
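
With the Idefics / Mllama / Idefics2 / Paligemma / LlavaNext arm gone, Qwen2Vl is the only config left with a video path in prepare_input. A minimal sketch of the narrowed dispatch, using hypothetical stand-ins for the router's config enum and fetch helper:

#[derive(Debug)]
enum Config {
    Qwen2Vl,
    Idefics,
}

// Hypothetical mirror of the match above: only Qwen2-VL resolves a video,
// every other config surfaces an error instead of a default 224x224 fetch.
fn prepare_video(config: &Config, input: &str) -> Result<String, String> {
    match config {
        Config::Qwen2Vl => {
            let (target_width, target_height) = (360, 420);
            Ok(format!("fetch {input} at {target_width}x{target_height}"))
        }
        other => Err(format!("video input is not supported for {other:?}")),
    }
}

fn main() {
    assert!(prepare_video(&Config::Qwen2Vl, "clip.mp4").is_ok());
    assert!(prepare_video(&Config::Idefics, "clip.mp4").is_err());
}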
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                 height: _,
                 num_frames: _,
             }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
         });
         output
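
Both ChunksToString impls now take the same route for video: unimplemented! panics with the message above instead of returning a string. A small illustration of that behavior (not the router's actual types), observable via std::panic::catch_unwind:

// Stand-in for a chunk formatter whose video arm is unimplemented.
fn chunk_to_string(is_video: bool) -> String {
    if is_video {
        unimplemented!("Video tokens are not supported for this model configuration")
    }
    "text chunk".to_string()
}

fn main() {
    // unimplemented! panics, so the video path unwinds instead of returning.
    let caught = std::panic::catch_unwind(|| chunk_to_string(true));
    assert!(caught.is_err());
    println!("{}", chunk_to_string(false));
}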


@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                     images.append([image])
                 elif chunk_type == "video":
                     if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
-                            chunk.video.data, dtype=np.uint8
-                        )
-                        num_bytes = len(video_frame_buf)
-                        bytes_per_frame = num_bytes // chunk.video.frames
-
-                        # iterate over with a stride the size of a frame
-                        frames = []
-                        for i in range(chunk.video.frames):
-                            frame = video_frame_buf[
-                                i * bytes_per_frame : (i + 1) * bytes_per_frame
-                            ]
-                            frame = frame.reshape(
-                                chunk.video.height, chunk.video.width, 3
-                            )
-                            frames.append(frame)
-                        video_frame_buf = np.stack(frames)
-                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
-                            0, 3, 1, 2
-                        )
+                        # reshape via numpy array then convert to torch tensor and permute
+                        frame_nchw_tensor = torch.from_numpy(
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
+                                chunk.video.frames,
+                                chunk.video.height,
+                                chunk.video.width,
+                                3,
+                            )
+                        ).permute(0, 3, 1, 2)
                         videos.append(frame_nchw_tensor)
+                    else:
+                        raise RuntimeError(
+                            f"Model type {config.model_type} does not support video"
+                        )
                 else:
                     raise RuntimeError(f"Invalid chunk type {chunk_type}")
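
The Python refactor folds the per-frame slicing loop into one reshape over the whole buffer; because frames are laid out contiguously, both should yield an identical NCHW tensor. A quick equivalence check with made-up dimensions (the .copy() just avoids torch's warning about non-writable frombuffer arrays):

import numpy as np
import torch

frames, height, width = 4, 8, 6
data = np.random.randint(0, 256, frames * height * width * 3, dtype=np.uint8).tobytes()

# old path: slice the flat buffer frame by frame, then stack
buf = np.frombuffer(data, dtype=np.uint8)
bytes_per_frame = len(buf) // frames
stacked = np.stack(
    [
        buf[i * bytes_per_frame : (i + 1) * bytes_per_frame].reshape(height, width, 3)
        for i in range(frames)
    ]
)
old = torch.from_numpy(stacked).permute(0, 3, 1, 2)

# new path: a single reshape over the full buffer
new = torch.from_numpy(
    np.frombuffer(data, dtype=np.uint8).reshape(frames, height, width, 3).copy()
).permute(0, 3, 1, 2)

assert torch.equal(old, new)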