fix: improve video processing and update unsupported paths

This commit is contained in:
drbh 2025-01-16 17:20:27 +00:00
parent b27749eba7
commit 78cd756caf
3 changed files with 18 additions and 41 deletions

View File

@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
height: _,
frames: _,
})) => {
// TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
let encoded = STANDARD.encode(data);
output.push_str(&format!(
r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
width, mimetype, encoded, mimetype
));
//
// NOTE: serialization of video data is intentionally not supported here
unimplemented!("Video tokens are not supported for this model configuration")
}
// We don't create empty chunks, so this should be unreachable.
None => unreachable!("Chunks should never be empty"),

View File

@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
tokenizer_query.push_str(&inputs[start..chunk_start]);
}
let processed_video = match config {
Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
let default_target_width = 224;
let default_target_height = 224;
fetch_video(
&inputs[chunk_start..chunk_end],
default_target_width,
default_target_height,
)?
}
Qwen2Vl(_) => {
let target_width = 360;
let target_height = 420;
@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
height: _,
num_frames: _,
}) => {
// TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
let encoded = STANDARD.encode(data);
output.push_str(&format!(
r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
width, mimetype, encoded, mimetype
));
// NOTE: serialization of video data is intentionally not supported here
unimplemented!("Video tokens are not supported for this model configuration")
}
});
output

View File

@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
images.append([image])
elif chunk_type == "video":
if config.model_type == "qwen2_vl":
video_frame_buf = np.frombuffer(
chunk.video.data, dtype=np.uint8
)
num_bytes = len(video_frame_buf)
bytes_per_frame = num_bytes // chunk.video.frames
# iterate over with a stride the size of a frame
frames = []
for i in range(chunk.video.frames):
frame = video_frame_buf[
i * bytes_per_frame : (i + 1) * bytes_per_frame
]
frame = frame.reshape(
chunk.video.height, chunk.video.width, 3
)
frames.append(frame)
video_frame_buf = np.stack(frames)
frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
0, 3, 1, 2
# reshape via numpy array then convert to torch tensor and permute
frame_nchw_tensor = torch.from_numpy(
np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
chunk.video.frames,
chunk.video.height,
chunk.video.width,
3,
)
).permute(0, 3, 1, 2)
videos.append(frame_nchw_tensor)
else:
raise RuntimeError(
f"Model type {config.model_type} does not support video"
)
else:
raise RuntimeError(f"Invalid chunk type {chunk_type}")