fix: improve video processing and update unsupported paths

2025-07-13 03:10:17 +00:00 · 2025-01-16 17:20:27 +00:00 · 2025-01-16 17:20:27 +00:00 · 78cd756caf
commit 78cd756caf
parent b27749eba7
3 changed files with 18 additions and 41 deletions
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                height: _,
                frames: _,
            })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
+                // 
-                let encoded = STANDARD.encode(data);
+                // TODO: do not support serialization of video data
-                output.push_str(&format!(
+                unimplemented!("Video tokens are not supported for this model configuration")
                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
                    width, mimetype, encoded, mimetype
                ));
            }
            // We don't create empty chunks, so this should be unreachable.
            None => unreachable!("Chunks should never be empty"),
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
                    tokenizer_query.push_str(&inputs[start..chunk_start]);
                }
                let processed_video = match config {
                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
                        let default_target_width = 224;
                        let default_target_height = 224;
                        fetch_video(
                            &inputs[chunk_start..chunk_end],
                            default_target_width,
                            default_target_height,
                        )?
                    }
                    Qwen2Vl(_) => {
                        let target_width = 360;
                        let target_height = 420;
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                height: _,
                num_frames: _,
            }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
+
-                let encoded = STANDARD.encode(data);
+                // TODO: do not support serialization of video data
-                output.push_str(&format!(
+                unimplemented!("Video tokens are not supported for this model configuration")
                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
                    width, mimetype, encoded, mimetype
                ));
            }
        });
        output
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                        images.append([image])
                elif chunk_type == "video":
                    if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
+                        # reshape via numpy array then convert to torch tensor and permute
-                            chunk.video.data, dtype=np.uint8
+                        frame_nchw_tensor = torch.from_numpy(
-                        )
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
-                        num_bytes = len(video_frame_buf)
+                                chunk.video.frames,
-                        bytes_per_frame = num_bytes // chunk.video.frames
+                                chunk.video.height,
-
+                                chunk.video.width,
-                        # iterate over with a stride the size of a frame
+                                3,
                        frames = []
                        for i in range(chunk.video.frames):
                            frame = video_frame_buf[
                                i * bytes_per_frame : (i + 1) * bytes_per_frame
                            ]
                            frame = frame.reshape(
                                chunk.video.height, chunk.video.width, 3
                            )
                            frames.append(frame)
                        video_frame_buf = np.stack(frames)
                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
                            0, 3, 1, 2
                            )
                        ).permute(0, 3, 1, 2)
                        videos.append(frame_nchw_tensor)
                    else:
                        raise RuntimeError(
                            f"Model type {config.model_type} does not support video"
                        )
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")