Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
fix: improve video processing and update unsupported paths
This commit is contained in:
parent b27749eba7
commit 78cd756caf
```diff
@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                 height: _,
                 frames: _,
             })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                //
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
```
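For reference, the removed branch serialized the raw video bytes straight into the prompt as an HTML `<video>` tag with a base64 data URI, which is what the `format!` call above produced. A minimal Python sketch of that markup (the bytes, width, and mimetype below are made-up placeholders):

```python
import base64

# Made-up placeholders; a real chunk would carry actual video bytes and metadata.
data = b"\x00\x01\x02\x03"
mimetype = "video/mp4"
width = 640

encoded = base64.b64encode(data).decode("ascii")
tag = (
    f'<video width="{width}">'
    f'<source src="data:{mimetype};base64,{encoded}" type="{mimetype}">'
    f"</video>"
)
print(tag)
# <video width="640"><source src="data:video/mp4;base64,AAECAw==" type="video/mp4"></video>
```

Embedding whole videos this way is why the old TODO worried about very large base64 strings; the new code short-circuits with `unimplemented!` instead.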
```diff
@@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
                     tokenizer_query.push_str(&inputs[start..chunk_start]);
                 }
                 let processed_video = match config {
-                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
-                        let default_target_width = 224;
-                        let default_target_height = 224;
-                        fetch_video(
-                            &inputs[chunk_start..chunk_end],
-                            default_target_width,
-                            default_target_height,
-                        )?
-                    }
                     Qwen2Vl(_) => {
                         let target_width = 360;
                         let target_height = 420;
```
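With the unsupported arms gone, only Qwen2-VL keeps a video target resolution (360x420); the 224x224 defaults used by the other model families are removed. A rough Python sketch of a Qwen2-VL-only size lookup in that spirit (the function name and string keys are hypothetical; the router actually matches on config enum variants):

```python
# Hypothetical helper mirroring the post-change dispatch: only Qwen2-VL
# gets a video target resolution, everything else is treated as unsupported.
QWEN2_VL_VIDEO_TARGET = (360, 420)  # (target_width, target_height) from the hunk above

def video_target_size(model_type: str) -> tuple[int, int]:
    if model_type == "qwen2_vl":
        return QWEN2_VL_VIDEO_TARGET
    raise ValueError(f"Video input is not supported for model type {model_type!r}")

print(video_target_size("qwen2_vl"))  # (360, 420)
```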
```diff
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                 height: _,
                 num_frames: _,
             }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
         });
         output
```
```diff
@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                         images.append([image])
                 elif chunk_type == "video":
                     if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
-                            chunk.video.data, dtype=np.uint8
-                        )
-                        num_bytes = len(video_frame_buf)
-                        bytes_per_frame = num_bytes // chunk.video.frames
-
-                        # iterate over with a stride the size of a frame
-                        frames = []
-                        for i in range(chunk.video.frames):
-                            frame = video_frame_buf[
-                                i * bytes_per_frame : (i + 1) * bytes_per_frame
-                            ]
-                            frame = frame.reshape(
-                                chunk.video.height, chunk.video.width, 3
-                            )
-                            frames.append(frame)
-
-                        video_frame_buf = np.stack(frames)
-                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
-                            0, 3, 1, 2
-                        )
+                        # reshape via numpy array then convert to torch tensor and permute
+                        frame_nchw_tensor = torch.from_numpy(
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
+                                chunk.video.frames,
+                                chunk.video.height,
+                                chunk.video.width,
+                                3,
+                            )
+                        ).permute(0, 3, 1, 2)
                         videos.append(frame_nchw_tensor)
                     else:
                         raise RuntimeError(
                             f"Model type {config.model_type} does not support video"
                         )
                 else:
                     raise RuntimeError(f"Invalid chunk type {chunk_type}")
```
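The Python change replaces the frame-by-frame stride slicing with a single reshape over the whole buffer; both readings of the packed RGB bytes yield the same (frames, height, width, 3) array before the permute to NCHW. A small self-contained check with toy dimensions (the buffer below stands in for chunk.video.data):

```python
import numpy as np
import torch

# Toy stand-in for chunk.video.data: 3 frames of 4x5 RGB pixels, tightly packed uint8.
num_frames, height, width = 3, 4, 5
buf = np.arange(num_frames * height * width * 3, dtype=np.uint8).tobytes()

# Old approach: slice the flat buffer with a per-frame stride, reshape each frame, stack.
flat = np.frombuffer(buf, dtype=np.uint8)
bytes_per_frame = len(flat) // num_frames
stacked = np.stack(
    [
        flat[i * bytes_per_frame : (i + 1) * bytes_per_frame].reshape(height, width, 3)
        for i in range(num_frames)
    ]
)

# New approach: one reshape over the whole buffer.
reshaped = np.frombuffer(buf, dtype=np.uint8).reshape(num_frames, height, width, 3)
assert np.array_equal(stacked, reshaped)

# Either way, the frames are then permuted (frames, H, W, C) -> (frames, C, H, W).
nchw = torch.from_numpy(reshaped.copy()).permute(0, 3, 1, 2)
assert nchw.shape == (num_frames, 3, height, width)
```

In the batch code the reshape is a view over the incoming buffer, so the per-frame loop and the intermediate np.stack allocation disappear.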