fix: adjust video process, reduce to 1 fps and adjust tensor shape
commit bc5e202d2c (parent 36e095b38d)
@@ -25,7 +25,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
         ffmpeg \
         libavcodec-dev \
         libavfilter-dev \
-        libavdevice-dev \
+        libavdevice-dev \
         libavformat-dev \
         libavutil-dev \
         libswscale-dev \
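These FFmpeg development packages back the video decoding path. A quick container smoke test (a sketch, not part of the repo) can confirm the shared libraries actually resolve inside the image:

from ctypes.util import find_library

# Each libav*-dev package pulls in a shared library; if any of these
# fail to resolve, video decoding will break at runtime.
for lib in ("avcodec", "avfilter", "avdevice", "avformat", "avutil", "swscale"):
    assert find_library(lib) is not None, f"lib{lib} not found"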
@@ -79,14 +79,19 @@ impl ChunksToString for Vec<InputChunk> {
                 let encoded = STANDARD.encode(data);
                 output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
             }
-            Some(Chunk::Video(video)) => {
-                let encoded = STANDARD.encode(&video.as_bytes());
-                output.push_str(&format!("<video>(data:{};base64,{})", video.mimetype, encoded))
-            }
-            // Some(Chunk::Video(Video { data, mimetype })) => {
-            //     let encoded = STANDARD.encode(data);
-            //     output.push_str(&format!("<video>(data:{};base64,{})", mimetype, encoded))
-            // }
+            Some(Chunk::Video(Video {
+                data,
+                mimetype,
+                width,
+                frames: _,
+            })) => {
+                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
+                let encoded = STANDARD.encode(data);
+                output.push_str(&format!(
+                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
+                    width, mimetype, encoded, mimetype
+                ));
+            }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
         });
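The new arm serializes a video chunk as an inline HTML5 video element whose source embeds the raw bytes as a base64 data URI. A minimal Python sketch of the same rendering (the helper name is made up for illustration):

import base64

def render_video_chunk(data: bytes, mimetype: str, width: int) -> str:
    # Same output as the Rust format string above: an HTML5 <video>
    # tag whose <source> carries the frames as a base64 data URI.
    encoded = base64.b64encode(data).decode("ascii")
    return (
        f'<video width="{width}">'
        f'<source src="data:{mimetype};base64,{encoded}" type="{mimetype}">'
        "</video>"
    )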
@@ -440,8 +440,10 @@ impl State {
                     mimetype: image.mimetype,
                 }),
                 Chunk::Video(video) => client::Chunk::Video(client::Video {
-                    data: video.frames,
+                    data: video.data,
                     mimetype: video.mimetype,
+                    width: video.width,
+                    frames: video.num_frames,
                 }),
             }),
         })
@@ -65,11 +65,17 @@ message Image {
 }
 
 message Video {
-    /// Binary video data.
+    /// Binary video data (array of RGB data)
     bytes data = 1;
 
     /// Video MIME type.
     string mimetype = 2;
+
+    /// Video width
+    uint32 width = 3;
+
+    /// Total number of frames
+    uint32 frames = 4;
 }
 
 message InputChunk {
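Because data is a flat packed-RGB buffer, the frame height is implied by the other fields rather than carried explicitly. A sketch of recovering it on the receiving side (hypothetical helper, mirroring the arithmetic the server does below):

def video_frame_height(data: bytes, width: int, frames: int) -> int:
    # data holds frames x height x width x 3 bytes of packed RGB,
    # so height falls out of the total length.
    bytes_per_frame = len(data) // frames
    height = bytes_per_frame // (3 * width)
    assert height * width * 3 * frames == len(data), "corrupt video payload"
    return height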
@@ -18,7 +18,6 @@ from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
 from text_generation_server.layers.attention import Seqlen
 from text_generation_server.models.metadata_kernels import block_tables_to_ragged
-from torchvision import io
 import math
 
 tracer = trace.get_tracer(__name__)
@@ -240,7 +239,27 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                     images.append([image])
             elif chunk_type == "video":
                 if config.model_type == "qwen2_vl":
-                    videos.append(chunk.video)
+                    video_frame_buf = np.frombuffer(
+                        chunk.video.data, dtype=np.uint8
+                    )
+                    num_bytes = len(video_frame_buf)
+                    bytes_per_frame = num_bytes // chunk.video.frames
+                    height = bytes_per_frame // 3 // chunk.video.width
+
+                    # iterate over the buffer with a stride the size of one frame
+                    frames = []
+                    for i in range(chunk.video.frames):
+                        frame = video_frame_buf[
+                            i * bytes_per_frame : (i + 1) * bytes_per_frame
+                        ]
+                        frame = frame.reshape(height, chunk.video.width, 3)
+                        frames.append(frame)
+
+                    video_frame_buf = np.stack(frames)
+                    frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
+                        0, 3, 1, 2
+                    )
+                    videos.append(frame_nchw_tensor)
             else:
                 raise RuntimeError(f"Invalid chunk type {chunk_type}")
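Since the buffer is packed frame-major, the per-frame loop above is equivalent to a single reshape. A vectorized sketch of the same unpacking (same math, not the code the commit ships):

import numpy as np
import torch

def unpack_video_nchw(data: bytes, frames: int, width: int) -> torch.Tensor:
    buf = np.frombuffer(data, dtype=np.uint8)
    height = len(buf) // frames // 3 // width
    # (frames, height, width, 3) NHWC; copy because frombuffer is read-only
    nhwc = buf.reshape(frames, height, width, 3).copy()
    return torch.from_numpy(nhwc).permute(0, 3, 1, 2)  # NHWC -> NCHW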
@@ -252,20 +271,10 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
         video_inputs = None
         if videos:
             try:
-                video = videos[0]
-                # Frames are already sampled and resized
-                frames = [
-                    torch.from_numpy(np.frombuffer(frame, dtype=np.uint8).reshape(video.height, video.width, 3))
-                    for frame in video.frames
-                ]
-                video_tensor = torch.stack(frames).permute(0, 3, 1, 2)  # NHWC -> NCHW
-
-                # Apply any additional preprocessing required by the model
-                tensor_videos = [video_tensor]
                 video_inputs = processor.image_processor(
-                    tensor_videos, return_tensors="pt"
+                    videos,
+                    return_tensors="pt",
                 )
-
             except Exception as e:
                 print(f"Failed to process video: {e}")
                 pass
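The "reduce to 1 fps" part of the commit message happens upstream of these hunks, where frames are sampled before being packed into the Video chunk. A minimal sketch of that kind of downsampling (the function and its inputs are assumptions, not repo code):

import numpy as np

def downsample_to_1fps(frames: np.ndarray, source_fps: float) -> np.ndarray:
    # Keep roughly one frame per second by striding through the
    # decoded frames at the source frame rate.
    stride = max(int(round(source_fps)), 1)
    return frames[::stride]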