working version

2025-09-18 07:44:53 +00:00 · 2024-12-11 21:08:03 +00:00 · 2024-12-11 21:08:03 +00:00 · 19e1c8da31
commit 19e1c8da31
parent af77a0cadf
4 changed files with 22 additions and 3 deletions
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -83,6 +83,7 @@ impl ChunksToString for Vec<InputChunk> {
                data,
                mimetype,
                width,
                height,
                frames: _,
            })) => {
                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -443,6 +443,7 @@ impl State {
                                    data: video.data,
                                    mimetype: video.mimetype,
                                    width: video.width,
                                    height: video.height,
                                    frames: video.num_frames,
                                }),
                            }),
--- a/proto/v3/generate.proto
+++ b/proto/v3/generate.proto
@@ -74,8 +74,11 @@ message Video {
  /// Video width
  uint32 width = 3;
  /// Video height
  uint32 height = 4;
  /// Total number of frames
-  uint32 frames = 4;
+  uint32 frames = 5;
 }
 message InputChunk {
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -244,7 +244,21 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                        )
                        num_bytes = len(video_frame_buf)
                        bytes_per_frame = num_bytes // chunk.video.frames
-                        height = bytes_per_frame // 3 // chunk.video.width
+                        #height = bytes_per_frame // 3 // chunk.video.width
                        from loguru import logger
                        log_master(
                            logger.info,
                            f"Video buffer size: {len(video_frame_buf)}",
                        )
                        log_master(
                            logger.info,
                            f"Frames in chunk: {chunk.video.frames}",
                        )
                        log_master(
                            logger.info,
                            f"Bytes per frame: {bytes_per_frame}",
                        )
                        # iterate over with a stride the size of a frame
                        frames = []
@@ -252,7 +266,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                            frame = video_frame_buf[
                                i * bytes_per_frame : (i + 1) * bytes_per_frame
                            ]
-                            frame = frame.reshape(height, chunk.video.width, 3)
+                            frame = frame.reshape(chunk.video.height, chunk.video.width, 3)
                            frames.append(frame)
                        video_frame_buf = np.stack(frames)