diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs
index 55297c92c..7fd010ae7 100644
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -83,6 +83,7 @@ impl ChunksToString for Vec {
                     data,
                     mimetype,
                     width,
+                    height,
                     frames: _,
                 })) => {
                     // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 76b92b487..a5dd6cb51 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -443,6 +443,7 @@ impl State {
                             data: video.data,
                             mimetype: video.mimetype,
                             width: video.width,
+                            height: video.height,
                             frames: video.num_frames,
                         }),
                     }),
diff --git a/proto/v3/generate.proto b/proto/v3/generate.proto
index 3ce4a9dc3..0d707ee92 100644
--- a/proto/v3/generate.proto
+++ b/proto/v3/generate.proto
@@ -74,8 +74,11 @@ message Video {
     /// Video width
     uint32 width = 3;
 
+    /// Video height
+    uint32 height = 4;
+
     /// Total number of frames
-    uint32 frames = 4;
+    uint32 frames = 5;
 }
 
 message InputChunk {
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 9ca4a1437..260f780f2 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -244,15 +244,29 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
             )
             num_bytes = len(video_frame_buf)
             bytes_per_frame = num_bytes // chunk.video.frames
-            height = bytes_per_frame // 3 // chunk.video.width
+            #height = bytes_per_frame // 3 // chunk.video.width
+            from loguru import logger
+            log_master(
+                logger.info,
+                f"Video buffer size: {len(video_frame_buf)}",
+            )
+            log_master(
+                logger.info,
+                f"Frames in chunk: {chunk.video.frames}",
+            )
+            log_master(
+                logger.info,
+                f"Bytes per frame: {bytes_per_frame}",
+            )
+
             # iterate over with a stride the size of a frame
             frames = []
             for i in range(chunk.video.frames):
                 frame = video_frame_buf[
                     i * bytes_per_frame : (i + 1) * bytes_per_frame
                 ]
-                frame = frame.reshape(height, chunk.video.width, 3)
+                frame = frame.reshape(chunk.video.height, chunk.video.width, 3)
                 frames.append(frame)
             video_frame_buf = np.stack(frames)