working version

This commit is contained in:
Miquel Farre 2024-12-11 21:08:03 +00:00 committed by drbh
parent af77a0cadf
commit 19e1c8da31
4 changed files with 22 additions and 3 deletions

View File

@@ -83,6 +83,7 @@ impl ChunksToString for Vec<InputChunk> {
data, data,
mimetype, mimetype,
width, width,
height,
frames: _, frames: _,
})) => { })) => {
// TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings

View File

@@ -443,6 +443,7 @@ impl State {
data: video.data, data: video.data,
mimetype: video.mimetype, mimetype: video.mimetype,
width: video.width, width: video.width,
height: video.height,
frames: video.num_frames, frames: video.num_frames,
}), }),
}), }),

View File

@@ -74,8 +74,11 @@ message Video {
/// Video width /// Video width
uint32 width = 3; uint32 width = 3;
/// Video height
uint32 height = 4;
/// Total number of frames /// Total number of frames
uint32 frames = 4; uint32 frames = 5;
} }
message InputChunk { message InputChunk {

View File

@@ -244,7 +244,21 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
) )
num_bytes = len(video_frame_buf) num_bytes = len(video_frame_buf)
bytes_per_frame = num_bytes // chunk.video.frames bytes_per_frame = num_bytes // chunk.video.frames
height = bytes_per_frame // 3 // chunk.video.width #height = bytes_per_frame // 3 // chunk.video.width
from loguru import logger
log_master(
logger.info,
f"Video buffer size: {len(video_frame_buf)}",
)
log_master(
logger.info,
f"Frames in chunk: {chunk.video.frames}",
)
log_master(
logger.info,
f"Bytes per frame: {bytes_per_frame}",
)
# iterate over with a stride the size of a frame # iterate over with a stride the size of a frame
frames = [] frames = []
@@ -252,7 +266,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
frame = video_frame_buf[ frame = video_frame_buf[
i * bytes_per_frame : (i + 1) * bytes_per_frame i * bytes_per_frame : (i + 1) * bytes_per_frame
] ]
frame = frame.reshape(height, chunk.video.width, 3) frame = frame.reshape(chunk.video.height, chunk.video.width, 3)
frames.append(frame) frames.append(frame)
video_frame_buf = np.stack(frames) video_frame_buf = np.stack(frames)