diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs
index cec820f6..9ced5a83 100644
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                 height: _,
                 frames: _,
             })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                //
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 72ab4457..58ca966c 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -829,15 +829,6 @@ fn prepare_input(
                     tokenizer_query.push_str(&inputs[start..chunk_start]);
                 }
                 let processed_video = match config {
-                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
-                        let default_target_width = 224;
-                        let default_target_height = 224;
-                        fetch_video(
-                            &inputs[chunk_start..chunk_end],
-                            default_target_width,
-                            default_target_height,
-                        )?
-                    }
                     Qwen2Vl(_) => {
                         let target_width = 360;
                         let target_height = 420;
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                 height: _,
                 num_frames: _,
             }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
         });
         output
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index c9b24025..4efcd5a6 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                     images.append([image])
                 elif chunk_type == "video":
                     if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
-                            chunk.video.data, dtype=np.uint8
-                        )
-                        num_bytes = len(video_frame_buf)
-                        bytes_per_frame = num_bytes // chunk.video.frames
-
-                        # iterate over with a stride the size of a frame
-                        frames = []
-                        for i in range(chunk.video.frames):
-                            frame = video_frame_buf[
-                                i * bytes_per_frame : (i + 1) * bytes_per_frame
-                            ]
-                            frame = frame.reshape(
-                                chunk.video.height, chunk.video.width, 3
+                        # reshape via numpy array then convert to torch tensor and permute
+                        frame_nchw_tensor = torch.from_numpy(
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
+                                chunk.video.frames,
+                                chunk.video.height,
+                                chunk.video.width,
+                                3,
                             )
-                            frames.append(frame)
-
-                        video_frame_buf = np.stack(frames)
-                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
-                            0, 3, 1, 2
-                        )
+                        ).permute(0, 3, 1, 2)
                         videos.append(frame_nchw_tensor)
+                    else:
+                        raise RuntimeError(
+                            f"Model type {config.model_type} does not support video"
+                        )
                 else:
                     raise RuntimeError(f"Invalid chunk type {chunk_type}")
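
Note (not part of the patch): the Python hunk above replaces the per-frame slicing loop with a single `reshape` followed by `permute`. A minimal standalone sketch, using made-up frame dimensions, checking that both formulations yield the same NCHW tensor:

```python
import numpy as np
import torch

# Hypothetical frame geometry, chosen only for the demonstration.
frames, height, width = 4, 8, 6
data = np.random.randint(
    0, 256, frames * height * width * 3, dtype=np.uint8
).tobytes()

# Removed approach: slice the flat buffer one frame at a time, stack, then permute.
buf = np.frombuffer(data, dtype=np.uint8)
bytes_per_frame = len(buf) // frames
old = torch.from_numpy(
    np.stack(
        [
            buf[i * bytes_per_frame : (i + 1) * bytes_per_frame].reshape(
                height, width, 3
            )
            for i in range(frames)
        ]
    )
).permute(0, 3, 1, 2)

# New approach: one reshape to (frames, H, W, C), then permute to (frames, C, H, W).
new = torch.from_numpy(
    np.frombuffer(data, dtype=np.uint8).reshape(frames, height, width, 3)
).permute(0, 3, 1, 2)

assert torch.equal(old, new)
assert new.shape == (frames, 3, height, width)
```

The vectorized form avoids building an intermediate Python list of frames and the extra copy from `np.stack`, while producing the same `(frames, 3, height, width)` layout as before.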