diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs
index cec820f6..9ced5a83 100644
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                 height: _,
                 frames: _,
             })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                //
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 72ab4457..58ca966c 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -829,15 +829,6 @@ fn prepare_input(
                     tokenizer_query.push_str(&inputs[start..chunk_start]);
                 }
                 let processed_video = match config {
-                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
-                        let default_target_width = 224;
-                        let default_target_height = 224;
-                        fetch_video(
-                            &inputs[chunk_start..chunk_end],
-                            default_target_width,
-                            default_target_height,
-                        )?
-                    }
                     Qwen2Vl(_) => {
                         let target_width = 360;
                         let target_height = 420;
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                 height: _,
                 num_frames: _,
             }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
         });
         output
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index c9b24025..4efcd5a6 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                     images.append([image])
                 elif chunk_type == "video":
                     if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
-                            chunk.video.data, dtype=np.uint8
-                        )
-                        num_bytes = len(video_frame_buf)
-                        bytes_per_frame = num_bytes // chunk.video.frames
-
-                        # iterate over with a stride the size of a frame
-                        frames = []
-                        for i in range(chunk.video.frames):
-                            frame = video_frame_buf[
-                                i * bytes_per_frame : (i + 1) * bytes_per_frame
-                            ]
-                            frame = frame.reshape(
-                                chunk.video.height, chunk.video.width, 3
+                        # reshape via numpy array then convert to torch tensor and permute
+                        frame_nchw_tensor = torch.from_numpy(
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
+                                chunk.video.frames,
+                                chunk.video.height,
+                                chunk.video.width,
+                                3,
                             )
-                            frames.append(frame)
-
-                        video_frame_buf = np.stack(frames)
-                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
-                            0, 3, 1, 2
-                        )
+                        ).permute(0, 3, 1, 2)
                         videos.append(frame_nchw_tensor)
+                    else:
+                        raise RuntimeError(
+                            f"Model type {config.model_type} does not support video"
+                        )
                 else:
                     raise RuntimeError(f"Invalid chunk type {chunk_type}")
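
Note (not part of the patch): the Python hunk above replaces the per-frame slicing loop with a single `reshape` followed by `permute`. A minimal standalone sketch, using made-up frame dimensions, checking that both formulations yield the same NCHW tensor:

```python
import numpy as np
import torch

# Hypothetical frame geometry, chosen only for the demonstration.
frames, height, width = 4, 8, 6
data = np.random.randint(
    0, 256, frames * height * width * 3, dtype=np.uint8
).tobytes()

# Removed approach: slice the flat buffer one frame at a time, stack, then permute.
buf = np.frombuffer(data, dtype=np.uint8)
bytes_per_frame = len(buf) // frames
old = torch.from_numpy(
    np.stack(
        [
            buf[i * bytes_per_frame : (i + 1) * bytes_per_frame].reshape(
                height, width, 3
            )
            for i in range(frames)
        ]
    )
).permute(0, 3, 1, 2)

# New approach: one reshape to (frames, H, W, C), then permute to (frames, C, H, W).
new = torch.from_numpy(
    np.frombuffer(data, dtype=np.uint8).reshape(frames, height, width, 3)
).permute(0, 3, 1, 2)

assert torch.equal(old, new)
assert new.shape == (frames, 3, height, width)
```

The vectorized form avoids building an intermediate Python list of frames and the extra copy from `np.stack`, while producing the same `(frames, 3, height, width)` layout as before.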