Move video sampling and resizing to the validation step; downstream components now receive already-sampled, resized frames.

2025-04-20 22:32:07 +00:00 · 2024-11-22 19:20:55 +00:00 · 2024-11-22 19:20:55 +00:00 · e65ead12bb
commit e65ead12bb
parent 322165d767
5 changed files with 73 additions and 50 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -267,7 +267,7 @@ version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96"
 dependencies = [
- "bindgen",
+ "bindgen 0.69.5",
 "cc",
 "cmake",
 "dunce",
@ -454,6 +454,24 @@ dependencies = [
 "which",
 ]
 [[package]]
 name = "bindgen"
 version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
 "bitflags 2.6.0",
 "cexpr",
 "clang-sys",
 "itertools 0.13.0",
 "proc-macro2",
 "quote",
 "regex",
 "rustc-hash",
 "shlex",
 "syn 2.0.89",
 ]
 [[package]]
 name = "bit-set"
 version = "0.5.3"
@ -1237,6 +1255,31 @@ dependencies = [
 "simd-adler32",
 ]
 [[package]]
 name = "ffmpeg-next"
 version = "7.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
 dependencies = [
 "bitflags 2.6.0",
 "ffmpeg-sys-next",
 "libc",
 ]
 [[package]]
 name = "ffmpeg-sys-next"
 version = "7.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2bc3234d0a4b2f7d083699d0860c6c9dd83713908771b60f94a96f8704adfe45"
 dependencies = [
 "bindgen 0.70.1",
 "cc",
 "libc",
 "num_cpus",
 "pkg-config",
 "vcpkg",
 ]
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@ -3014,17 +3057,6 @@ dependencies = [
 "num-traits",
 ]
 [[package]]
 name = "outlines-core"
 version = "0.1.0"
 source = "git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#ba10c619fc9bf3c487e43f49bdecb95a24bb465c"
 dependencies = [
 "anyhow",
 "regex",
 "serde-pyobject",
 "serde_json",
 ]
 [[package]]
 name = "overload"
 version = "0.1.1"
@ -3972,16 +4004,6 @@ dependencies = [
 "serde_derive",
 ]
 [[package]]
 name = "serde-pyobject"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ca4b0aad8b225845739a0030a0d5cc2ae949c56a86a7daf9226c7df7c2016d16"
 dependencies = [
 "pyo3",
 "serde",
 ]
 [[package]]
 name = "serde_cbor"
 version = "0.11.2"
@ -4009,7 +4031,6 @@ version = "1.0.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
 dependencies = [
 "indexmap 2.6.0",
 "itoa",
 "memchr",
 "ryu",
@ -4458,7 +4479,6 @@ dependencies = [
 name = "text-generation-router"
 version = "3.0.2-dev0"
 dependencies = [
 "anyhow",
 "async-stream",
 "async-trait",
 "axum 0.7.9",
@ -4466,6 +4486,7 @@ dependencies = [
 "base64 0.22.1",
 "clap 4.5.21",
 "csv",
 "ffmpeg-next",
 "futures",
 "futures-util",
 "hf-hub",
@ -4483,7 +4504,6 @@ dependencies = [
 "once_cell",
 "opentelemetry 0.20.0",
 "opentelemetry-otlp",
 "outlines-core",
 "pyo3",
 "rand",
 "regex",
@ -4491,6 +4511,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sysinfo",
 "tempfile",
 "thiserror",
 "tokenizers",
 "tokio",
--- a/14
+++ b/14
@ -20,6 +20,20 @@ FROM chef AS builder
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
    RUN apt-get update && apt-get install -y \
    ffmpeg \
    libavcodec-dev \
    libavfilter-dev \
    libavdevice-dev \ 
    libavformat-dev \
    libavutil-dev \
    libswscale-dev \
    pkg-config \
    libclang-dev \
    clang \
    && rm -rf /var/lib/apt/lists/*
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@ -440,7 +440,7 @@ impl State {
                                    mimetype: image.mimetype,
                                }),
                                Chunk::Video(video) => client::Chunk::Video(client::Video {
-                                    data: video.data,
+                                    data: video.frames,
                                    mimetype: video.mimetype,
                                }),
                            }),
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@ -8,7 +8,8 @@ authors.workspace = true
 homepage.workspace = true
 [dependencies]
-anyhow = "1"
+ffmpeg-next = "7.1.0"
 tempfile = "3.10.1"
 async-trait = "0.1.74"
 async-stream = "0.3.5"
 axum = { version = "0.7", features = ["json"] }
@ -23,7 +24,6 @@ metrics-exporter-prometheus = { workspace = true }
 nohash-hasher = "0.2.0"
 opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
 opentelemetry-otlp = "0.13.0"
 outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
 rand = "0.8.5"
 reqwest = { version = "0.11.20", features = [] }
 serde = "1.0.188"
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -2,7 +2,7 @@ import torch
 from PIL import Image
 from io import BytesIO
-
+import numpy as np
 from opentelemetry import trace
 from typing import Iterable, Optional, Tuple, List, Type, Dict
@ -252,28 +252,16 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
        video_inputs = None
        if videos:
            try:
                tensor_videos = []
                video = videos[0]
-                video_buffer = BytesIO(video.data)
+                # Frames are already sampled and resized
-                video, _audio, info = io.read_video(
+                frames = [
-                    video_buffer,
+                    torch.from_numpy(np.frombuffer(frame, dtype=np.uint8).reshape(video.height, video.width, 3))
-                    start_pts=0.0,
+                    for frame in video.frames
-                    end_pts=None,
+                ]
-                    pts_unit="sec",
+                video_tensor = torch.stack(frames).permute(0, 3, 1, 2)  # NHWC -> NCHW
-                    output_format="TCHW",
+                
-                )
+                # Apply any additional preprocessing required by the model
-                total_frames, video_fps = video.size(0), info["video_fps"]
+                tensor_videos = [video_tensor]
                nframes = smart_nframes(
                    fps=30,
                    nframes=None,
                    min_frames=16,
                    max_frames=64,
                    total_frames=total_frames,
                    video_fps=video_fps,
                )
                idx = torch.linspace(0, total_frames - 1, nframes).round().long()
                video = video[idx]
                tensor_videos.append(video)
                video_inputs = processor.image_processor(
                    tensor_videos, return_tensors="pt"
                )