Move video sampling and resizing into validation; downstream now receives already-sampled, resized frames

2025-09-09 19:34:53 +00:00 · 2024-11-22 19:20:55 +00:00 · 2024-11-22 19:20:55 +00:00 · e65ead12bb
commit e65ead12bb
parent 322165d767
5 changed files with 73 additions and 50 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -267,7 +267,7 @@ version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96"
 dependencies = [
- "bindgen",
+ "bindgen 0.69.5",
 "cc",
 "cmake",
 "dunce",
@ -454,6 +454,24 @@ dependencies = [
 "which",
 ]

+[[package]]
+name = "bindgen"
+version = "0.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
+dependencies = [
+ "bitflags 2.6.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.89",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.5.3"
@ -1237,6 +1255,31 @@ dependencies = [
 "simd-adler32",
 ]

+[[package]]
+name = "ffmpeg-next"
+version = "7.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
+dependencies = [
+ "bitflags 2.6.0",
+ "ffmpeg-sys-next",
+ "libc",
+]
+
+[[package]]
+name = "ffmpeg-sys-next"
+version = "7.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bc3234d0a4b2f7d083699d0860c6c9dd83713908771b60f94a96f8704adfe45"
+dependencies = [
+ "bindgen 0.70.1",
+ "cc",
+ "libc",
+ "num_cpus",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@ -3014,17 +3057,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "outlines-core"
-version = "0.1.0"
-source = "git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#ba10c619fc9bf3c487e43f49bdecb95a24bb465c"
-dependencies = [
- "anyhow",
- "regex",
- "serde-pyobject",
- "serde_json",
-]
-
 [[package]]
 name = "overload"
 version = "0.1.1"
@ -3972,16 +4004,6 @@ dependencies = [
 "serde_derive",
 ]

-[[package]]
-name = "serde-pyobject"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca4b0aad8b225845739a0030a0d5cc2ae949c56a86a7daf9226c7df7c2016d16"
-dependencies = [
- "pyo3",
- "serde",
-]
-
 [[package]]
 name = "serde_cbor"
 version = "0.11.2"
@ -4009,7 +4031,6 @@ version = "1.0.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
 dependencies = [
- "indexmap 2.6.0",
 "itoa",
 "memchr",
 "ryu",
@ -4458,7 +4479,6 @@ dependencies = [
 name = "text-generation-router"
 version = "3.0.2-dev0"
 dependencies = [
- "anyhow",
 "async-stream",
 "async-trait",
 "axum 0.7.9",
@ -4466,6 +4486,7 @@ dependencies = [
 "base64 0.22.1",
 "clap 4.5.21",
 "csv",
+ "ffmpeg-next",
 "futures",
 "futures-util",
 "hf-hub",
@ -4483,7 +4504,6 @@ dependencies = [
 "once_cell",
 "opentelemetry 0.20.0",
 "opentelemetry-otlp",
- "outlines-core",
 "pyo3",
 "rand",
 "regex",
@ -4491,6 +4511,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sysinfo",
+ "tempfile",
 "thiserror",
 "tokenizers",
 "tokio",
--- a/14
+++ b/14
@ -20,6 +20,20 @@ FROM chef AS builder

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
+
+    RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    libavcodec-dev \
+    libavfilter-dev \
+    libavdevice-dev \ 
+    libavformat-dev \
+    libavutil-dev \
+    libswscale-dev \
+    pkg-config \
+    libclang-dev \
+    clang \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@ -440,7 +440,7 @@ impl State {
                                    mimetype: image.mimetype,
                                }),
                                Chunk::Video(video) => client::Chunk::Video(client::Video {
-                                    data: video.data,
+                                    data: video.frames,
                                    mimetype: video.mimetype,
                                }),
                            }),
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@ -8,7 +8,8 @@ authors.workspace = true
 homepage.workspace = true

 [dependencies]
-anyhow = "1"
+ffmpeg-next = "7.1.0"
+tempfile = "3.10.1"
 async-trait = "0.1.74"
 async-stream = "0.3.5"
 axum = { version = "0.7", features = ["json"] }
@ -23,7 +24,6 @@ metrics-exporter-prometheus = { workspace = true }
 nohash-hasher = "0.2.0"
 opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
 opentelemetry-otlp = "0.13.0"
-outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
 rand = "0.8.5"
 reqwest = { version = "0.11.20", features = [] }
 serde = "1.0.188"
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -2,7 +2,7 @@ import torch
 from PIL import Image
 from io import BytesIO

-
+import numpy as np
 from opentelemetry import trace
 from typing import Iterable, Optional, Tuple, List, Type, Dict

@ -252,28 +252,16 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
        video_inputs = None
        if videos:
            try:
-                tensor_videos = []
                video = videos[0]
-                video_buffer = BytesIO(video.data)
-                video, _audio, info = io.read_video(
-                    video_buffer,
-                    start_pts=0.0,
-                    end_pts=None,
-                    pts_unit="sec",
-                    output_format="TCHW",
-                )
-                total_frames, video_fps = video.size(0), info["video_fps"]
-                nframes = smart_nframes(
-                    fps=30,
-                    nframes=None,
-                    min_frames=16,
-                    max_frames=64,
-                    total_frames=total_frames,
-                    video_fps=video_fps,
-                )
-                idx = torch.linspace(0, total_frames - 1, nframes).round().long()
-                video = video[idx]
-                tensor_videos.append(video)
+                # Frames are already sampled and resized
+                frames = [
+                    torch.from_numpy(np.frombuffer(frame, dtype=np.uint8).reshape(video.height, video.width, 3))
+                    for frame in video.frames
+                ]
+                video_tensor = torch.stack(frames).permute(0, 3, 1, 2)  # NHWC -> NCHW
+                
+                # Apply any additional preprocessing required by the model
+                tensor_videos = [video_tensor]
                video_inputs = processor.image_processor(
                    tensor_videos, return_tensors="pt"
                )