Move video sampling and resizing to validation; downstream, we now receive already-sampled and resized frames.

This commit is contained in:
Miquel Farre 2024-11-22 19:20:55 +00:00 committed by drbh
parent 322165d767
commit e65ead12bb
5 changed files with 73 additions and 50 deletions

71
Cargo.lock generated
View File

@ -267,7 +267,7 @@ version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96"
dependencies = [
"bindgen",
"bindgen 0.69.5",
"cc",
"cmake",
"dunce",
@ -454,6 +454,24 @@ dependencies = [
"which",
]
[[package]]
name = "bindgen"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.6.0",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.89",
]
[[package]]
name = "bit-set"
version = "0.5.3"
@ -1237,6 +1255,31 @@ dependencies = [
"simd-adler32",
]
[[package]]
name = "ffmpeg-next"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
dependencies = [
"bitflags 2.6.0",
"ffmpeg-sys-next",
"libc",
]
[[package]]
name = "ffmpeg-sys-next"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bc3234d0a4b2f7d083699d0860c6c9dd83713908771b60f94a96f8704adfe45"
dependencies = [
"bindgen 0.70.1",
"cc",
"libc",
"num_cpus",
"pkg-config",
"vcpkg",
]
[[package]]
name = "fixedbitset"
version = "0.4.2"
@ -3014,17 +3057,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "outlines-core"
version = "0.1.0"
source = "git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#ba10c619fc9bf3c487e43f49bdecb95a24bb465c"
dependencies = [
"anyhow",
"regex",
"serde-pyobject",
"serde_json",
]
[[package]]
name = "overload"
version = "0.1.1"
@ -3972,16 +4004,6 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-pyobject"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca4b0aad8b225845739a0030a0d5cc2ae949c56a86a7daf9226c7df7c2016d16"
dependencies = [
"pyo3",
"serde",
]
[[package]]
name = "serde_cbor"
version = "0.11.2"
@ -4009,7 +4031,6 @@ version = "1.0.133"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
dependencies = [
"indexmap 2.6.0",
"itoa",
"memchr",
"ryu",
@ -4458,7 +4479,6 @@ dependencies = [
name = "text-generation-router"
version = "3.0.2-dev0"
dependencies = [
"anyhow",
"async-stream",
"async-trait",
"axum 0.7.9",
@ -4466,6 +4486,7 @@ dependencies = [
"base64 0.22.1",
"clap 4.5.21",
"csv",
"ffmpeg-next",
"futures",
"futures-util",
"hf-hub",
@ -4483,7 +4504,6 @@ dependencies = [
"once_cell",
"opentelemetry 0.20.0",
"opentelemetry-otlp",
"outlines-core",
"pyo3",
"rand",
"regex",
@ -4491,6 +4511,7 @@ dependencies = [
"serde",
"serde_json",
"sysinfo",
"tempfile",
"thiserror",
"tokenizers",
"tokio",

View File

@ -20,6 +20,20 @@ FROM chef AS builder
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev
RUN apt-get update && apt-get install -y \
ffmpeg \
libavcodec-dev \
libavfilter-dev \
libavdevice-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
pkg-config \
libclang-dev \
clang \
&& rm -rf /var/lib/apt/lists/*
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \

View File

@ -440,7 +440,7 @@ impl State {
mimetype: image.mimetype,
}),
Chunk::Video(video) => client::Chunk::Video(client::Video {
data: video.data,
data: video.frames,
mimetype: video.mimetype,
}),
}),

View File

@ -8,7 +8,8 @@ authors.workspace = true
homepage.workspace = true
[dependencies]
anyhow = "1"
ffmpeg-next = "7.1.0"
tempfile = "3.10.1"
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
@ -23,7 +24,6 @@ metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
rand = "0.8.5"
reqwest = { version = "0.11.20", features = [] }
serde = "1.0.188"

View File

@ -2,7 +2,7 @@ import torch
from PIL import Image
from io import BytesIO
import numpy as np
from opentelemetry import trace
from typing import Iterable, Optional, Tuple, List, Type, Dict
@ -252,28 +252,16 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
video_inputs = None
if videos:
try:
tensor_videos = []
video = videos[0]
video_buffer = BytesIO(video.data)
video, _audio, info = io.read_video(
video_buffer,
start_pts=0.0,
end_pts=None,
pts_unit="sec",
output_format="TCHW",
)
total_frames, video_fps = video.size(0), info["video_fps"]
nframes = smart_nframes(
fps=30,
nframes=None,
min_frames=16,
max_frames=64,
total_frames=total_frames,
video_fps=video_fps,
)
idx = torch.linspace(0, total_frames - 1, nframes).round().long()
video = video[idx]
tensor_videos.append(video)
# Frames are already sampled and resized
frames = [
torch.from_numpy(np.frombuffer(frame, dtype=np.uint8).reshape(video.height, video.width, 3))
for frame in video.frames
]
video_tensor = torch.stack(frames).permute(0, 3, 1, 2) # NHWC -> NCHW
# Apply any additional preprocessing required by the model
tensor_videos = [video_tensor]
video_inputs = processor.image_processor(
tensor_videos, return_tensors="pt"
)