Move video sampling and resizing to the validation step; downstream, we receive frames.

This commit is contained in:
Miquel Farre 2024-11-22 19:20:55 +00:00 committed by drbh
parent 322165d767
commit e65ead12bb
5 changed files with 73 additions and 50 deletions

71
Cargo.lock generated
View File

@ -267,7 +267,7 @@ version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96" checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96"
dependencies = [ dependencies = [
"bindgen", "bindgen 0.69.5",
"cc", "cc",
"cmake", "cmake",
"dunce", "dunce",
@ -454,6 +454,24 @@ dependencies = [
"which", "which",
] ]
[[package]]
name = "bindgen"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.6.0",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.89",
]
[[package]] [[package]]
name = "bit-set" name = "bit-set"
version = "0.5.3" version = "0.5.3"
@ -1237,6 +1255,31 @@ dependencies = [
"simd-adler32", "simd-adler32",
] ]
[[package]]
name = "ffmpeg-next"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
dependencies = [
"bitflags 2.6.0",
"ffmpeg-sys-next",
"libc",
]
[[package]]
name = "ffmpeg-sys-next"
version = "7.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bc3234d0a4b2f7d083699d0860c6c9dd83713908771b60f94a96f8704adfe45"
dependencies = [
"bindgen 0.70.1",
"cc",
"libc",
"num_cpus",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "fixedbitset" name = "fixedbitset"
version = "0.4.2" version = "0.4.2"
@ -3014,17 +3057,6 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "outlines-core"
version = "0.1.0"
source = "git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#ba10c619fc9bf3c487e43f49bdecb95a24bb465c"
dependencies = [
"anyhow",
"regex",
"serde-pyobject",
"serde_json",
]
[[package]] [[package]]
name = "overload" name = "overload"
version = "0.1.1" version = "0.1.1"
@ -3972,16 +4004,6 @@ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]]
name = "serde-pyobject"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca4b0aad8b225845739a0030a0d5cc2ae949c56a86a7daf9226c7df7c2016d16"
dependencies = [
"pyo3",
"serde",
]
[[package]] [[package]]
name = "serde_cbor" name = "serde_cbor"
version = "0.11.2" version = "0.11.2"
@ -4009,7 +4031,6 @@ version = "1.0.133"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
dependencies = [ dependencies = [
"indexmap 2.6.0",
"itoa", "itoa",
"memchr", "memchr",
"ryu", "ryu",
@ -4458,7 +4479,6 @@ dependencies = [
name = "text-generation-router" name = "text-generation-router"
version = "3.0.2-dev0" version = "3.0.2-dev0"
dependencies = [ dependencies = [
"anyhow",
"async-stream", "async-stream",
"async-trait", "async-trait",
"axum 0.7.9", "axum 0.7.9",
@ -4466,6 +4486,7 @@ dependencies = [
"base64 0.22.1", "base64 0.22.1",
"clap 4.5.21", "clap 4.5.21",
"csv", "csv",
"ffmpeg-next",
"futures", "futures",
"futures-util", "futures-util",
"hf-hub", "hf-hub",
@ -4483,7 +4504,6 @@ dependencies = [
"once_cell", "once_cell",
"opentelemetry 0.20.0", "opentelemetry 0.20.0",
"opentelemetry-otlp", "opentelemetry-otlp",
"outlines-core",
"pyo3", "pyo3",
"rand", "rand",
"regex", "regex",
@ -4491,6 +4511,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"sysinfo", "sysinfo",
"tempfile",
"thiserror", "thiserror",
"tokenizers", "tokenizers",
"tokio", "tokio",

View File

@ -20,6 +20,20 @@ FROM chef AS builder
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3.11-dev python3.11-dev
RUN apt-get update && apt-get install -y \
ffmpeg \
libavcodec-dev \
libavfilter-dev \
libavdevice-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
pkg-config \
libclang-dev \
clang \
&& rm -rf /var/lib/apt/lists/*
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \

View File

@ -440,7 +440,7 @@ impl State {
mimetype: image.mimetype, mimetype: image.mimetype,
}), }),
Chunk::Video(video) => client::Chunk::Video(client::Video { Chunk::Video(video) => client::Chunk::Video(client::Video {
data: video.data, data: video.frames,
mimetype: video.mimetype, mimetype: video.mimetype,
}), }),
}), }),

View File

@ -8,7 +8,8 @@ authors.workspace = true
homepage.workspace = true homepage.workspace = true
[dependencies] [dependencies]
anyhow = "1" ffmpeg-next = "7.1.0"
tempfile = "3.10.1"
async-trait = "0.1.74" async-trait = "0.1.74"
async-stream = "0.3.5" async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] } axum = { version = "0.7", features = ["json"] }
@ -23,7 +24,6 @@ metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0" nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] } opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0" opentelemetry-otlp = "0.13.0"
outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
rand = "0.8.5" rand = "0.8.5"
reqwest = { version = "0.11.20", features = [] } reqwest = { version = "0.11.20", features = [] }
serde = "1.0.188" serde = "1.0.188"

View File

@ -2,7 +2,7 @@ import torch
from PIL import Image from PIL import Image
from io import BytesIO from io import BytesIO
import numpy as np
from opentelemetry import trace from opentelemetry import trace
from typing import Iterable, Optional, Tuple, List, Type, Dict from typing import Iterable, Optional, Tuple, List, Type, Dict
@ -252,28 +252,16 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
video_inputs = None video_inputs = None
if videos: if videos:
try: try:
tensor_videos = []
video = videos[0] video = videos[0]
video_buffer = BytesIO(video.data) # Frames are already sampled and resized
video, _audio, info = io.read_video( frames = [
video_buffer, torch.from_numpy(np.frombuffer(frame, dtype=np.uint8).reshape(video.height, video.width, 3))
start_pts=0.0, for frame in video.frames
end_pts=None, ]
pts_unit="sec", video_tensor = torch.stack(frames).permute(0, 3, 1, 2) # NHWC -> NCHW
output_format="TCHW",
) # Apply any additional preprocessing required by the model
total_frames, video_fps = video.size(0), info["video_fps"] tensor_videos = [video_tensor]
nframes = smart_nframes(
fps=30,
nframes=None,
min_frames=16,
max_frames=64,
total_frames=total_frames,
video_fps=video_fps,
)
idx = torch.linspace(0, total_frames - 1, nframes).round().long()
video = video[idx]
tensor_videos.append(video)
video_inputs = processor.image_processor( video_inputs = processor.image_processor(
tensor_videos, return_tensors="pt" tensor_videos, return_tensors="pt"
) )