From b27749eba70f5309615900c8960a25bd2a4fbcc7 Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 3 Jan 2025 11:01:07 -0500 Subject: [PATCH] fix: small refactor and cleanups --- integration-tests/models/test_flash_qwen2_vl_video.py | 11 +---------- router/Cargo.toml | 6 +++--- router/src/validation.rs | 2 -- server/text_generation_server/models/vlm_causal_lm.py | 5 ----- 4 files changed, 4 insertions(+), 20 deletions(-) diff --git a/integration-tests/models/test_flash_qwen2_vl_video.py b/integration-tests/models/test_flash_qwen2_vl_video.py index b5b7dc01..79eea3c7 100644 --- a/integration-tests/models/test_flash_qwen2_vl_video.py +++ b/integration-tests/models/test_flash_qwen2_vl_video.py @@ -71,14 +71,5 @@ async def test_qwen2_vl_simpl(qwen2_vl, response_snapshot): full_text += response["choices"][0]["delta"]["content"] except json.JSONDecodeError: pass - # assert count == 27 - # assert response.usage == { - # "completion_tokens": 10, - # "prompt_tokens": 50, - # "total_tokens": 60, - # } - # assert ( - # response.choices[0].message.content - # == "In a bustling city, a chicken named Cluck" - # ) + assert last_response == response_snapshot diff --git a/router/Cargo.toml b/router/Cargo.toml index 3d636a18..f35428e9 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -21,6 +21,7 @@ itertools = "0.10" jsonschema = { version = "0.17.1", features = ["draft202012"] } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } +mp4parse = { version = "0.17.0", optional = true } nohash-hasher = "0.2.0" opentelemetry = { version = "0.20.0", features = ["rt-tokio"] } outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" } @@ -29,7 +30,7 @@ rand = "0.8.5" reqwest = { version = "0.11.20", features = [] } serde = "1.0.188" serde_json = "1.0.107" -tempfile = "3.10.1" +tempfile = { version = "3.10.1", optional = true } thiserror = "1.0.48" tokenizers = { workspace = true } tokio = { version = "1.32.0", features = [ @@ -66,7 +67,6 @@ uuid = { version = "1.9.1", default-features = false, features = [ csv = "1.3.0" ureq = "=2.9" pyo3 = { workspace = true } -mp4parse = "0.17.0" [build-dependencies] @@ -77,4 +77,4 @@ default = ["ngrok"] ngrok = ["dep:ngrok"] google = [] kserve = [] -video = ["ffmpeg-next"] +video = ["ffmpeg-next", "mp4parse", "tempfile"] diff --git a/router/src/validation.rs b/router/src/validation.rs index a3775b17..72ab4457 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -21,7 +21,6 @@ use tokio::sync::mpsc; use tokio::sync::oneshot; use tracing::{instrument, Span}; use {once_cell::sync::Lazy, regex::Regex}; -// video processing #[cfg(feature = "video")] use ffmpeg_next::{ @@ -772,7 +771,6 @@ fn video_tokens(config: &Config, height: u32, width: u32, sampled_frames: f32) - use Config::*; match config { - // TOOD: improve to use the config to better estimate the number of tokens Qwen2Vl(_config) => { let min_frames = 2_f32; let max_frames = 256_f32; diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 2a87a25d..c9b24025 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -80,11 +80,6 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str def video_text_replacement(processor, video_input, config) -> str: if config.model_type == "qwen2_vl": - # num_pads = video_input['pixel_values'].size(0) - # num_pads = 1206 - - # import ipdb; ipdb.set_trace() - # num_pads = 9556 + 10 num_pads = video_input.pixel_values.shape[0] // 4 padding = "<|video_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>"