From b27749eba70f5309615900c8960a25bd2a4fbcc7 Mon Sep 17 00:00:00 2001
From: drbh <david.richard.holtz@gmail.com>
Date: Fri, 3 Jan 2025 11:01:07 -0500
Subject: [PATCH] fix: make mp4parse/tempfile optional under video, drop dead comments

---
 integration-tests/models/test_flash_qwen2_vl_video.py | 11 +----------
 router/Cargo.toml                                     |  6 +++---
 router/src/validation.rs                              |  2 --
 server/text_generation_server/models/vlm_causal_lm.py |  5 -----
 4 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/integration-tests/models/test_flash_qwen2_vl_video.py b/integration-tests/models/test_flash_qwen2_vl_video.py
index b5b7dc01..79eea3c7 100644
--- a/integration-tests/models/test_flash_qwen2_vl_video.py
+++ b/integration-tests/models/test_flash_qwen2_vl_video.py
@@ -71,14 +71,5 @@ async def test_qwen2_vl_simpl(qwen2_vl, response_snapshot):
                     full_text += response["choices"][0]["delta"]["content"]
                 except json.JSONDecodeError:
                     pass
-    # assert count == 27
-    # assert response.usage == {
-    #     "completion_tokens": 10,
-    #     "prompt_tokens": 50,
-    #     "total_tokens": 60,
-    # }
-    # assert (
-    #     response.choices[0].message.content
-    #     == "In a bustling city, a chicken named Cluck"
-    # )
+
     assert last_response == response_snapshot
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 3d636a18..f35428e9 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -21,6 +21,7 @@ itertools = "0.10"
 jsonschema = { version = "0.17.1", features = ["draft202012"] }
 metrics = { workspace = true }
 metrics-exporter-prometheus = { workspace = true }
+mp4parse = { version = "0.17.0", optional = true }
 nohash-hasher = "0.2.0"
 opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
 outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
@@ -29,7 +30,7 @@ rand = "0.8.5"
 reqwest = { version = "0.11.20", features = [] }
 serde = "1.0.188"
 serde_json = "1.0.107"
-tempfile = "3.10.1"
+tempfile = { version = "3.10.1", optional = true }
 thiserror = "1.0.48"
 tokenizers = { workspace = true }
 tokio = { version = "1.32.0", features = [
@@ -66,7 +67,6 @@ uuid = { version = "1.9.1", default-features = false, features = [
 csv = "1.3.0"
 ureq = "=2.9"
 pyo3 = { workspace = true }
-mp4parse = "0.17.0"
 
 
 [build-dependencies]
@@ -77,4 +77,4 @@ default = ["ngrok"]
 ngrok = ["dep:ngrok"]
 google = []
 kserve = []
-video = ["ffmpeg-next"]
+video = ["ffmpeg-next", "mp4parse", "tempfile"]
diff --git a/router/src/validation.rs b/router/src/validation.rs
index a3775b17..72ab4457 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -21,7 +21,6 @@ use tokio::sync::mpsc;
 use tokio::sync::oneshot;
 use tracing::{instrument, Span};
 use {once_cell::sync::Lazy, regex::Regex};
-// video processing
 
 #[cfg(feature = "video")]
 use ffmpeg_next::{
@@ -772,7 +771,6 @@ fn video_tokens(config: &Config, height: u32, width: u32, sampled_frames: f32) -
     use Config::*;
 
     match config {
-        // TOOD: improve to use the config to better estimate the number of tokens
         Qwen2Vl(_config) => {
             let min_frames = 2_f32;
             let max_frames = 256_f32;
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 2a87a25d..c9b24025 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -80,11 +80,6 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
 
 def video_text_replacement(processor, video_input, config) -> str:
     if config.model_type == "qwen2_vl":
-        # num_pads = video_input['pixel_values'].size(0)
-        # num_pads = 1206
-
-        # import ipdb; ipdb.set_trace()
-        # num_pads = 9556 + 10
         num_pads = video_input.pixel_values.shape[0] // 4
         padding = "<|video_pad|>" * num_pads
         return f"<|vision_start|>{padding}<|vision_end|>"