Merge branch 'main' into warmup_gaudi_backend

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-10-19 20:05:24 +00:00 · 2025-04-14 18:24:34 -07:00 · 2025-04-14 18:24:34 -07:00 · 6b21985c95
commit 6b21985c95
parent ba049c9d49 73e797528d
6 changed files with 1059 additions and 843 deletions
--- a/clients/python/poetry.lock
+++ b/clients/python/poetry.lock
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -11,15 +11,15 @@ repository = "https://github.com/huggingface/text-generation-inference"
 [tool.poetry.dependencies]
-python = "^3.7"
+python = "^3.9"
 pydantic = "> 2, < 3"
-aiohttp = "^3.8"
+aiohttp = "^3.11"
 huggingface-hub = ">= 0.12, < 1.0"
-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
-pytest = "^6.2.5"
+pytest = "^8"
-pytest-asyncio = "^0.17.2"
+pytest-asyncio = "^0.26"
-pytest-cov = "^3.0.0"
+pytest-cov = "^6.0.0"
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
--- a/router/src/config.rs
+++ b/router/src/config.rs
@@ -229,10 +229,13 @@ impl Llama4 {
    pub fn pixel_shuffle_ratio(&self) -> f64 {
        self.vision_config.pixel_shuffle_ratio
    }
-    pub fn get_aspect_ratios(&self, height: usize, width: usize) -> (usize, usize) {
+    pub fn get_aspect_ratios(
+        &self,
+        height: usize,
+        width: usize,
+        max_chunks: usize,
+    ) -> (usize, usize) {
        let patch_size = self.vision_config.image_size;
-        // How to avoid hardcoding this?
-        let max_chunks = 15;
        let supported = find_supported_resolutions(max_chunks, patch_size);
        let (target_h, target_w) = get_best_fit(height, width, &supported, false);
        (target_h / patch_size, target_w / patch_size)
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -204,7 +204,7 @@ pub struct Gemma3Processor {
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Llama4Processor {
    #[serde(default)]
-    do_image_splitting: bool,
+    max_patches: usize,
 }
 #[derive(Debug, Clone, Deserialize, Default)]
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -698,10 +698,14 @@ fn image_tokens(
            let image_height = config.image_size();
            let patch_size = config.patch_size();
            let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
+            let max_patches = match preprocessor_config {
+                Some(HubPreprocessorConfig::Llama4Processor(cfg)) => cfg.max_patches,
+                _ => panic!("Expected Llama4Processor in preprocessor_config"),
+            };
            let downsample_ratio =
                (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;
-            let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width);
+            let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width, max_patches);
            let image_width = image_height; // Assuming pixel shape: [H][W][C]
            let num_patches_per_chunk =
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -1041,7 +1041,6 @@ def get_model(
                trust_remote_code=trust_remote_code,
                processor_kwargs={
                    "use_fast": True,
                    "size": {"height": 336, "width": 336},
                },
            )
    elif model_type == BAICHUAN: