fix: enable all CUDA graphs for Qwen2-VL and bump snapshots

2025-07-29 11:20:16 +00:00 · 2025-01-23 15:32:46 +00:00 · 2025-01-23 15:32:46 +00:00 · 5f416f6e28
commit 5f416f6e28
parent eef3c7bdf2
4 changed files with 10 additions and 26 deletions
--- a/integration-tests/models/snapshots/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
+++ b/integration-tests/models/snapshots/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
@ -5,7 +5,7 @@
      "index": 0,
      "logprobs": null,
      "message": {
-        "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance.",
+        "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
@ -13,14 +13,14 @@
      "usage": null
    }
  ],
-  "created": 1737498164,
+  "created": 1737645979,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": {
-    "completion_tokens": 68,
+    "completion_tokens": 58,
    "prompt_tokens": 1364,
-    "total_tokens": 1432
+    "total_tokens": 1422
  }
 }
--- a/integration-tests/models/snapshots/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json
+++ b/integration-tests/models/snapshots/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json
@ -11,7 +11,7 @@
      "logprobs": null
    }
  ],
-  "created": 1737498227,
+  "created": 1737646031,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion.chunk",
--- a/integration-tests/models/test_flash_qwen2_vl.py
+++ b/integration-tests/models/test_flash_qwen2_vl.py
@ -35,7 +35,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):

    assert (
        response.choices[0].message.content
-        == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
+        == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
    )

    assert response == response_snapshot
@ -72,7 +72,7 @@ async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):

    assert (
        generated
-        == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
+        == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
    )
-    assert count == 68
+    assert count == 58
    assert last_response == response_snapshot
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -2056,10 +2056,6 @@ fn main() -> Result<(), LauncherError> {

    let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
    let quantize = config.as_ref().and_then(|c| c.quantize);
-    let model_type = config
-        .as_ref()
-        .and_then(|c| c.model_type.as_deref())
-        .map(|s| s.to_owned());
    // Quantization usually means you're even more RAM constrained.

    let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
@ -2148,20 +2144,8 @@ fn main() -> Result<(), LauncherError> {
            vec![]
        }
        _ => {
-            let default_cuda_graphs = vec![1, 2, 4, 8, 16, 32];
-            tracing::info!("Using default CUDA graphs: {:?}", default_cuda_graphs);
-            let cuda_graphs = match model_type.as_deref() {
-                Some("qwen2_vl") => {
-                    tracing::warn!(
-                        "Qwen VL model detected - restricting CUDA graphs to values >= 3"
-                    );
-                    default_cuda_graphs
-                        .into_iter()
-                        .filter(|&c| c >= 3)
-                        .collect()
-                }
-                _ => default_cuda_graphs,
-            };
+            let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
+            tracing::info!("Using default cuda graphs {cuda_graphs:?}");
            cuda_graphs
        }
    };