fix: enable all cuda graphs and bump snapshots

This commit is contained in:
drbh 2025-01-23 15:32:46 +00:00
parent eef3c7bdf2
commit 5f416f6e28
4 changed files with 10 additions and 26 deletions

View File

@@ -5,7 +5,7 @@
"index": 0,
"logprobs": null,
"message": {
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance.",
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
"name": null,
"role": "assistant",
"tool_calls": null
@@ -13,14 +13,14 @@
"usage": null
}
],
"created": 1737498164,
"created": 1737645979,
"id": "",
"model": "Qwen/Qwen2-VL-7B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.0.2-dev0-native",
"usage": {
"completion_tokens": 68,
"completion_tokens": 58,
"prompt_tokens": 1364,
"total_tokens": 1432
"total_tokens": 1422
}
}

View File

@@ -11,7 +11,7 @@
"logprobs": null
}
],
"created": 1737498227,
"created": 1737646031,
"id": "",
"model": "Qwen/Qwen2-VL-7B-Instruct",
"object": "chat.completion.chunk",

View File

@@ -35,7 +35,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
assert (
response.choices[0].message.content
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
)
assert response == response_snapshot
@@ -72,7 +72,7 @@ async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
assert (
generated
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
)
assert count == 68
assert count == 58
assert last_response == response_snapshot

View File

@@ -2056,10 +2056,6 @@ fn main() -> Result<(), LauncherError> {
let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
let quantize = config.as_ref().and_then(|c| c.quantize);
let model_type = config
.as_ref()
.and_then(|c| c.model_type.as_deref())
.map(|s| s.to_owned());
// Quantization usually means you're even more RAM constrained.
let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
@@ -2148,20 +2144,8 @@ fn main() -> Result<(), LauncherError> {
vec![]
}
_ => {
let default_cuda_graphs = vec![1, 2, 4, 8, 16, 32];
tracing::info!("Using default CUDA graphs: {:?}", default_cuda_graphs);
let cuda_graphs = match model_type.as_deref() {
Some("qwen2_vl") => {
tracing::warn!(
"Qwen VL model detected - restricting CUDA graphs to values >= 3"
);
default_cuda_graphs
.into_iter()
.filter(|&c| c >= 3)
.collect()
}
_ => default_cuda_graphs,
};
let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
tracing::info!("Using default cuda graphs {cuda_graphs:?}");
cuda_graphs
}
};