mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-07-29 11:20:16 +00:00
fix: enable all cuda graphs and bump snapshots
This commit is contained in:
parent
eef3c7bdf2
commit
5f416f6e28
@ -5,7 +5,7 @@
|
||||
"index": 0,
|
||||
"logprobs": null,
|
||||
"message": {
|
||||
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance.",
|
||||
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
|
||||
"name": null,
|
||||
"role": "assistant",
|
||||
"tool_calls": null
|
||||
@ -13,14 +13,14 @@
|
||||
"usage": null
|
||||
}
|
||||
],
|
||||
"created": 1737498164,
|
||||
"created": 1737645979,
|
||||
"id": "",
|
||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||
"object": "chat.completion",
|
||||
"system_fingerprint": "3.0.2-dev0-native",
|
||||
"usage": {
|
||||
"completion_tokens": 68,
|
||||
"completion_tokens": 58,
|
||||
"prompt_tokens": 1364,
|
||||
"total_tokens": 1432
|
||||
"total_tokens": 1422
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 1737498227,
|
||||
"created": 1737646031,
|
||||
"id": "",
|
||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||
"object": "chat.completion.chunk",
|
||||
|
@ -35,7 +35,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
|
||||
|
||||
assert (
|
||||
response.choices[0].message.content
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||
)
|
||||
|
||||
assert response == response_snapshot
|
||||
@ -72,7 +72,7 @@ async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
|
||||
|
||||
assert (
|
||||
generated
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
|
||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||
)
|
||||
assert count == 68
|
||||
assert count == 58
|
||||
assert last_response == response_snapshot
|
||||
|
@ -2056,10 +2056,6 @@ fn main() -> Result<(), LauncherError> {
|
||||
|
||||
let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
|
||||
let quantize = config.as_ref().and_then(|c| c.quantize);
|
||||
let model_type = config
|
||||
.as_ref()
|
||||
.and_then(|c| c.model_type.as_deref())
|
||||
.map(|s| s.to_owned());
|
||||
// Quantization usually means you're even more RAM constrained.
|
||||
|
||||
let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
|
||||
@ -2148,20 +2144,8 @@ fn main() -> Result<(), LauncherError> {
|
||||
vec![]
|
||||
}
|
||||
_ => {
|
||||
let default_cuda_graphs = vec![1, 2, 4, 8, 16, 32];
|
||||
tracing::info!("Using default CUDA graphs: {:?}", default_cuda_graphs);
|
||||
let cuda_graphs = match model_type.as_deref() {
|
||||
Some("qwen2_vl") => {
|
||||
tracing::warn!(
|
||||
"Qwen VL model detected - restricting CUDA graphs to values >= 3"
|
||||
);
|
||||
default_cuda_graphs
|
||||
.into_iter()
|
||||
.filter(|&c| c >= 3)
|
||||
.collect()
|
||||
}
|
||||
_ => default_cuda_graphs,
|
||||
};
|
||||
let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
|
||||
tracing::info!("Using default cuda graphs {cuda_graphs:?}");
|
||||
cuda_graphs
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user