mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-07-29 19:30:16 +00:00
fix: enable all cuda graphs and bump snapshots
This commit is contained in:
parent
eef3c7bdf2
commit
5f416f6e28
@ -5,7 +5,7 @@
|
|||||||
"index": 0,
|
"index": 0,
|
||||||
"logprobs": null,
|
"logprobs": null,
|
||||||
"message": {
|
"message": {
|
||||||
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance.",
|
"content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
|
||||||
"name": null,
|
"name": null,
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
"tool_calls": null
|
"tool_calls": null
|
||||||
@ -13,14 +13,14 @@
|
|||||||
"usage": null
|
"usage": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1737498164,
|
"created": 1737645979,
|
||||||
"id": "",
|
"id": "",
|
||||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.0.2-dev0-native",
|
"system_fingerprint": "3.0.2-dev0-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 68,
|
"completion_tokens": 58,
|
||||||
"prompt_tokens": 1364,
|
"prompt_tokens": 1364,
|
||||||
"total_tokens": 1432
|
"total_tokens": 1422
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
"logprobs": null
|
"logprobs": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"created": 1737498227,
|
"created": 1737646031,
|
||||||
"id": "",
|
"id": "",
|
||||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
|
@ -35,7 +35,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
|
|||||||
|
|
||||||
assert (
|
assert (
|
||||||
response.choices[0].message.content
|
response.choices[0].message.content
|
||||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
|
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert response == response_snapshot
|
assert response == response_snapshot
|
||||||
@ -72,7 +72,7 @@ async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
|
|||||||
|
|
||||||
assert (
|
assert (
|
||||||
generated
|
generated
|
||||||
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The hitch is in a dynamic pose, with its hands on its hips and legs slightly apart, giving it an imposing stance."
|
== "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape."
|
||||||
)
|
)
|
||||||
assert count == 68
|
assert count == 58
|
||||||
assert last_response == response_snapshot
|
assert last_response == response_snapshot
|
||||||
|
@ -2056,10 +2056,6 @@ fn main() -> Result<(), LauncherError> {
|
|||||||
|
|
||||||
let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
|
let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
|
||||||
let quantize = config.as_ref().and_then(|c| c.quantize);
|
let quantize = config.as_ref().and_then(|c| c.quantize);
|
||||||
let model_type = config
|
|
||||||
.as_ref()
|
|
||||||
.and_then(|c| c.model_type.as_deref())
|
|
||||||
.map(|s| s.to_owned());
|
|
||||||
// Quantization usually means you're even more RAM constrained.
|
// Quantization usually means you're even more RAM constrained.
|
||||||
|
|
||||||
let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
|
let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
|
||||||
@ -2148,20 +2144,8 @@ fn main() -> Result<(), LauncherError> {
|
|||||||
vec![]
|
vec![]
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
let default_cuda_graphs = vec![1, 2, 4, 8, 16, 32];
|
let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
|
||||||
tracing::info!("Using default CUDA graphs: {:?}", default_cuda_graphs);
|
tracing::info!("Using default cuda graphs {cuda_graphs:?}");
|
||||||
let cuda_graphs = match model_type.as_deref() {
|
|
||||||
Some("qwen2_vl") => {
|
|
||||||
tracing::warn!(
|
|
||||||
"Qwen VL model detected - restricting CUDA graphs to values >= 3"
|
|
||||||
);
|
|
||||||
default_cuda_graphs
|
|
||||||
.into_iter()
|
|
||||||
.filter(|&c| c >= 3)
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
_ => default_cuda_graphs,
|
|
||||||
};
|
|
||||||
cuda_graphs
|
cuda_graphs
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user