mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 06:42:10 +00:00
fix: improve video processing and update unsupported paths
This commit is contained in:
parent
b27749eba7
commit
78cd756caf
@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
|
|||||||
height: _,
|
height: _,
|
||||||
frames: _,
|
frames: _,
|
||||||
})) => {
|
})) => {
|
||||||
// TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
|
//
|
||||||
let encoded = STANDARD.encode(data);
|
// TODO: do not support serialization of video data
|
||||||
output.push_str(&format!(
|
unimplemented!("Video tokens are not supported for this model configuration")
|
||||||
r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
|
|
||||||
width, mimetype, encoded, mimetype
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
// We don't create empty chunks, so this should be unreachable.
|
// We don't create empty chunks, so this should be unreachable.
|
||||||
None => unreachable!("Chunks should never be empty"),
|
None => unreachable!("Chunks should never be empty"),
|
||||||
|
@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
|
|||||||
tokenizer_query.push_str(&inputs[start..chunk_start]);
|
tokenizer_query.push_str(&inputs[start..chunk_start]);
|
||||||
}
|
}
|
||||||
let processed_video = match config {
|
let processed_video = match config {
|
||||||
Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
|
|
||||||
let default_target_width = 224;
|
|
||||||
let default_target_height = 224;
|
|
||||||
fetch_video(
|
|
||||||
&inputs[chunk_start..chunk_end],
|
|
||||||
default_target_width,
|
|
||||||
default_target_height,
|
|
||||||
)?
|
|
||||||
}
|
|
||||||
Qwen2Vl(_) => {
|
Qwen2Vl(_) => {
|
||||||
let target_width = 360;
|
let target_width = 360;
|
||||||
let target_height = 420;
|
let target_height = 420;
|
||||||
@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
|
|||||||
height: _,
|
height: _,
|
||||||
num_frames: _,
|
num_frames: _,
|
||||||
}) => {
|
}) => {
|
||||||
// TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
|
|
||||||
let encoded = STANDARD.encode(data);
|
// TODO: do not support serialization of video data
|
||||||
output.push_str(&format!(
|
unimplemented!("Video tokens are not supported for this model configuration")
|
||||||
r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
|
|
||||||
width, mimetype, encoded, mimetype
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
output
|
output
|
||||||
|
@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
|
|||||||
images.append([image])
|
images.append([image])
|
||||||
elif chunk_type == "video":
|
elif chunk_type == "video":
|
||||||
if config.model_type == "qwen2_vl":
|
if config.model_type == "qwen2_vl":
|
||||||
video_frame_buf = np.frombuffer(
|
# reshape via numpy array then convert to torch tensor and permute
|
||||||
chunk.video.data, dtype=np.uint8
|
frame_nchw_tensor = torch.from_numpy(
|
||||||
)
|
np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
|
||||||
num_bytes = len(video_frame_buf)
|
chunk.video.frames,
|
||||||
bytes_per_frame = num_bytes // chunk.video.frames
|
chunk.video.height,
|
||||||
|
chunk.video.width,
|
||||||
# iterate over with a stride the size of a frame
|
3,
|
||||||
frames = []
|
|
||||||
for i in range(chunk.video.frames):
|
|
||||||
frame = video_frame_buf[
|
|
||||||
i * bytes_per_frame : (i + 1) * bytes_per_frame
|
|
||||||
]
|
|
||||||
frame = frame.reshape(
|
|
||||||
chunk.video.height, chunk.video.width, 3
|
|
||||||
)
|
|
||||||
frames.append(frame)
|
|
||||||
|
|
||||||
video_frame_buf = np.stack(frames)
|
|
||||||
frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
|
|
||||||
0, 3, 1, 2
|
|
||||||
)
|
)
|
||||||
|
).permute(0, 3, 1, 2)
|
||||||
videos.append(frame_nchw_tensor)
|
videos.append(frame_nchw_tensor)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Model type {config.model_type} does not support video"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Invalid chunk type {chunk_type}")
|
raise RuntimeError(f"Invalid chunk type {chunk_type}")
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user