Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
fix: improve video processing and update unsupported paths
This commit is contained in:
parent b27749eba7
commit 78cd756caf
```diff
@@ -86,12 +86,9 @@ impl ChunksToString for Vec<InputChunk> {
                 height: _,
                 frames: _,
             })) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+                //
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
             // We don't create empty chunks, so this should be unreachable.
             None => unreachable!("Chunks should never be empty"),
```
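For reference, the removed branch serialized the raw video bytes straight into the prompt as an HTML `<video>` tag with a base64 data URI, which is what the `format!` call above produced. A minimal Python sketch of that markup (the bytes, width, and mimetype below are made-up placeholders):

```python
import base64

# Made-up placeholders; a real chunk would carry actual video bytes and metadata.
data = b"\x00\x01\x02\x03"
mimetype = "video/mp4"
width = 640

encoded = base64.b64encode(data).decode("ascii")
tag = (
    f'<video width="{width}">'
    f'<source src="data:{mimetype};base64,{encoded}" type="{mimetype}">'
    f"</video>"
)
print(tag)
# <video width="640"><source src="data:video/mp4;base64,AAECAw==" type="video/mp4"></video>
```

Embedding whole videos this way is why the old TODO worried about very large base64 strings; the new code short-circuits with `unimplemented!` instead.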
```diff
@@ -829,15 +829,6 @@ fn prepare_input<T: TokenizerTrait>(
                     tokenizer_query.push_str(&inputs[start..chunk_start]);
                 }
                 let processed_video = match config {
-                    Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) => {
-                        let default_target_width = 224;
-                        let default_target_height = 224;
-                        fetch_video(
-                            &inputs[chunk_start..chunk_end],
-                            default_target_width,
-                            default_target_height,
-                        )?
-                    }
                     Qwen2Vl(_) => {
                         let target_width = 360;
                         let target_height = 420;
```
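With the unsupported arms gone, only Qwen2-VL keeps a video target resolution (360x420); the 224x224 defaults used by the other model families are removed. A rough Python sketch of a Qwen2-VL-only size lookup in that spirit (the function name and string keys are hypothetical; the router actually matches on config enum variants):

```python
# Hypothetical helper mirroring the post-change dispatch: only Qwen2-VL
# gets a video target resolution, everything else is treated as unsupported.
QWEN2_VL_VIDEO_TARGET = (360, 420)  # (target_width, target_height) from the hunk above

def video_target_size(model_type: str) -> tuple[int, int]:
    if model_type == "qwen2_vl":
        return QWEN2_VL_VIDEO_TARGET
    raise ValueError(f"Video input is not supported for model type {model_type!r}")

print(video_target_size("qwen2_vl"))  # (360, 420)
```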
```diff
@@ -959,12 +950,9 @@ impl ChunksToString for Vec<Chunk> {
                 height: _,
                 num_frames: _,
             }) => {
-                // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
-                let encoded = STANDARD.encode(data);
-                output.push_str(&format!(
-                    r#"<video width="{}"><source src="data:{};base64,{}" type="{}"></video>"#,
-                    width, mimetype, encoded, mimetype
-                ));
+
+                // TODO: do not support serialization of video data
+                unimplemented!("Video tokens are not supported for this model configuration")
             }
         });
         output
```
```diff
@@ -234,28 +234,20 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                         images.append([image])
                 elif chunk_type == "video":
                     if config.model_type == "qwen2_vl":
-                        video_frame_buf = np.frombuffer(
-                            chunk.video.data, dtype=np.uint8
-                        )
-                        num_bytes = len(video_frame_buf)
-                        bytes_per_frame = num_bytes // chunk.video.frames
-
-                        # iterate over with a stride the size of a frame
-                        frames = []
-                        for i in range(chunk.video.frames):
-                            frame = video_frame_buf[
-                                i * bytes_per_frame : (i + 1) * bytes_per_frame
-                            ]
-                            frame = frame.reshape(
-                                chunk.video.height, chunk.video.width, 3
-                            )
-                            frames.append(frame)
-
-                        video_frame_buf = np.stack(frames)
-                        frame_nchw_tensor = torch.from_numpy(video_frame_buf).permute(
-                            0, 3, 1, 2
-                        )
+                        # reshape via numpy array then convert to torch tensor and permute
+                        frame_nchw_tensor = torch.from_numpy(
+                            np.frombuffer(chunk.video.data, dtype=np.uint8).reshape(
+                                chunk.video.frames,
+                                chunk.video.height,
+                                chunk.video.width,
+                                3,
+                            )
+                        ).permute(0, 3, 1, 2)
                         videos.append(frame_nchw_tensor)
                     else:
                         raise RuntimeError(
                             f"Model type {config.model_type} does not support video"
                         )
                 else:
                     raise RuntimeError(f"Invalid chunk type {chunk_type}")
```
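The Python change replaces the frame-by-frame stride slicing with a single reshape over the whole buffer; both readings of the packed RGB bytes yield the same (frames, height, width, 3) array before the permute to NCHW. A small self-contained check with toy dimensions (the buffer below stands in for chunk.video.data):

```python
import numpy as np
import torch

# Toy stand-in for chunk.video.data: 3 frames of 4x5 RGB pixels, tightly packed uint8.
num_frames, height, width = 3, 4, 5
buf = np.arange(num_frames * height * width * 3, dtype=np.uint8).tobytes()

# Old approach: slice the flat buffer with a per-frame stride, reshape each frame, stack.
flat = np.frombuffer(buf, dtype=np.uint8)
bytes_per_frame = len(flat) // num_frames
stacked = np.stack(
    [
        flat[i * bytes_per_frame : (i + 1) * bytes_per_frame].reshape(height, width, 3)
        for i in range(num_frames)
    ]
)

# New approach: one reshape over the whole buffer.
reshaped = np.frombuffer(buf, dtype=np.uint8).reshape(num_frames, height, width, 3)
assert np.array_equal(stacked, reshaped)

# Either way, the frames are then permuted (frames, H, W, C) -> (frames, C, H, W).
nchw = torch.from_numpy(reshaped.copy()).permute(0, 3, 1, 2)
assert nchw.shape == (num_frames, 3, height, width)
```

In the batch code the reshape is a view over the incoming buffer, so the per-frame loop and the intermediate np.stack allocation disappear.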