fix: update all vlm forward args, pass shared libraries to final layer in docker and doc bump

This commit is contained in:
drbh 2024-12-12 22:00:02 +00:00
parent 1d6bf243eb
commit 2ae152a188
7 changed files with 19 additions and 2 deletions

View File

@ -330,6 +330,10 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/loca
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
# Copy the ffmpeg libraries
COPY --from=builder /usr/lib/x86_64-linux-gnu/* /usr/lib/x86_64-linux-gnu-copy/
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/lib/x86_64-linux-gnu-copy"
# AWS Sagemaker compatible image
FROM base AS sagemaker

View File

@ -232,6 +232,10 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/loca
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# Copy the ffmpeg libraries
COPY --from=builder /usr/lib/x86_64-linux-gnu/* /usr/lib/x86_64-linux-gnu-copy/
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/lib/x86_64-linux-gnu-copy"
FROM ${PLATFORM} AS final
ENV ATTENTION=paged
ENV PREFIX_CACHING=0

View File

@ -81,6 +81,8 @@ class PaliGemmaForConditionalGeneration(nn.Module):
image_sizes: Optional[torch.Tensor] = None,
adapter_data: Optional[torch.Tensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_pixel_values: Optional[torch.FloatTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
inputs_embeds = self.text_model.embed_tokens(input_ids)
# TODO This is odd but apparently pali gemma position ids start at 1.

View File

@ -751,6 +751,8 @@ class Idefics2ForConditionalGeneration(nn.Module):
image_sizes: Optional[torch.Tensor] = None,
adapter_data: Optional[torch.Tensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_pixel_values: Optional[torch.FloatTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
):
inputs_embeds = self.text_model.embed_tokens(input_ids)
if pixel_values is not None:

View File

@ -181,6 +181,8 @@ class LlavaNextForConditionalGeneration(nn.Module):
image_sizes: Optional[torch.LongTensor] = None,
adapter_data: Optional[torch.Tensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_pixel_values: Optional[torch.FloatTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
):
inputs_embeds = self.text_model.embed_tokens(input_ids)
if pixel_values is not None and len(pixel_values) > 0:

View File

@ -148,7 +148,8 @@ class MllamaCausalLMBatch(VlmCausalLMBatch):
if image_inputs is not None:
assert len(image_indices) == image_inputs["pixel_values"].shape[0]
-return batch_tokenized_inputs, image_inputs
+video_inputs = None
+return batch_tokenized_inputs, image_inputs, video_inputs
@classmethod
def from_pb_processor(

View File

@ -68,4 +68,6 @@ class PaliGemmaBatch(VlmCausalLMBatch):
image_inputs = new_image_inputs
else:
image_inputs = None
-return batch_tokenized_inputs, image_inputs
+video_inputs = None
+return batch_tokenized_inputs, image_inputs, video_inputs