From 39d2073e936b417af170908ab4957c2fbd7c251b Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 18 Sep 2024 17:59:13 +0200
Subject: [PATCH] Preprocessing.

---
 .../models/custom_modeling/mllama.py |  8 +-
 .../models/idefics_causal_lm.py      | 86 +++++++++++++------
 2 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/mllama.py b/server/text_generation_server/models/custom_modeling/mllama.py
index 66feb902..ee06b5e7 100644
--- a/server/text_generation_server/models/custom_modeling/mllama.py
+++ b/server/text_generation_server/models/custom_modeling/mllama.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """PyTorch Mllama model."""
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 import torch
 import torch.utils.checkpoint
@@ -23,11 +23,6 @@
 import math
 from transformers.activations import ACT2FN
 import torch.nn.functional as F
-from text_generation_server.models.custom_modeling.vlm import (
-    load_text_model,
-)
-from text_generation_server.layers.attention import Seqlen
-from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
@@ -40,7 +35,6 @@
 from text_generation_server.layers import (
     SpeculativeHead,
     FastLinear,
 )
-from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight
 # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision
diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index c5480952..e5c862cc 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -120,33 +120,67 @@ class IdeficsCausalLMBatch(Batch):
         )
         # TODO Check impact on idefics
-        prompts = []
-        for inp in inputs:
-            # Each input is encoded into a list, where each element of this input list is either a string or a URL
-            prompt = []
-            for chunk in inp:
-                chunk_type = chunk.WhichOneof("chunk")
-                if chunk_type == "text":
-                    prompt.append(chunk.text)
-                elif chunk_type == "image":
-                    image = Image.open(BytesIO(chunk.image.data))
-                    prompt.append(image)
-                else:
-                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
-            prompts.append(prompt)
-        # The processor replaces the call to tokenizer, and
-        # a/ takes care of fetching images from the URL
-        # b/ generate the correct input_ids, attention_mask, pixel_values, image_attention_mask to feed to the model
-        tokenized_inputs = processor(
-            prompts,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=max_truncation,
-            # TODO Check impact on idefics
-            # add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token
-        ).to(device)
+        if config.model_type == "idefics":
+            prompts = []
+            for inp in inputs:
+                # Each input is encoded into a list, where each element of this input list is either a string or a URL
+                prompt = []
+                for chunk in inp:
+                    chunk_type = chunk.WhichOneof("chunk")
+                    if chunk_type == "text":
+                        prompt.append(chunk.text)
+                    elif chunk_type == "image":
+                        image = Image.open(BytesIO(chunk.image.data))
+                        prompt.append(image)
+                    else:
+                        raise RuntimeError(f"Invalid chunk type {chunk_type}")
+                prompts.append(prompt)
+
+            # The processor replaces the call to tokenizer, and
+            # a/ takes care of fetching images from the URL
+            # b/ generate the correct input_ids, attention_mask, pixel_values, image_attention_mask to feed to the model
+            tokenized_inputs = processor(
+                prompts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=max_truncation,
+                # TODO Check impact on idefics
+                # add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token
+            ).to(device)
+        else:
+            images = []
+            texts = []
+            for inp in inputs:
+                # Each input is encoded into a list, where each element of this input list is either a string or a URL
+                curr_images = []
+                curr_text = ""
+                for chunk in inp:
+                    chunk_type = chunk.WhichOneof("chunk")
+                    if chunk_type == "text":
+                        curr_text += chunk.text
+                    elif chunk_type == "image":
+                        image = Image.open(BytesIO(chunk.image.data))
+                        curr_images.append(image)
+                        # TODO unsure about BOS
+                        curr_text += "<|image|><|begin_of_text|>"
+                    else:
+                        raise RuntimeError(f"Invalid chunk type {chunk_type}")
+                images.append(curr_images)
+                texts.append(curr_text)
+
+            # The processor replaces the call to tokenizer, and
+            # a/ takes care of fetching images from the URL
+            # b/ generate the correct input_ids, attention_mask, pixel_values, image_attention_mask to feed to the model
+            tokenized_inputs = processor(
+                images=images,
+                text=texts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=max_truncation,
+            ).to(device)
         for _ in pb.requests:
             input_len = tokenized_inputs["input_ids"].shape[1]
             prefix_offsets.append(
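
Note: the new non-idefics branch above (the mllama path) flattens each request into one text string plus a list of PIL images, appending "<|image|><|begin_of_text|>" for every image chunk (the patch itself flags the BOS part as uncertain), and then hands both lists to the processor in a single call. A minimal standalone sketch of that preprocessing follows; the checkpoint name, the sample image file, and the plain (kind, payload) tuples standing in for the protobuf request chunks are illustrative assumptions, not part of the patch.

from io import BytesIO

from PIL import Image
from transformers import AutoProcessor

# Hypothetical checkpoint; any processor accepting the images=/text= call
# signature used in the patch behaves the same way for this sketch.
processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")

# (kind, payload) tuples stand in for the protobuf chunks the server walks
# with chunk.WhichOneof("chunk"); "cat.png" is a placeholder local file.
requests = [
    [("image", open("cat.png", "rb").read()), ("text", "What is in this picture?")],
]

images, texts = [], []
for chunks in requests:
    curr_images, curr_text = [], ""
    for kind, payload in chunks:
        if kind == "text":
            curr_text += payload
        elif kind == "image":
            curr_images.append(Image.open(BytesIO(payload)))
            # Same marker the patch appends once per image chunk.
            curr_text += "<|image|><|begin_of_text|>"
        else:
            raise RuntimeError(f"Invalid chunk type {kind}")
    images.append(curr_images)
    texts.append(curr_text)

# One text string and one image list per request, so the processor can return
# matching input_ids, attention_mask and pixel_values for the whole batch.
batch = processor(images=images, text=texts, return_tensors="pt", padding=True)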