diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index 90eb0463..8b43ae4d 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -1013,7 +1013,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
             position_ids = position_ids.view(-1, seq_length).long()
 
         no_images = False
-        
+
         if image_hidden_states is None:
             if pixel_values is None and image_embeddings is None:
                 raise ValueError("Either pixel_values and image_embeddings have to be not-None.")
@@ -1040,7 +1040,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
             image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
             image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
         else:
-            no_images = True
+            no_images = False
             num_images = pixel_values.shape[1]
             image_seq_len = image_hidden_states.shape[1] // num_images
 
diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index bdf1969c..9adc44d1 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -24,35 +24,6 @@ from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sam
 tracer = trace.get_tracer(__name__)
 
 
-# UTILS
-def base64_to_pil(encoded_image):
-    decoded_image = base64.b64decode(encoded_image)
-    pil_image = Image.open(BytesIO(decoded_image))
-    return pil_image
-
-def im_markdown_to_pil(im_markdown_str):
-    pattern = r''
-    match = re.search(pattern, im_markdown_str)
-    img_b64_str = match.group(1)
-    return base64_to_pil(img_b64_str)
-
-def split_str_on_im_markdown(string_with_potential_im_markdown):
-    """
-    Extract from a string (typically the user prompt string) the potentional images saved as a base64 representation
-    inside a markdown.
-    """
-    pattern = r''
-    parts = re.split(pattern, string_with_potential_im_markdown)
-    result = []
-    for i, part in enumerate(parts):
-        if i % 2 == 0:
-            result.append(part)
-        else:
-            img_tag = f''
-            result.append(img_tag)
-    return result
-
-
 @dataclass
 class IdeficsCausalLMBatch(Batch):
     batch_id: int
@@ -126,8 +97,8 @@ class IdeficsCausalLMBatch(Batch):
                 r.stopping_parameters, tokenizer
             )
             stopping_criterias.append(stopping_criteria)
-            max_truncation = max(max_truncation, r.truncate) #TODO: understand that
-            max_decode_tokens += stopping_criteria.max_new_tokens # TODO: I think it is just the maximum of tokens to generate in the WHOLE batch
+            max_truncation = max(max_truncation, r.truncate)
+            max_decode_tokens += stopping_criteria.max_new_tokens
             padding_right_offset = max(
                 padding_right_offset, stopping_criteria.max_new_tokens
             )
@@ -145,16 +116,6 @@ class IdeficsCausalLMBatch(Batch):
                     )
                 else:
                     raise ValueError("Unsupported type of input")
-            # I initially wanted to send the images in string base64 but they are too big to send in a consistent way...
-            # So resorting to uploading the image to a server and pulling them back
-            # splitted_inp = split_str_on_im_markdown(inp)
-            # prompts.append(
-            #     [
-            #         im_markdown_to_pil(s) if s.startswith('
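
Note on the helpers removed from idefics_causal_lm.py above: the r'' / f'' literals and the truncated startswith(' fragment originally contained inline <img ...> markup that was stripped when this diff was captured, so the exact patterns cannot be recovered from the text. As a condensed, hypothetical sketch only (the <img src="data:image/png;base64,..." /> tag format is an assumption, not the repository's confirmed pattern), the removed logic amounted to roughly:

import base64
import re
from io import BytesIO

from PIL import Image

# Assumed tag format -- the real regex literal was stripped from the diff above.
IMG_PATTERN = r'<img src="data:image/png;base64,([^"]+)" */>'

def base64_to_pil(encoded_image: str) -> Image.Image:
    # Decode a base64 payload into a PIL image.
    return Image.open(BytesIO(base64.b64decode(encoded_image)))

def split_str_on_im_markdown(text: str):
    # re.split with a capturing group keeps the captured base64 payloads at odd
    # indices; even indices are the surrounding plain text of the prompt.
    parts = re.split(IMG_PATTERN, text)
    return [part if i % 2 == 0 else base64_to_pil(part) for i, part in enumerate(parts)]

Per the removed comments, this path was dropped because base64-encoded images were too large to send reliably; images are instead uploaded elsewhere and pulled back by URL.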