diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index 90eb0463..8b43ae4d 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -1013,7 +1013,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
position_ids = position_ids.view(-1, seq_length).long()
no_images = False
-
+
if image_hidden_states is None:
if pixel_values is None and image_embeddings is None:
raise ValueError("Either pixel_values and image_embeddings have to be not-None.")
@@ -1040,7 +1040,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
else:
- no_images = True
+ no_images = False
num_images = pixel_values.shape[1]
image_seq_len = image_hidden_states.shape[1] // num_images
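For context on the one-line change above: a non-None image_hidden_states means the caller passed precomputed image features, so the batch is not imageless. A minimal sketch of the corrected branch (the downstream effect of the flag is paraphrased, not shown in this diff):

# no_images presumably gates whether the gated cross-attention layers
# attend to image features; precomputed features imply images exist.
if image_hidden_states is None:
    ...  # encode pixel_values / image_embeddings into image_hidden_states
else:
    no_images = False  # was wrongly True, flagging the batch as imageless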
diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index bdf1969c..9adc44d1 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -24,35 +24,6 @@ from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sam
tracer = trace.get_tracer(__name__)
-# UTILS
-def base64_to_pil(encoded_image):
- decoded_image = base64.b64decode(encoded_image)
- pil_image = Image.open(BytesIO(decoded_image))
- return pil_image
-
-def im_markdown_to_pil(im_markdown_str):
- pattern = r'!\[\]\(data:image/png;base64,([^\)]+)\)'
- match = re.search(pattern, im_markdown_str)
- img_b64_str = match.group(1)
- return base64_to_pil(img_b64_str)
-
-def split_str_on_im_markdown(string_with_potential_im_markdown):
- """
- Extract from a string (typically the user prompt string) the potential images saved as a base64 representation
- inside a markdown.
- """
- pattern = r'!\[\]\(data:image/png;base64,([^\)]+)\)'
- parts = re.split(pattern, string_with_potential_im_markdown)
- result = []
- for i, part in enumerate(parts):
- if i % 2 == 0:
- result.append(part)
- else:
- img_tag = f'![](data:image/png;base64,{part})'
- result.append(img_tag)
- return result
-
-
@dataclass
class IdeficsCausalLMBatch(Batch):
batch_id: int
@@ -126,8 +97,8 @@ class IdeficsCausalLMBatch(Batch):
r.stopping_parameters, tokenizer
)
stopping_criterias.append(stopping_criteria)
- max_truncation = max(max_truncation, r.truncate) #TODO: understand that
- max_decode_tokens += stopping_criteria.max_new_tokens # TODO: I think it is just the maximum of tokens to generate in the WHOLE batch
+ max_truncation = max(max_truncation, r.truncate)
+ max_decode_tokens += stopping_criteria.max_new_tokens
padding_right_offset = max(
padding_right_offset, stopping_criteria.max_new_tokens
)
@@ -145,16 +116,6 @@ class IdeficsCausalLMBatch(Batch):
)
else:
raise ValueError("Unsupported type of input")
- # I initially wanted to send the images in string base64 but they are too big to send in a consistent way...
- # So resorting to uploading the image to a server and pulling them back
- # splitted_inp = split_str_on_im_markdown(inp)
- # prompts.append(
- # [
- # im_markdown_to_pil(s) if s.startswith('![](') else s
- # for s in splitted_inp
- # if s != ""
- # ]
- # )
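For reference, a short usage sketch of the helpers removed above (the markdown image pattern is the reconstructed one from this diff; the sample prompt and image are illustrative):

import base64
from io import BytesIO
from PIL import Image

# Build the inline-base64 markdown form the removed helpers expected.
image = Image.new("RGB", (4, 4))
buffer = BytesIO()
image.save(buffer, format="PNG")
b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
im_markdown = f"![](data:image/png;base64,{b64})"

# Split a prompt around the image, then recover a PIL image from the match.
prompt = f"User: {im_markdown} Describe this image."
parts = split_str_on_im_markdown(prompt)
# -> ["User: ", "![](data:image/png;base64,...)", " Describe this image."]
pil_image = im_markdown_to_pil(parts[1])  # round-trips back to a PIL image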