diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt
index fe4e929f..ca2dee93 100644
--- a/integration-tests/requirements.txt
+++ b/integration-tests/requirements.txt
@@ -39,7 +39,7 @@ httpcore==1.0.7
     # via httpx
 httpx==0.28.1
     # via openai
-huggingface-hub==0.30.1
+huggingface-hub==0.29.3
     # via
     #   text-generation-integration-tests (pyproject.toml)
     #   text-generation
diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
index 7c8a6926..84835ab8 100644
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -661,7 +661,7 @@ class BloomModel(BloomPreTrainedModel):
 
         return combined_attention_mask
 
-    def set_inputs_embeds(self, new_embeddings: torch.Tensor):
+    def set_input_embeddings(self, new_embeddings: torch.Tensor):
         self.word_embeddings = new_embeddings
 
     def forward(
diff --git a/server/text_generation_server/models/custom_modeling/mllama.py b/server/text_generation_server/models/custom_modeling/mllama.py
index 7d60c098..be0a4b5d 100644
--- a/server/text_generation_server/models/custom_modeling/mllama.py
+++ b/server/text_generation_server/models/custom_modeling/mllama.py
@@ -959,7 +959,6 @@ class MllamaForConditionalGeneration(nn.Module):
         # XXX: Putting these as optional so that the cuda warmup calls can go through.
         cross_attention_states: Optional[torch.Tensor] = None,
         image_indices=None,
-        inputs_embeds=None,
     ):
         if cross_attention_states is not None:
             seqlen_q = len(image_indices)
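
Note (commentary, not part of the patch): the bloom_modeling.py hunk renames set_inputs_embeds to set_input_embeddings, the accessor name used by the Hugging Face Transformers convention (paired with get_input_embeddings), so that generic code which swaps or resizes the embedding table can find the method by its standard name. A minimal sketch of that call pattern follows; ToyModel, vocab_size, and hidden_size are illustrative assumptions standing in for BloomModel, and the torch.Tensor annotation simply mirrors the signature in the patch:

    import torch
    from torch import nn

    class ToyModel(nn.Module):
        """Hypothetical stand-in mirroring BloomModel's embedding accessors."""

        def __init__(self, vocab_size: int = 16, hidden_size: int = 8):
            super().__init__()
            self.word_embeddings = nn.Embedding(vocab_size, hidden_size)

        def get_input_embeddings(self) -> nn.Embedding:
            return self.word_embeddings

        def set_input_embeddings(self, new_embeddings: torch.Tensor):
            # Same signature as the renamed method in the patch: a single
            # entry point for callers that replace the embedding table.
            self.word_embeddings = new_embeddings

    model = ToyModel()
    model.set_input_embeddings(nn.Embedding(32, 8))  # e.g. after a vocab resize
    assert model.get_input_embeddings().num_embeddings == 32

Under the old name, a caller invoking the standard set_input_embeddings hook would raise AttributeError, which is presumably what this hunk fixes.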