minor fix

2025-09-10 03:44:54 +00:00 · 2025-04-30 08:55:38 +00:00 · 2025-04-30 08:55:38 +00:00 · d1cf64abc4
commit d1cf64abc4
parent 5cfd4b168a
3 changed files with 2 additions and 3 deletions
--- a/integration-tests/requirements.txt
+++ b/integration-tests/requirements.txt
@@ -39,7 +39,7 @@ httpcore==1.0.7
    # via httpx
 httpx==0.28.1
    # via openai
-huggingface-hub==0.30.1
+huggingface-hub==0.29.3
    # via
    #   text-generation-integration-tests (pyproject.toml)
    #   text-generation
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -661,7 +661,7 @@ class BloomModel(BloomPreTrainedModel):

        return combined_attention_mask

-    def set_inputs_embeds(self, new_embeddings: torch.Tensor):
+    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.word_embeddings = new_embeddings

    def forward(
--- a/server/text_generation_server/models/custom_modeling/mllama.py
+++ b/server/text_generation_server/models/custom_modeling/mllama.py
@@ -959,7 +959,6 @@ class MllamaForConditionalGeneration(nn.Module):
        # XXX: Putting these as optional so that the cuda warmup calls can go through.
        cross_attention_states: Optional[torch.Tensor] = None,
        image_indices=None,
-        inputs_embeds=None,
    ):
        if cross_attention_states is not None:
            seqlen_q = len(image_indices)