Merge branch 'habana-main' into 2.3.0

2025-09-11 20:34:54 +00:00 · 2024-11-28 12:38:36 +08:00 · 2024-11-28 12:38:36 +08:00 · b83419a769
commit b83419a769
parent 636cdb4c43 d49ce00f40
2 changed files with 5 additions and 3 deletions
--- a/server/requirements.txt
+++ b/server/requirements.txt
@ -74,7 +74,6 @@ six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.12.1 ; python_version >= "3.9" and python_version < "3.13"
 threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
-torch==2.4.0a0+git74cd574 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
 transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
 transformers[sentencepiece]==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@ -495,8 +495,11 @@ class CausalLMBatch(Batch):
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
  
-        max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests)
        max_input_length = max_truncation
+        if max_input_length < PAD_SEQUENCE_TO_MULTIPLE_OF:
+             max_input_length = PAD_SEQUENCE_TO_MULTIPLE_OF
+        max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests)
+
        # TODO: by tokenizing all inputs at once we lose information on actual input lengths
        # this means that we cannot shift inputs to the left after a long input sequence
        # was filtered out