fix(server): fix warpers on CPU

2025-09-10 11:54:52 +00:00 · 2023-06-19 17:44:53 +02:00 · 2023-06-19 17:44:53 +02:00 · 82c9fadefe
commit 82c9fadefe
parent f59fb8b630
2 changed files with 27 additions and 29 deletions
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -230,20 +230,12 @@ def get_model(
        )
    elif model_type == "t5":
        if sharded:
        return T5Sharded(
            model_id,
            revision,
            quantize=quantize,
            trust_remote_code=trust_remote_code,
        )
        else:
            return Seq2SeqLM(
                model_id,
                revision,
                quantize=quantize,
                trust_remote_code=trust_remote_code,
            )
    if sharded:
        raise ValueError("sharded is not supported for AutoModel")
--- a/server/text_generation_server/utils/logits_process.py
+++ b/server/text_generation_server/utils/logits_process.py
@@ -42,6 +42,7 @@ class StaticWarper:
        self.static_next_logprob = None
    def __call__(self, scores):
        if torch.cuda.is_available():
            if self.cuda_graph is None:
                self.static_scores = scores
                self.cuda_graph = torch.cuda.CUDAGraph()
@@ -62,6 +63,11 @@ class StaticWarper:
            return self.static_warped_scores, self.static_next_logprob
        # CPU branch
        for warper in self.warpers:
            scores = warper(None, scores)
        return scores, torch.log_softmax(scores, -1)
@lru_cache(10)
 def static_warper(