mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

commit b013cb4f4a (parent 4c2848b24b)
feat: first draft constraining generation via outlines
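Summary of the change: the batch code now passes the tokenizer into NextTokenChooser, whose token choice is (for now unconditionally) delegated to a new Grammar sampler. Grammar compiles a regular expression into an outlines RegexFSM over the vocabulary and, at each step, masks the logits so that only tokens keeping the output inside the regex can be emitted.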
server/text_generation_server/models/causal_lm.py

@@ -87,7 +87,7 @@ class CausalLMBatch(Batch):
         for i, r in enumerate(pb.requests):
             requests_idx_mapping[r.id] = i
             inputs.append(r.inputs)
-            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device, tokenizer))
             stopping_criteria = StoppingCriteria.from_pb(
                 r.stopping_parameters, tokenizer
             )
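Threading the tokenizer through from_pb is what lets the chooser build the token-level FSM below; previously the tokenizer was only needed here for the stopping criteria.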
server/text_generation_server/utils/tokens.py

@@ -1,5 +1,7 @@
 import re
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple, DefaultDict
+from collections import defaultdict
+import math
 
 import torch
 from text_generation_server.pb import generate_pb2
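The new imports serve the Grammar class added at the bottom of this file: `math` for the `-math.inf` logit mask, `defaultdict` for per-sequence FSM state, and `DefaultDict` for its type annotation (`Callable` is imported but not used in any hunk shown here).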
@@ -18,6 +20,7 @@ from text_generation_server.utils.logits_process import (
 from text_generation_server.utils.watermark import WatermarkLogitsProcessor
 from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor
 
+from outlines.fsm.fsm import RegexFSM
 
 class NextTokenChooser:
     def __init__(
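`RegexFSM` is outlines' compiled token-level automaton for a regular expression; the draft relies on two of its methods, `allowed_token_ids(state)` and `next_state(state, token_id)`, and the import makes `outlines` a new runtime dependency of the Python server.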
@@ -32,6 +35,7 @@ class NextTokenChooser:
         do_sample=False,
         seed=0,
         device="cpu",
+        tokenizer=None,
     ):
         self.watermark_processor = (
             WatermarkLogitsProcessor(device=device) if watermark else None
@@ -46,6 +50,7 @@ class NextTokenChooser:
             if frequency_penalty and frequency_penalty != 0.0
             else None
         )
+        self.tokenizer = tokenizer
 
         has_warpers = (
             (temperature is not None and temperature != 1.0)
@@ -61,7 +66,9 @@ class NextTokenChooser:
             self.static_warper = None
 
         sampling = do_sample or has_warpers
-        self.choice = Sampling(seed, device) if sampling else Greedy()
+        # TODO toggle grammar
+        # self.choice = Sampling(seed, device) if sampling else Greedy()
+        self.choice = Grammar(tokenizer, device)
 
     def __call__(self, input_ids, scores):
         if self.watermark_processor is not None:
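As the `TODO toggle grammar` comment says, this draft routes every request through Grammar and drops sampling entirely. A minimal sketch of the intended toggle, assuming a hypothetical `grammar_regex` constructor argument and a Grammar that accepts the regex (neither is in this commit):

    # Hypothetical toggle -- `grammar_regex` and the third Grammar argument
    # are assumptions, not part of this commit.
    if grammar_regex is not None:
        self.choice = Grammar(tokenizer, device, grammar_regex)
    else:
        self.choice = Sampling(seed, device) if sampling else Greedy()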
@@ -85,6 +92,7 @@ class NextTokenChooser:
         cls,
         pb: generate_pb2.NextTokenChooserParameters,
         device: torch.device,
+        tokenizer: PreTrainedTokenizerBase,
     ) -> "NextTokenChooser":
         return NextTokenChooser(
             watermark=pb.watermark,
@@ -97,6 +105,7 @@ class NextTokenChooser:
             do_sample=pb.do_sample,
             seed=pb.seed,
             device=device,
+            tokenizer=tokenizer,
         )
 
 
@@ -419,6 +428,68 @@ class Greedy:
     def __call__(self, logits):
         return logits.argmax(dim=-1)
 
+# TODO: move this whole thing into the logit_process util and make it a Sampler
+class Grammar:
+    fsm_state: DefaultDict[int, int]
+    fsm: RegexFSM
+
+    def __init__(self, tokenizer, device):
+        # TODO: get regex on init not hardcoded
+        regex_str = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
+
+        # TODO: adapt tokenizer is expensive, we should do it only once
+        # this is a temporary solution
+        tokenizer = self.adapt_tokenizer(tokenizer)
+        fsm = RegexFSM(regex_str, tokenizer)
+        self.fsm = fsm
+        self.fsm_state = defaultdict(int)
+
+    def __call__(self, logits):
+        # TODO: handle seq_id properly
+        seq_id = 0
+
+        if self.fsm_state[seq_id] == -1:
+            return self.fsm_state[seq_id].eos_token_id
+
+        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+        mask = torch.full((logits.shape[-1],), -math.inf, device=logits.device)
+        mask[allowed_tokens] = 0
+        biased_scores = logits + mask
+
+        # greedly pick the token with the highest score
+        greedy = biased_scores.argmax(dim=-1)
+
+        # now update the fsm state
+        self.fsm_state[seq_id] = self.fsm.next_state(
+            self.fsm_state[seq_id], greedy.item()
+        )
+        return greedy
+
+    def adapt_tokenizer(self, tokenizer):
+        """Adapt tokenizer to work with the FSM.
+
+        The API of Outlines tokenizers is slightly different to that of
+        `transformers`. In addition we need to handle the missing spaces to
+        Llama's tokenizer to be able to compile FSMs for this model.
+
+        """
+        tokenizer.vocabulary = tokenizer.get_vocab()
+        tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+        def convert_token_to_string(token: str) -> str:
+            from transformers.file_utils import SPIECE_UNDERLINE
+
+            string = tokenizer.convert_tokens_to_string([token])
+
+            # A hack to handle missing spaces to HF's Llama tokenizers
+            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+                return " " + string
+
+            return string
+
+        tokenizer.convert_token_to_string = convert_token_to_string
+
+        return tokenizer
+
 
 class HeterogeneousSampling:
     r"""
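How the new sampler works: `fsm_state` starts every sequence at state 0 (`defaultdict(int)`), `allowed_token_ids` returns the token ids that are legal continuations from that state, and the additive mask sets every other id to `-inf` before the argmax, so the greedy pick can only be a legal token; `next_state` then advances the automaton, and `-1` marks a terminal state. The hardcoded `regex_str` matches dotted-quad IPv4 addresses (each octet 0-255). For intuition, the masking step in isolation (toy numbers, no real tokenizer or FSM):

    import math
    import torch

    # Toy vocabulary of 5 tokens; suppose the FSM allows only ids 1 and 3.
    logits = torch.tensor([[2.0, 0.5, 3.0, 0.1, 1.0]])
    allowed_tokens = [1, 3]

    mask = torch.full((logits.shape[-1],), -math.inf)
    mask[allowed_tokens] = 0
    biased_scores = logits + mask  # disallowed ids become -inf

    print(biased_scores.argmax(dim=-1))  # tensor([1]): id 2 had the best raw score but is banned

Beyond the TODOs the draft already flags (`seq_id` pinned to 0, so one FSM state is shared across the whole batch, and the tokenizer being re-adapted per chooser), the terminal-state branch is buggy: `self.fsm_state[seq_id]` is a plain `int`, so `.eos_token_id` would raise `AttributeError`. It presumably means the tokenizer's EOS id; a sketch of the fix, assuming the EOS id is captured in `__init__` (not done in this commit):

    # In __init__ (hypothetical): self.eos_token_id = tokenizer.eos_token_id

    if self.fsm_state[seq_id] == -1:
        # FSM finished matching the regex: force EOS instead of crashing.
        return torch.full(
            (logits.shape[0],), self.eos_token_id, dtype=torch.long, device=logits.device
        )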