From cbbc046a79f7fe747f3f5444de24a12e3657f406 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Fri, 28 Apr 2023 11:18:25 -0400
Subject: [PATCH] stuff

---
 Cargo.toml                                         |   4 +-
 Dockerfile                                         |  16 +-
 .../text_generation_server/models/__init__.py      |   7 +-
 .../models/causal_lm.py                            |   6 +
 .../models/vectorized_causal_lm.py                 | 653 ++++++++++++++++++
 server/text_generation_server/utils/tokens.py      |   3 +
 6 files changed, 683 insertions(+), 6 deletions(-)
 create mode 100644 server/text_generation_server/models/vectorized_causal_lm.py

diff --git a/Cargo.toml b/Cargo.toml
index af479c04..376911a5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,10 +3,10 @@ members = [
   "router",
   "router/client",
   "router/grpc-metadata",
-  "launcher"
+  "launcher",
+  "benchmark"
 ]
 exclude = [
-  "benchmark"
 ]
 
 [profile.release]
diff --git a/Dockerfile b/Dockerfile
index c17a9414..ebfc0fab 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY router router
 COPY launcher launcher
+COPY benchmark benchmark
 RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
@@ -28,6 +29,7 @@ COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY router router
 COPY launcher launcher
+COPY benchmark benchmark
 RUN cargo build --release
 
 # Python builder
@@ -127,6 +129,9 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         libssl-dev \
         ca-certificates \
         make \
+        git \
+        git-lfs \
+        vim \
         && rm -rf /var/lib/apt/lists/*
 
 # Copy conda with PyTorch installed
@@ -147,6 +152,7 @@ RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install e
 # Install server
 COPY proto proto
 COPY server server
+COPY benchmark benchmark
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
@@ -157,6 +163,8 @@ RUN cd server && \
 COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
 # Install launcher
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+# Install benchmark
+COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 
 # AWS Sagemaker compatible image
 FROM base as sagemaker
@@ -169,5 +177,11 @@ ENTRYPOINT ["./entrypoint.sh"]
 
 # Final image
 FROM base
+
+
+ENV HUGGINGFACE_HUB_CACHE=/usr/data/.hf_cache/
+ENV PYTHONPATH=/usr/src/server/
+RUN chmod -R 777 /usr
+
 ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
\ No newline at end of file
+CMD ["--json-output"]
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 74a7483e..6f0d0769 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -1,3 +1,4 @@
+import os
 import torch
 
 from loguru import logger
@@ -17,7 +18,7 @@ from text_generation_server.models.gpt_neox import GPTNeoxSharded
 from text_generation_server.models.t5 import T5Sharded
 
 try:
-    if torch.cuda.is_available():
+    if torch.cuda.is_available() and os.environ.get("NO_FLASH_ATTENTION") is None:
         major, minor = torch.cuda.get_device_capability()
         is_sm75 = major == 7 and minor == 5
         is_sm8x = major == 8 and minor >= 0
@@ -101,7 +102,7 @@ def get_model(
     else:
         return Galactica(model_id, revision, quantize=quantize)
 
-    if "bigcode" in model_id:
+    if "bigcode" in model_id and os.environ.get("NO_FAST_MODEL") is None:
         if sharded:
             if not FLASH_ATTENTION:
                 raise NotImplementedError(
@@ -112,7 +113,7 @@
             santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder
             return santacoder_cls(model_id, revision, quantize=quantize)
 
-    config = AutoConfig.from_pretrained(model_id, revision=revision)
+    config = AutoConfig.from_pretrained(model_id, revision=revision, trust_remote_code=True)
     model_type = config.model_type
 
     if model_type == "bloom":
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 7dc7fb85..73fa1930 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from opentelemetry import trace
 from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
 from typing import Optional, Tuple, List, Type, Dict
+from loguru import logger
 
 from text_generation_server.models import Model
 from text_generation_server.models.types import (
@@ -53,6 +54,7 @@ class CausalLMBatch(Batch):
     keys_head_dim_last: bool = True
 
     def to_pb(self) -> generate_pb2.Batch:
+        #logger.info(f"to_pb, id={self.batch_id}, requests={self.requests}, size={len(self)}, max_tokens={self.max_tokens}")
         return generate_pb2.Batch(
             id=self.batch_id,
             requests=self.requests,
@@ -67,6 +69,7 @@ class CausalLMBatch(Batch):
         tokenizer: PreTrainedTokenizerBase,
         device: torch.device,
     ) -> "CausalLMBatch":
+        #logger.info(f"from_pb, pb={pb}, tokenizer={tokenizer}, device={device}")
         inputs = []
         next_token_choosers = []
         stopping_criterias = []
@@ -141,6 +144,7 @@ class CausalLMBatch(Batch):
 
     @tracer.start_as_current_span("filter")
     def filter(self, requests: List[generate_pb2.Request]) -> Optional["CausalLMBatch"]:
+        logger.info(f"filter, requests={requests}")
         if len(requests) == 0:
             raise ValueError("Batch must have at least one request")
         if len(requests) == len(self):
@@ -238,6 +242,7 @@ class CausalLMBatch(Batch):
     @classmethod
     @tracer.start_as_current_span("concatenate")
     def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
+        logger.info(f"concatenate, batches={batches}")
         # Used for padding
         total_batch_size = 0
         max_input_length = 0
@@ -469,6 +474,7 @@ class CausalLM(Model):
             torch_dtype=dtype,
             device_map="auto" if torch.cuda.is_available() else None,
             load_in_8bit=quantize,
+            trust_remote_code=True,
         ).eval()
         tokenizer.pad_token_id = (
             self.model.config.pad_token_id
diff --git a/server/text_generation_server/models/vectorized_causal_lm.py b/server/text_generation_server/models/vectorized_causal_lm.py
new file mode 100644
index 00000000..73fa1930
--- /dev/null
+++ b/server/text_generation_server/models/vectorized_causal_lm.py
@@ -0,0 +1,653 @@
+import torch
+
+from dataclasses import dataclass
+from opentelemetry import trace
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
+from typing import Optional, Tuple, List, Type, Dict
+from loguru import logger
+
+from text_generation_server.models import Model
+from text_generation_server.models.types import (
+    Batch,
+    PrefillTokens,
+    Generation,
+    GeneratedText,
+)
+from text_generation_server.pb import generate_pb2
+from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
+
+tracer = trace.get_tracer(__name__)
+
+
+@dataclass
+class CausalLMBatch(Batch):
+    batch_id: int
+    requests: List[generate_pb2.Request]
+    requests_idx_mapping: Dict[int, int]
+
+    # Decoder values
+    input_ids: torch.Tensor
+    attention_mask: torch.Tensor
+    position_ids: torch.Tensor
+    past_key_values: Optional[List[Tuple]]
+
+    # All tokens
+    all_input_ids: List[torch.Tensor]
+
+    # Lengths of all generations present in the batch
+    input_lengths: List[int]
+    offsets: List[Optional[int]]
+    token_offsets: List[Optional[int]]
+
+    # Generation helpers
+    next_token_choosers: List[NextTokenChooser]
+    stopping_criterias: List[StoppingCriteria]
+
+    # Metadata used for padding
+    max_input_length: int
+    padding_right_offset: int
+
+    # Maximum number of tokens this batch will grow to
+    max_tokens: int
+
+    # Past metadata
+    keys_head_dim_last: bool = True
+
+    def to_pb(self) -> generate_pb2.Batch:
+        #logger.info(f"to_pb, id={self.batch_id}, requests={self.requests}, size={len(self)}, max_tokens={self.max_tokens}")
+        return generate_pb2.Batch(
+            id=self.batch_id,
+            requests=self.requests,
+            size=len(self),
+            max_tokens=self.max_tokens,
+        )
+
+    @classmethod
+    def from_pb(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        device: torch.device,
+    ) -> "CausalLMBatch":
+        #logger.info(f"from_pb, pb={pb}, tokenizer={tokenizer}, device={device}")
+        inputs = []
+        next_token_choosers = []
+        stopping_criterias = []
+        offsets = []
+        token_offsets = []
+        requests_idx_mapping = {}
+
+        # Parse batch
+        max_truncation = 0
+        padding_right_offset = 0
+        max_decode_tokens = 0
+        for i, r in enumerate(pb.requests):
+            requests_idx_mapping[r.id] = i
+            inputs.append(r.inputs)
+            offsets.append(None)
+            token_offsets.append(None)
+            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            stopping_criteria = StoppingCriteria.from_pb(
+                r.stopping_parameters, tokenizer
+            )
+            stopping_criterias.append(stopping_criteria)
+            max_truncation = max(max_truncation, r.truncate)
+            max_decode_tokens += stopping_criteria.max_new_tokens
+            padding_right_offset = max(
+                padding_right_offset, stopping_criteria.max_new_tokens
+            )
+
+        tokenized_inputs = tokenizer(
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            return_token_type_ids=False,
+            truncation=True,
+            max_length=max_truncation,
+        ).to(device)
+
+        input_lengths = tokenized_inputs["attention_mask"].sum(1)
+        max_input_length = input_lengths.max()
+
+        input_ids = tokenized_inputs["input_ids"]
+        # Allocate maximum attention_mask
+        attention_mask = input_ids.new_zeros(
+            (pb.size, max_input_length + padding_right_offset)
+        )
+        # Copy tokenizer attention_mask into fully allocated attention_mask
+        attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]
+
+        position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
+        position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
+        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
+
+        max_tokens = len(inputs) * max_input_length + max_decode_tokens
+
+        return cls(
+            batch_id=pb.id,
+            requests=pb.requests,
+            requests_idx_mapping=requests_idx_mapping,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=None,
+            all_input_ids=list(all_input_ids),
+            input_lengths=input_lengths.tolist(),
+            offsets=offsets,
+            token_offsets=token_offsets,
+            next_token_choosers=next_token_choosers,
+            stopping_criterias=stopping_criterias,
+            max_input_length=max_input_length.item(),
+            padding_right_offset=padding_right_offset,
+            max_tokens=max_tokens,
+        )
+
+    @tracer.start_as_current_span("filter")
+    def filter(self, requests: List[generate_pb2.Request]) -> Optional["CausalLMBatch"]:
+        logger.info(f"filter, requests={requests}")
+        if len(requests) == 0:
+            raise ValueError("Batch must have at least one request")
+        if len(requests) == len(self):
+            return self
+
+        keep_indices = []
+
+        # New values after filtering
+        requests_idx_mapping = {}
+        input_lengths = []
+        offsets = []
+        token_offsets = []
+        all_input_ids = []
+        max_input_length = 0
+
+        next_token_choosers = []
+        stopping_criterias = []
+
+        total_remaining_decode_tokens = 0
+        new_padding_right_offset = 0
+
+        for i, r in enumerate(requests):
+            idx = self.requests_idx_mapping[r.id]
+            requests_idx_mapping[r.id] = i
+            keep_indices.append(idx)
+
+            offsets.append(self.offsets[idx])
+            token_offsets.append(self.token_offsets[idx])
+            all_input_ids.append(self.all_input_ids[idx])
+
+            request_input_length = self.input_lengths[idx]
+            input_lengths.append(request_input_length)
+            max_input_length = max(max_input_length, request_input_length)
+
+            next_token_choosers.append(self.next_token_choosers[idx])
+            stopping_criteria = self.stopping_criterias[idx]
+            stopping_criterias.append(stopping_criteria)
+            remaining_decode_tokens = (
+                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
+            )
+            total_remaining_decode_tokens += remaining_decode_tokens
+            new_padding_right_offset = max(
+                new_padding_right_offset, remaining_decode_tokens
+            )
+
+        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
+        input_ids = self.input_ids[keep_indices]
+        position_ids = self.position_ids[keep_indices]
+        self.attention_mask = self.attention_mask[
+            keep_indices,
+            -(self.padding_right_offset + max_input_length) : (
+                self.attention_mask.shape[1] - self.padding_right_offset
+            )
+            + new_padding_right_offset,
+        ]
+
+        # Ensure that past_key_values tensors can be updated in-place
+        if type(self.past_key_values[0]) == tuple:
+            self.past_key_values = [list(layer) for layer in self.past_key_values]
+
+        # Update tensors in-place to allow incremental garbage collection
+        past_kv_length = max_input_length - 1
+        for layer in self.past_key_values:
+            past_keys, past_values = layer
+            if len(past_keys.shape) == 3:
+                # Force past to be of dim [self_size, num_heads, ...] for easy indexing
+                past_keys = past_keys.view(len(self), -1, *past_keys.shape[-2:])
+                past_values = past_values.view(len(self), -1, *past_values.shape[-2:])
+            if self.keys_head_dim_last:
+                layer[0] = past_keys[keep_indices, :, -past_kv_length:, :]
+            else:
+                layer[0] = past_keys[keep_indices, :, :, -past_kv_length:]
+            del past_keys
+            layer[1] = past_values[keep_indices, :, -past_kv_length:, :]
+            del past_values
+
+        max_tokens = len(requests) * max_input_length + total_remaining_decode_tokens
+
+        self.requests = requests
+        self.requests_idx_mapping = requests_idx_mapping
+        self.input_ids = input_ids
+        self.position_ids = position_ids
+        self.all_input_ids = all_input_ids
+        self.input_lengths = input_lengths
+        self.offsets = offsets
+        self.token_offsets = token_offsets
+        self.next_token_choosers = next_token_choosers
+        self.stopping_criterias = stopping_criterias
+        self.max_input_length = max_input_length
+        self.padding_right_offset = new_padding_right_offset
+        self.max_tokens = max_tokens
+
+        return self
+
+    @classmethod
+    @tracer.start_as_current_span("concatenate")
+    def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
+        logger.info(f"concatenate, batches={batches}")
+        # Used for padding
+        total_batch_size = 0
+        max_input_length = 0
+        padding_right_offset = 0
+        for batch in batches:
+            total_batch_size += len(batch)
+            max_input_length = max(max_input_length, batch.max_input_length)
+            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)
+
+        # Batch attributes
+        requests = []
+        requests_idx_mapping = {}
+        input_lengths = []
+        offsets = []
+        token_offsets = []
+        all_input_ids = []
+        next_token_choosers = []
+        stopping_criterias = []
+        max_tokens = 0
+
+        # Batch tensors
+        input_ids = None
+        attention_mask = None
+        position_ids = None
+        past_key_values = []
+
+        # Used for slicing correctly inside the tensors
+        # Equivalent to a cumsum on batch sizes
+        start_index = 0
+        for i, batch in enumerate(batches):
+            requests.extend(batch.requests)
+            input_lengths.extend(batch.input_lengths)
+            offsets.extend(batch.offsets)
+            token_offsets.extend(batch.token_offsets)
+            all_input_ids.extend(batch.all_input_ids)
+            next_token_choosers.extend(batch.next_token_choosers)
+            stopping_criterias.extend(batch.stopping_criterias)
+
+            if i == 0:
+                requests_idx_mapping = batch.requests_idx_mapping
+            else:
+                # We need to offset the mapping for each batch by the cumulative batch size
+                for k, v in batch.requests_idx_mapping.items():
+                    requests_idx_mapping[k] = v + start_index
+
+            # Slicing end index for this batch
+            end_index = start_index + len(batch)
+
+            # We only concatenate batches that did at least one step
+            if batch.past_key_values is None:
+                raise ValueError("only concatenate prefilled batches")
+
+            # Create empty tensor
+            # input_ids is always of shape [batch_size, 1]
+            # We do not need to pad it
+            if input_ids is None:
+                input_ids = batch.input_ids.new_empty((total_batch_size, 1))
+            # Copy to correct indices
+            input_ids[start_index:end_index] = batch.input_ids
+
+            # Create padded tensor
+            if attention_mask is None:
+                attention_mask = batch.attention_mask.new_zeros(
+                    (total_batch_size, max_input_length + padding_right_offset),
+                )
+
+            # We need to slice the attention mask to remove padding from previous steps
+            # and to remove unused allocated space
+            left_offset = max_input_length - batch.max_input_length
+            batch_left_offset = (
+                batch.attention_mask.shape[1]
+                - batch.max_input_length
+                - batch.padding_right_offset
+            )
+            attention_mask[
+                start_index:end_index,
+                left_offset:-padding_right_offset,
+            ] = batch.attention_mask[
+                :,
+                batch_left_offset : -batch.padding_right_offset,
+            ]
+
+            # Create empty tensor
+            # position_ids is always of shape [batch_size, 1]
+            if position_ids is None:
+                position_ids = batch.position_ids.new_empty((total_batch_size, 1))
+            position_ids[start_index:end_index] = batch.position_ids
+
+            # Shenanigans to get dimensions because BLOOM outputs a past with a different shape
+            # BLOOM Keys:   [batch_size * num_heads, head_dim, seq_length]
+            # BLOOM Values: [batch_size * num_heads, seq_length, head_dim]
+            # And ensure that we can update tensors in-place
+            if type(batch.past_key_values[0]) == tuple:
+                batch.past_key_values = [
+                    [t.view(len(batch), -1, *t.shape[-2:]) for t in layer]
+                    for layer in batch.past_key_values
+                ]
+            elif len(batch.past_key_values[0][0].shape) == 3:
+                for layer in batch.past_key_values:
+                    for k, t in enumerate(layer):
+                        layer[k] = t.view(len(batch), -1, *t.shape[-2:])
+
+            # Add eventual padding tokens that were added while concatenating
+            max_tokens += batch.max_tokens + (
+                max_input_length - batch.max_input_length
+            ) * len(batch)
+
+            start_index = end_index
+
+        first_past_kvs = batches[0].past_key_values
+        _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape
+
+        padded_past_values_shape = (
+            total_batch_size,
+            num_heads,
+            max_input_length - 1,
+            head_dim,
+        )
+
+        if batches[0].keys_head_dim_last:
+            padded_past_keys_shape = padded_past_values_shape
+        else:
+            # seq_length is last for BLOOM
+            padded_past_keys_shape = (
+                total_batch_size,
+                num_heads,
+                head_dim,
+                max_input_length - 1,
+            )
+
+        # Iterate over attention layers
+        # Concatenate past key values layer by layer to allow incremental garbage collection
+        for j in range(len(first_past_kvs)):
+            padded_past_keys = first_past_kvs[j][0].new_zeros(padded_past_keys_shape)
+            start_index = 0
+            for batch in batches:
+                past_keys = batch.past_key_values[j][0]
+                # Clear reference to the original tensor
+                batch.past_key_values[j][0] = None
+
+                # Slicing end index for this batch
+                end_index = start_index + len(batch)
+                # We slice the keys to remove the padding from previous batches
+                past_seq_len = batch.max_input_length - 1
+                if batch.keys_head_dim_last:
+                    padded_past_keys[
+                        start_index:end_index, :, -past_seq_len:, :
+                    ] = past_keys[:, :, -past_seq_len:, :]
+                else:
+                    # BLOOM case
+                    padded_past_keys[
+                        start_index:end_index, :, :, -past_seq_len:
+                    ] = past_keys[:, :, :, -past_seq_len:]
+                del past_keys
+
+                start_index = end_index
+
+            padded_past_values = first_past_kvs[j][1].new_zeros(
+                padded_past_values_shape
+            )
+            start_index = 0
+            for batch in batches:
+                past_values = batch.past_key_values[j][1]
+                # Clear reference to the original tensor
+                batch.past_key_values[j][1] = None
+
+                # Slicing end index for this batch
+                end_index = start_index + len(batch)
+                # We slice the past values to remove the padding from previous batches
+                past_seq_len = batch.max_input_length - 1
+                padded_past_values[
+                    start_index:end_index, :, -past_seq_len:, :
+                ] = past_values[:, :, -past_seq_len:, :]
+                del past_values
+
+                # Update values
+                start_index = end_index
+
+            past_key_values.append([padded_past_keys, padded_past_values])
+
+        return cls(
+            batch_id=batches[0].batch_id,
+            requests=requests,
+            requests_idx_mapping=requests_idx_mapping,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            all_input_ids=all_input_ids,
+            input_lengths=input_lengths,
+            offsets=offsets,
+            token_offsets=token_offsets,
+            next_token_choosers=next_token_choosers,
+            stopping_criterias=stopping_criterias,
+            max_input_length=max_input_length,
+            padding_right_offset=padding_right_offset,
+            keys_head_dim_last=batches[0].keys_head_dim_last,
+            max_tokens=max_tokens,
+        )
+
+    def __len__(self):
+        return len(self.requests)
+
+
+class CausalLM(Model):
+    def __init__(
+        self,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: bool = False,
+        decode_buffer: int = 3,
+    ):
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
+        else:
+            if quantize:
+                raise ValueError("quantization is not available on CPU")
+
+            device = torch.device("cpu")
+            dtype = torch.float32
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, revision=revision, padding_side="left", truncation_side="left"
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=dtype,
+            device_map="auto" if torch.cuda.is_available() else None,
+            load_in_8bit=quantize,
+            trust_remote_code=True,
+        ).eval()
+        tokenizer.pad_token_id = (
+            self.model.config.pad_token_id
+            if self.model.config.pad_token_id is not None
+            else self.model.config.eos_token_id
+        )
+
+        super(CausalLM, self).__init__(
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            decode_buffer=decode_buffer,
+        )
+
+    @property
+    def batch_type(self) -> Type[CausalLMBatch]:
+        return CausalLMBatch
+
+    def decode(self, generated_ids: List[int]) -> str:
+        return self.tokenizer.decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    def forward(
+        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        # Model Forward
+        outputs = self.model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=True,
+        )
+        return outputs.logits, outputs.past_key_values
+
+    @tracer.start_as_current_span("generate_token")
+    def generate_token(
+        self, batch: CausalLMBatch
+    ) -> Tuple[List[Generation], Optional[CausalLMBatch]]:
+        # slice the attention mask to the correct shape
+        attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
+
+        logits, past = self.forward(
+            batch.input_ids,
+            attention_mask,
+            batch.position_ids,
+            batch.past_key_values,
+        )
+
+        # Results
+        generations: List[Generation] = []
+        stopped = True
+
+        # Zipped iterator
+        iterator = zip(
+            batch.requests,
+            batch.input_lengths,
+            batch.offsets,
+            batch.token_offsets,
+            logits,
+            batch.next_token_choosers,
+            batch.stopping_criterias,
+            batch.all_input_ids,
+        )
+
+        # For each member of the batch
+        for i, (
+            request,
+            input_length,
+            offset,
+            token_offset,
+            logits,
+            next_token_chooser,
+            stopping_criteria,
+            all_input_ids,
+        ) in enumerate(iterator):
+            # Select next token
+            next_token_id, logprobs = next_token_chooser(
+                all_input_ids.view(1, -1), logits
+            )
+
+            # Append next token to all tokens
+            all_input_ids = torch.cat([all_input_ids, next_token_id])
+            new_input_length = input_length + 1
+
+            # Generated token
+            next_token_logprob = logprobs[-1, next_token_id]
+            next_token_id_squeezed = next_token_id.squeeze()
+            next_token_text, offset, token_offset = self.decode_token(
+                all_input_ids[:, 0], offset, token_offset
+            )
+
+            # Evaluate stopping criteria
+            stop, reason = stopping_criteria(
+                next_token_id_squeezed,
+                next_token_text,
+            )
+
+            if stop:
+                # Decode generated tokens
+                output_text = self.decode(
+                    all_input_ids[-stopping_criteria.current_tokens :, 0]
+                )
+                # Get seed
+                if isinstance(next_token_chooser.choice, Sampling):
+                    seed = next_token_chooser.choice.seed
+                else:
+                    seed = None
+
+                generated_text = GeneratedText(
+                    output_text, stopping_criteria.current_tokens, reason, seed
+                )
+            else:
+                # Keep request in the batch
+                generated_text = None
+                stopped = False
+
+            # Prefill
+            if stopping_criteria.current_tokens == 1:
+                # Remove generated token to only have prefill and add nan for first prompt token
+                prefill_logprobs = [float("nan")] + logprobs.gather(
+                    1, all_input_ids[1:]
+                ).squeeze(1)[-new_input_length:-1].tolist()
+                prefill_token_ids = all_input_ids[-new_input_length:-1]
+                prefill_texts = self.tokenizer.batch_decode(
+                    prefill_token_ids,
+                    clean_up_tokenization_spaces=False,
+                    skip_special_tokens=False,
+                )
+                prefill_tokens = PrefillTokens(
+                    prefill_token_ids, prefill_logprobs, prefill_texts
+                )
+            else:
+                prefill_tokens = None
+
+            generation = Generation(
+                request.id,
+                prefill_tokens,
+                next_token_id_squeezed,
+                next_token_logprob,
+                next_token_text,
+                next_token_id_squeezed.item() in self.all_special_ids,
+                generated_text,
+            )
+
+            generations.append(generation)
+
+            # Update values
+            batch.input_ids[i, 0] = next_token_id
+            batch.all_input_ids[i] = all_input_ids
+            batch.input_lengths[i] = new_input_length
+            batch.offsets[i] = offset
+            batch.token_offsets[i] = token_offset
+            batch.max_input_length = max(batch.max_input_length, new_input_length)
+
+        # We finished all generations in the batch; there is no next batch
+        if stopped:
+            return generations, None
+
+        # Slice unused values from prefill
+        batch.input_ids = batch.input_ids[:, :1]
+
+        # Update attention_mask as we added a new token to input_ids
+        batch.attention_mask[:, -batch.padding_right_offset] = 1
+        # Decrease right offset
+        batch.padding_right_offset -= 1
+
+        # Update position_ids
+        batch.position_ids = batch.position_ids[:, -1:] + 1
+
+        # Update past key values
+        batch.past_key_values = past
+
+        return generations, batch
diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py
index 23f504c6..08e090b4 100644
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@@ -1,5 +1,7 @@
 import re
 import torch
+from loguru import logger
+
 
 from transformers import (
     LogitsProcessorList,
@@ -47,6 +49,7 @@ class NextTokenChooser:
         seed=0,
         device="cpu",
     ):
+        #logger.info(f"AAAA {watermark} {temperature} {repetition_penalty} {top_k} {top_p} {typical_p} {do_sample} {seed} {device}")
         warpers = LogitsProcessorList()
        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
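
For reference, a minimal, self-contained sketch of the pre-allocated attention-mask scheme that CausalLMBatch.from_pb and CausalLM.generate_token above rely on: the mask is allocated once with padding_right_offset spare slots on the right, and each decode step slices the unused tail off before the forward pass, then claims one slot for the token it just produced. The batch size, prompt lengths, and decode budget below are invented example values, not taken from the patch.

import torch

max_input_length = 4      # longest prompt in the example batch, in tokens
padding_right_offset = 3  # largest max_new_tokens across the example batch

# from_pb allocates the mask once, with room for every token the batch may ever hold
attention_mask = torch.zeros(2, max_input_length + padding_right_offset, dtype=torch.long)
attention_mask[0, 1:max_input_length] = 1  # request 0: 3 prompt tokens, left-padded
attention_mask[1, :max_input_length] = 1   # request 1: 4 prompt tokens

while padding_right_offset > 0:
    # generate_token drops the unused right padding before the forward pass
    step_mask = attention_mask[:, :-padding_right_offset]
    # ... the model forward would consume step_mask here ...
    # then the slot of the token just produced is marked and the offset shrinks
    attention_mask[:, -padding_right_offset] = 1
    padding_right_offset -= 1

print(attention_mask)  # all ones except the initial left padding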