diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index ed6d9198..79344ea1 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -422,8 +422,6 @@ class FlashCausalLMBatch(Batch):
         block_tables_tensor = self.block_tables_tensor[indices]
         input_lengths_tensor = self.input_lengths_tensor[indices]
         slots = self.slots[slot_filtering_indices]
-        if slot_indices.max().item() > slots.shape[0]:
-            import ipdb;ipdb.set_trace()
         next_token_chooser = self.next_token_chooser.filter(indices)
         top_n_tokens_tensor = self.top_n_tokens_tensor[indices]
         speculative_ids = self.speculative_ids[indices] if self.speculative_ids is not None else None
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 71b7f48d..a93ccd0e 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -742,9 +742,6 @@ try:
 
             self._update_cos_sin_cache(dtype, position_ids.device, max_s)
 
-            if position_ids.max().item() >= max_s:
-                import ipdb;ipdb.set_trace()
-
             cos = torch.index_select(self._cos_cached, 0, position_ids)
             sin = torch.index_select(self._sin_cached, 0, position_ids)
             # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
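
Note (not part of the diff): both deleted hunks were leftover ipdb breakpoints guarding out-of-range indices, which would hang the server on an interactive debugger if ever hit. If the underlying bounds checks are worth keeping, a minimal sketch of expressing them as assertions instead, mirroring the exact conditions the breakpoints fired on, might look like the following; the helper names check_slot_indices and check_position_ids are hypothetical, not part of text-generation-inference.

import torch

def check_slot_indices(slot_indices: torch.Tensor, slots: torch.Tensor) -> None:
    # Hypothetical helper; mirrors the removed check in
    # FlashCausalLMBatch.filter, where the breakpoint fired when
    # slot_indices.max() > slots.shape[0].
    assert slot_indices.max().item() <= slots.shape[0], (
        f"slot index {slot_indices.max().item()} out of range "
        f"for {slots.shape[0]} slots"
    )

def check_position_ids(position_ids: torch.Tensor, max_s: int) -> None:
    # Hypothetical helper; mirrors the removed check in
    # PositionRotaryEmbedding.get_cos_sin, where the breakpoint fired when
    # position_ids.max() >= max_s, i.e. a position beyond the cached
    # cos/sin tables.
    assert position_ids.max().item() < max_s, (
        f"position id {position_ids.max().item()} exceeds max_s={max_s}"
    )

Assertions like these fail loudly in CI or production rather than blocking on a debugger prompt, which is the rationale for dropping the ipdb calls either way.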