From 83eadbb2569c531ad396f492f00ef5ce2c96691f Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 23 May 2025 08:33:12 +0000
Subject: [PATCH] fix(neuron): use neuron_config whenever possible

---
 .../text_generation_server/generator.py      | 30 +++++++++++++------
 .../tests/server/test_continuous_batching.py |  4 +--
 backends/neuron/tests/server/test_decode.py  |  2 +-
 backends/neuron/tests/server/test_prefill.py |  8 ++---
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/backends/neuron/server/text_generation_server/generator.py b/backends/neuron/server/text_generation_server/generator.py
index 1e476819..77746512 100644
--- a/backends/neuron/server/text_generation_server/generator.py
+++ b/backends/neuron/server/text_generation_server/generator.py
@@ -344,7 +344,9 @@ class NeuronGenerator(Generator):
         tokenizer.truncation_side = "left"
         self.tokenizer = tokenizer
         self.special_tokens = self.tokenizer.all_special_ids
-        self.slots = [Slot(i, tokenizer) for i in range(self.model.batch_size)]
+        self.slots = [
+            Slot(i, tokenizer) for i in range(self.model.neuron_config.batch_size)
+        ]
         self.batch_id = 0
 
     @property
@@ -368,14 +370,22 @@ class NeuronGenerator(Generator):
             The maximum number of tokens the model supports.
         """
         # Just check that the warmup request parameters match the model capacity
-        batch_size = self.model.batch_size
+        batch_size = self.model.neuron_config.batch_size
         if len(batch.requests) > batch_size:
             raise ValueError(
-                f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
+                f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model.neuron_config.batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
             )
         self.prefill(batch)
         self.clear()
-        return self.model.batch_size * self.model.max_length
+        return (
+            self.model.neuron_config.batch_size
+            * self.model.neuron_config.sequence_length
+        )
+
+    def max_prefill_length(self) -> int:
+        if hasattr(self.model.neuron_config, "max_context_length"):
+            return self.model.neuron_config.max_context_length
+        return self.model.neuron_config.sequence_length
 
     def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
         """Prefill new requests.
@@ -395,7 +405,7 @@ class NeuronGenerator(Generator):
         if len(empty_slots) < len(batch.requests):
             raise ValueError(
                 f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
-                f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
+                f" Please align max_batch_size with the static batch size: {self.model.neuron_config.batch_size}."
             )
         # Assign each request to an empty slot
         logger.debug(
@@ -422,8 +432,10 @@ class NeuronGenerator(Generator):
             inputs.append(slot.cached_text)
             # Apply truncation, making sure we fit into static dimensions
            if slot.truncate == 0:
-                max_length = self.model.max_length
-            elif slot.truncate > max_length and slot.truncate < self.model.max_length:
+                max_length = self.max_prefill_length()
+            elif (
+                slot.truncate > max_length and slot.truncate < self.max_prefill_length()
+            ):
                max_length = slot.truncate
         # Tokenize with padding and truncation
         padded_inputs = self.tokenizer(
@@ -451,7 +463,7 @@ class NeuronGenerator(Generator):
                 slot_input_ids,
                 slot.generation_config,
                 self.model,
-                self.model.max_length,
+                self.model.neuron_config.sequence_length,
                 tokenizer=self.tokenizer,
                 seed=slot.seed,
             )
@@ -602,7 +614,7 @@ class NeuronGenerator(Generator):
 
     def _cached_batch(self, batch_id: int, request_ids: List):
         size = len(request_ids)
-        max_tokens = size * self.model.max_length
+        max_tokens = size * self.model.neuron_config.sequence_length
         return CachedBatch(
             id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
         )
diff --git a/backends/neuron/tests/server/test_continuous_batching.py b/backends/neuron/tests/server/test_continuous_batching.py
index 48bb70cc..3d9ab509 100644
--- a/backends/neuron/tests/server/test_continuous_batching.py
+++ b/backends/neuron/tests/server/test_continuous_batching.py
@@ -9,13 +9,13 @@ def test_continuous_batching_two_requests(neuron_model_config):
     """
     neuron_model_path = neuron_model_config["neuron_model_path"]
     generator = NeuronGenerator.from_pretrained(neuron_model_path)
-    assert generator.model.batch_size > 1
+    assert generator.model.neuron_config.batch_size > 1
     input_text = "Once upon a time"
     max_new_tokens = 20
     # Prefill a single request, remembering the generated token
     tokens = {0: [], 1: []}
     request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens)
-    max_length = generator.model.max_length
+    max_length = generator.model.neuron_config.sequence_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == 1
diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py
index 9db5e3ab..377cbb23 100644
--- a/backends/neuron/tests/server/test_decode.py
+++ b/backends/neuron/tests/server/test_decode.py
@@ -23,7 +23,7 @@ def _test_decode(config_name, generator, do_sample):
     request = create_request(
         id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
     )
-    max_length = generator.model.max_length
+    max_length = generator.model.neuron_config.sequence_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
     # We already generated one token: call decode max_new_tokens - 1 times
diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py
index c0155b1a..371946d9 100644
--- a/backends/neuron/tests/server/test_prefill.py
+++ b/backends/neuron/tests/server/test_prefill.py
@@ -9,7 +9,7 @@ def test_prefill(neuron_model_config):
     neuron_model_path = neuron_model_config["neuron_model_path"]
     generator = NeuronGenerator.from_pretrained(neuron_model_path)
     max_batch_size = 4
-    assert generator.model.batch_size >= max_batch_size
+    assert generator.model.neuron_config.batch_size >= max_batch_size
     for num_requests in [1, max_batch_size]:
         for do_sample in [True, False]:
             mode = "sample" if do_sample else "greedy"
else "greedy" @@ -34,7 +34,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample): ) ) # Let's be pessimistic when estimating max_tokens - max_length = generator.model.max_length + max_length = generator.max_prefill_length() batch = Batch( id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length ) @@ -70,7 +70,7 @@ def test_prefill_truncate(neuron_model_config): config_name = neuron_model_config["name"] neuron_model_path = neuron_model_config["neuron_model_path"] generator = NeuronGenerator.from_pretrained(neuron_model_path) - batch_size = generator.model.batch_size + batch_size = generator.model.neuron_config.batch_size # We apply truncation to all requests but the first one truncate = [ None, @@ -83,7 +83,7 @@ def test_prefill_truncate(neuron_model_config): requests = [] for i in range(batch_size): requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i])) - max_length = generator.model.max_length + max_length = generator.max_prefill_length() batch = Batch( id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length )