Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
fix: run linters and fix formatting (#3057)
Parent commit: d7a24c03cf
This commit: b0069e0485
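Every hunk in this commit is a mechanical re-formatting pass: a call, dict, or list whose single-line form exceeded the formatter's line-length limit is re-wrapped so each argument or element sits on its own line, and redundant parentheses and inconsistent quotes are normalized. Behavior is unchanged. As an illustrative sketch only (this snippet is not part of the diff; it simply mirrors the logging pattern that recurs throughout it, using the standard logging module to stay self-contained), the transformation looks like this:

import logging

logger = logging.getLogger(__name__)

# Before: one long call that overflows the formatter's line-length limit.
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")

# After: the same call re-wrapped by the formatter; only the layout changes,
# the emitted log message is identical.
logger.warning(
    "'trust_remote_code' argument is not supported and will be ignored."
)

The affected hunks follow.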
@@ -61,7 +61,9 @@ def serve(
     )

     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )

     # Import here after the logger is added to log potential import exceptions
     from .server import serve
@@ -99,7 +101,9 @@ def download_weights(
     if extension is not None:
         logger.warning("'extension' argument is not supported and will be ignored.")
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
     if auto_convert is not None:
         logger.warning("'auto_convert' argument is not supported and will be ignored.")
     if merge_lora is not None:
@@ -146,7 +146,9 @@ class Slot:
     def generated_tokens(self) -> int:
         return self._generated_tokens

-    def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
+    def assign(
+        self, batch_id: int, request: Request, generation_config: GenerationConfig
+    ):
         """Assign a request to a slot.

         Args:
@@ -174,15 +176,24 @@ class Slot:
         if request.parameters.typical_p != 0:
             self._generation_config.typical_p = request.parameters.typical_p
         if request.parameters.repetition_penalty != 0:
-            self._generation_config.repetition_penalty = request.parameters.repetition_penalty
+            self._generation_config.repetition_penalty = (
+                request.parameters.repetition_penalty
+            )
         self.seed = request.parameters.seed
-        self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
+        self._generation_config.max_new_tokens = (
+            request.stopping_parameters.max_new_tokens
+        )
         self._max_new_tokens = self._generation_config.max_new_tokens
         stop_strings = request.stopping_parameters.stop_sequences
         if stop_strings:
             self._generation_config.stop_strings = stop_strings

-    def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
+    def reset(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        selector: TokenSelector,
+    ):
         """Reset the slot for the next generation.

         Args:
@@ -210,7 +221,9 @@ class Slot:
         self._generated_tokens -= 1
         # Since generated tokens are now part of the prefill, we need to reevaluate
         # max_new_tokens for the next generation
-        self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
+        self._generation_config.max_new_tokens = (
+            self._max_new_tokens - self._generated_tokens
+        )
         self._state = Slot.State.PAUSE

     def resume(self):
@@ -223,7 +236,9 @@ class Slot:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
         # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
+        new_text = self._tokenizer.decode(
+            self._tokens[self._next_text_token_start :], skip_special_tokens=False
+        )
         if new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
@@ -267,7 +282,9 @@ class Slot:
         self._next_text = next_text
         return next_text

-    def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
+    def select(
+        self, input_ids: torch.LongTensor, logits: torch.Tensor
+    ) -> torch.LongTensor:
         """Select the next token from the candidate logits.

         Args:
@@ -384,7 +401,9 @@ class NeuronGenerator(Generator):
                 f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
             )
         # Assign each request to an empty slot
-        logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
+        logger.debug(
+            f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)"
+        )
         new_slots = []
         for request in batch.requests:
             slot = empty_slots.pop()
@@ -417,7 +436,11 @@ class NeuronGenerator(Generator):
                 max_length = slot.truncate
         # Tokenize with padding and truncation
         padded_inputs = self.tokenizer(
-            inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_length,
         )
         input_ids = padded_inputs.input_ids
         attention_mask = padded_inputs.attention_mask
@@ -450,9 +473,13 @@ class NeuronGenerator(Generator):
             slot.reset(slot_input_ids, slot_attention_mask, selector)
         # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
         # as they have already been generated and sent back in the last decode.
-        model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_prefill(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
-        generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
+        generation, next_batch = self._generate_token(
+            prefill_slots, self.batch_id, logits, input_ids
+        )
         self.batch_id += 1
         # Reactivate previously active slots for the next decode
         for i, slot in enumerate(active_slots):
@@ -462,10 +489,14 @@ class NeuronGenerator(Generator):
             slot.append(next_tokens[i])
         logger.debug("Model ready for decoding")
         if next_batch is not None:
-            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
+            logger.debug(
+                f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}"
+            )
         return generation, next_batch

-    def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
+    def decode(
+        self, batches: List[CachedBatch]
+    ) -> Tuple[List[Generation], CachedBatch]:
         """Decode the specified prefilled requests.

         Args:
@@ -491,10 +522,14 @@ class NeuronGenerator(Generator):
                 cleared_request_ids.append(slot.request_id)
                 slot.clear()
         if len(cleared_request_ids) > 0:
-            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
+            logger.info(
+                f"Clearing slot for requests {cleared_request_ids} as they are not requested."
+            )
         active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
         if len(active_slots) < len(request_ids):
-            raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
+            raise ValueError(
+                "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
+            )
         if self.model.continuous_batching:
             decode_slots = active_slots
             seq_ids = torch.tensor([slot.id for slot in decode_slots])
@@ -503,7 +538,9 @@ class NeuronGenerator(Generator):
             seq_ids = None
         # Reconstruct input_ids and attention_mask from decode slots
         n_slots = len(decode_slots)
-        input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
+        input_ids = torch.full(
+            [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64
+        )
         max_length = 0
         for slot in decode_slots:
             max_length = max(max_length, slot.attention_mask.size(-1))
@@ -513,12 +550,18 @@ class NeuronGenerator(Generator):
             # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
             input_ids[i, 0] = slot.next_token
             attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
-        model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_decode(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
         return self._generate_token(decode_slots, next_batch_id, logits, input_ids)

     def _generate_token(
-        self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
+        self,
+        slots: List[Slot],
+        next_batch_id: int,
+        logits: torch.Tensor,
+        input_ids: torch.LongTensor,
     ) -> Tuple[List[Generation], CachedBatch]:
         generations = []
         active_slots = False
@@ -542,9 +585,13 @@ class NeuronGenerator(Generator):
             if finish_reason is not None:
                 # We must include the generated text for each finished sequence in the response
                 generated_text = GeneratedText(
-                    text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
+                    text=slot.generated_text,
+                    generated_tokens=slot.generated_tokens,
+                    finish_reason=finish_reason,
                 )
-                logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
+                logger.debug(
+                    f"Decode complete for request {request_id} with {slot.generated_tokens} tokens"
+                )
                 # mark the slot as available
                 slot.clear()
             else:
@@ -565,7 +612,9 @@ class NeuronGenerator(Generator):
         batch = None
         if active_slots:
             # Whatever initial batch these requests came from, we always return all pending requests in a single batch
-            request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
+            request_ids = [
+                slot.request_id for slot in self.slots if slot.state == Slot.State.READY
+            ]
             batch = self._cached_batch(next_batch_id, request_ids)
         else:
             logger.debug("No more pending requests")
@@ -574,7 +623,9 @@ class NeuronGenerator(Generator):
     def _cached_batch(self, batch_id: int, request_ids: List):
         size = len(request_ids)
         max_tokens = size * self.model.max_length
-        return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)
+        return CachedBatch(
+            id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
+        )

     def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
         """Remove requests that are not listed from the specified batch
@@ -588,7 +639,9 @@ class NeuronGenerator(Generator):
         Return:
             A `CachedBatch` containing the pending requests.
         """
-        keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
+        keep_slot_ids = [
+            slot.id for slot in self.slots if slot.request_id in keep_request_ids
+        ]
         self._clear(keep_slot_ids)
         return self._cached_batch(batch_id, keep_request_ids)

@@ -625,11 +678,19 @@ class NeuronGenerator(Generator):
             export_kwargs = get_export_kwargs_from_env()
             logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
             model = NeuronModelForCausalLM.from_pretrained(
-                model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
+                model_id,
+                revision=revision,
+                low_cpu_mem_usage=True,
+                export=True,
+                **export_kwargs,
             )
         else:
-            logger.info("Loading model on neuron devices (this can take a few minutes).")
-            model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
+            logger.info(
+                "Loading model on neuron devices (this can take a few minutes)."
+            )
+            model = NeuronModelForCausalLM.from_pretrained(
+                model_id, low_cpu_mem_usage=True, revision=revision
+            )
         end = time.time()
         logger.info(f"Model successfully loaded in {end - start:.2f} s.")
         tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
@@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor):
         logger.exception(f"Method {method_name} encountered an error.")

         await context.abort_with_status(
-            rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
+            rpc_status.to_status(
+                status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
+            )
         )
@@ -56,7 +56,9 @@ def log_cache_size():
     if os.path.exists(path):
         usage = shutil.disk_usage(path)
         gb = 2**30
-        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
+        logger.info(
+            f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G"
+        )
     else:
         raise ValueError(f"The cache directory ({path}) does not exist.")

@@ -79,7 +81,9 @@ def fetch_model(
     if not os.path.isdir("/sys/class/neuron_device/"):
         raise SystemError("No neuron cores detected on the host.")
     if os.path.isdir(model_id) and revision is not None:
-        logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
+        logger.warning(
+            "Revision {} ignored for local model at {}".format(revision, model_id)
+        )
         revision = None
     # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
    # Note that the model may already be present in the cache.
@@ -89,12 +93,16 @@ def fetch_model(
        if os.path.isdir(model_id):
            return model_id
        # Prefetch the neuron model from the Hub
-        logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
+        logger.info(
+            f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}"
+        )
        log_cache_size()
        return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
    # Model needs to be exported: look for compatible cached entries on the hub
    export_kwargs = get_export_kwargs_from_env()
-    export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
+    export_config = NeuronModelForCausalLM.get_export_config(
+        model_id, config, revision=revision, **export_kwargs
+    )
    neuron_config = export_config.neuron
    if not is_cached(model_id, neuron_config):
        hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
@@ -105,7 +113,9 @@ def fetch_model(
            f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
        )
        raise ValueError(error_msg)
-    logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
+    logger.warning(
+        f"{model_id} is not a neuron model: it will be exported using cached artifacts."
+    )
    if os.path.isdir(model_id):
        return model_id
    # Prefetch weights, tokenizer and generation config so that they are in cache
backends/neuron/tests/fixtures/model.py (vendored): 49 changed lines
@@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }


 def get_hub_neuron_model_id(config_name: str):
-    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    return (
+        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    )


 def export_model(model_id, export_kwargs, neuron_model_path):
-    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    export_command = [
+        "optimum-cli",
+        "export",
+        "neuron",
+        "-m",
+        model_id,
+        "--task",
+        "text-generation",
+    ]
     for kwarg, value in export_kwargs.items():
         export_command.append(f"--{kwarg}")
         export_command.append(str(value))
@@ -1,5 +1,3 @@
-import os
-
 from argparse import ArgumentParser
 from huggingface_hub import HfApi

@@ -15,7 +13,7 @@ def main():
             delete = True
         else:
             answer = input(f"Do you want to delete {model.id} [y/N] ?")
-            delete = (answer == "y")
+            delete = answer == "y"
         if delete:
             api.delete_repo(model.id)
             print(f"Deleted {model.id}.")
@@ -29,22 +29,42 @@ def create_request(
     )
     stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens)
     return Request(
-        id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters
+        id=id,
+        inputs=inputs,
+        truncate=truncate,
+        parameters=parameters,
+        stopping_parameters=stopping_parameters,
     )


-def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
+def check_prefill(
+    input_text,
+    expected_token_id,
+    expected_token_text,
+    do_sample,
+    batch_size,
+    model_path,
+):
     """Verify that a prefill for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
     assert generator.model.batch_size >= batch_size
     requests = []
     max_new_tokens = 20
     for i in range(batch_size):
-        requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=0,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     batch_size * (len(input_text) + max_new_tokens)
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
     assert tokens.texts == [expected_token_text]


-def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path):
+def check_decode_single(
+    input_text, max_new_tokens, generated_text, do_sample, model_path
+):
     """Verify that a decoding for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
@@ -16,9 +16,13 @@ def test_decode(neuron_model_config):


 def _test_decode(config_name, generator, do_sample):
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     max_new_tokens = 20
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
@@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text):
     slot.assign(0, request, GenerationConfig())
     assert slot.cached_text == input_text

-    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
+    inputs = tokenizer(
+        input_text,
+        padding="max_length",
+        max_length=len(input_text) + 1,
+        return_tensors="pt",
+    )
     input_ids = inputs["input_ids"][0]
     attention_mask = inputs["attention_mask"][0]
     generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]
@@ -21,12 +21,23 @@ def test_prefill(neuron_model_config):
 def _test_prefill(config_name, generator, batch_size, do_sample):
     requests = []
     max_new_tokens = 20
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     for i in range(batch_size):
-        requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=i,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config):
     for i in range(batch_size):
         requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, _ = generator.prefill(batch)
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
@@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version

 logger = logging.getLogger(__name__)

-tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
+tgi_router_env_vars = [
+    "MAX_BATCH_SIZE",
+    "MAX_TOTAL_TOKENS",
+    "MAX_INPUT_TOKENS",
+    "MAX_BATCH_PREFILL_TOKENS",
+]
 tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]

 env_config_peering = [
@@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
         argv = sys.argv
     # All these are params passed to tgi and intercepted here
     parser.add_argument(
-        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+        "--max-input-tokens",
+        type=int,
+        default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)),
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)
+    )
+    parser.add_argument(
+        "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)
+    )
+    parser.add_argument(
+        "--max-batch-prefill-tokens",
+        type=int,
+        default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0),
     )
-    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
-    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
-    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
     parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
     parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

     args = parser.parse_known_args(argv)[0]

     if not args.model_id:
-        raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
+        raise Exception(
+            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
+        )

     # Override env with cmdline params
     os.environ["MODEL_ID"] = args.model_id
@@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config):
         f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
         max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
         if not max_batch_prefill_tokens:
-            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
+            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(
+                max_input_tokens
+            )
         f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))


@@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary):
     return -dictionary["num_cores"], -dictionary["batch_size"]


-def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
+def lookup_compatible_cached_model(
+    model_id: str, revision: Optional[str]
+) -> Optional[Dict[str, Any]]:
     # Reuse the same mechanic as the one in use to configure the tgi server part
     # The only difference here is that we stay as flexible as possible on the compatibility part
     entries = get_hub_cached_entries(model_id, "inference")

-    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)
+    logger.debug(
+        "Found %d cached entries for model %s, revision %s",
+        len(entries),
+        model_id,
+        revision,
+    )

     all_compatible = []
     for entry in entries:
-        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
+        if check_env_and_neuron_config_compatibility(
+            entry, check_compiler_version=True
+        ):
             all_compatible.append(entry)

     if not all_compatible:
@@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
             return entry


-def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
+def check_env_and_neuron_config_compatibility(
+    neuron_config: Dict[str, Any], check_compiler_version: bool
+) -> bool:
     logger.debug(
         "Checking the provided neuron config %s is compatible with the local setup and provided environment",
         neuron_config,
@@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:

     # Local setup compat checks
     if neuron_config["num_cores"] > available_cores:
-        logger.debug("Not enough neuron cores available to run the provided neuron config")
+        logger.debug(
+            "Not enough neuron cores available to run the provided neuron config"
+        )
         return False

-    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
+    if (
+        check_compiler_version
+        and neuron_config["compiler_version"] != neuronxcc_version
+    ):
         logger.debug(
             "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
             neuronxcc_version,
@@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
         )
         return False

-    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
+    max_input_tokens = int(
+        os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+    )
     if max_input_tokens > 0:
         sequence_length = neuron_config["sequence_length"]
         if max_input_tokens >= sequence_length:
@@ -191,7 +228,10 @@ def main():
         if not os.getenv(env_var):
             break
     else:
-        logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars)
+        logger.info(
+            "All env vars %s already set, skipping, user know what they are doing",
+            env_vars,
+        )
         sys.exit(0)

     cache_dir = constants.HF_HUB_CACHE
@@ -201,7 +241,9 @@ def main():
     config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
     neuron_config = getattr(config, "neuron", None)
     if neuron_config is not None:
-        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
+        compatible = check_env_and_neuron_config_compatibility(
+            neuron_config, check_compiler_version=False
+        )
         if not compatible:
             env_dict = get_env_dict()
             msg = (
@@ -213,9 +255,9 @@ def main():
         neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

     if not neuron_config:
-        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
-            get_env_dict(), available_cores, neuronxcc_version
-        )
+        msg = (
+            "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
+        ).format(get_env_dict(), available_cores, neuronxcc_version)
         logger.error(msg)
         raise Exception(msg)

@@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items):
     def skip_release(item):
         if "release" in item.keywords:
             item.add_marker(pytest.mark.skip(reason="need --release option to run"))

     selectors.append(skip_release)
     if config.getoption("--neuron"):

         def skip_not_neuron(item):
             if "neuron" not in item.keywords:
-                item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option"))
+                item.add_marker(
+                    pytest.mark.skip(reason="incompatible with --neuron option")
+                )

         selectors.append(skip_not_neuron)
     else:

         def skip_neuron(item):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))

         selectors.append(skip_neuron)
     for item in items:
         for selector in selectors:
@@ -30,44 +30,74 @@ logger = logging.getLogger(__file__)
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "unsloth/Llama-3.2-1B-Instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }


 def get_neuron_backend_hash():
     import subprocess
-    res = subprocess.run(["git", "rev-parse", "--show-toplevel"],
-                         capture_output=True,
-                         text=True)
-    root_dir = res.stdout.split('\n')[0]
+    res = subprocess.run(
+        ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
+    )
+    root_dir = res.stdout.split("\n")[0]

     def get_sha(path):
-        res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
-                             capture_output=True,
-                             text=True)
+        res = subprocess.run(
+            ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
+            capture_output=True,
+            text=True,
+        )
         # Output of the command is in the form '040000 tree|blob <SHA>\t<path>\n'
-        sha = res.stdout.split('\t')[0].split(' ')[-1]
+        sha = res.stdout.split("\t")[0].split(" ")[-1]
         return sha.encode()

     # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that
     m = hashlib.sha256()
-    m.update(get_sha('backends/neuron'))
-    m.update(get_sha('Dockerfile.neuron'))
+    m.update(get_sha("backends/neuron"))
+    m.update(get_sha("Dockerfile.neuron"))
     return m.hexdigest()[:10]

@@ -81,7 +111,9 @@ def get_tgi_docker_image():
     client = docker.from_env()
     images = client.images.list(filters={"reference": "text-generation-inference"})
     if not images:
-        raise ValueError("No text-generation-inference image found on this host to run tests.")
+        raise ValueError(
+            "No text-generation-inference image found on this host to run tests."
+        )
     docker_image = images[0].tags[0]
     return docker_image

@@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name):
     with tempfile.TemporaryDirectory() as context_dir:
         # Create entrypoint
         model_path = "/data/neuron_model"
-        export_command = f"optimum-cli export neuron -m {model_id} --task text-generation"
+        export_command = (
+            f"optimum-cli export neuron -m {model_id} --task text-generation"
+        )
         for kwarg, value in export_kwargs.items():
             export_command += f" --{kwarg} {str(value)}"
         export_command += f" {model_path}"
@@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-            image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image)
+            image, logs = client.images.build(
+                path=context_dir, dockerfile=f.name, tag=export_image
+            )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)

@@ -27,7 +27,9 @@ def get_tgi_docker_image():
     client = docker.from_env()
     images = client.images.list(filters={"reference": "text-generation-inference"})
     if not images:
-        raise ValueError("No text-generation-inference image found on this host to run tests.")
+        raise ValueError(
+            "No text-generation-inference image found on this host to run tests."
+        )
     docker_image = images[0].tags[0]
     return docker_image

@@ -131,13 +133,21 @@ def neuron_launcher(event_loop):
         except NotFound:
             pass

-    env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID}
+    env = {
+        "LOG_LEVEL": "info,text_generation_router=debug",
+        "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID,
+    }

     if HF_TOKEN is not None:
         env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
         env["HF_TOKEN"] = HF_TOKEN

-    for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]:
+    for var in [
+        "MAX_BATCH_SIZE",
+        "MAX_TOTAL_TOKENS",
+        "HF_AUTO_CAST_TYPE",
+        "HF_NUM_CORES",
+    ]:
         if var in os.environ:
             env[var] = os.environ[var]

@@ -165,7 +175,9 @@ def neuron_launcher(event_loop):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-            image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
+            image, logs = client.images.build(
+                path=context_dir, dockerfile=f.name, tag=test_image
+            )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)
     else:
@@ -204,7 +216,9 @@ def neuron_launcher(event_loop):
         try:
             container.remove(force=True)
         except Exception as e:
-            logger.error("Error while removing container %s, skipping", container_name)
+            logger.error(
+                "Error while removing container %s, skipping", container_name
+            )
             logger.exception(e)

     # Cleanup the build image
@@ -243,7 +257,12 @@ def neuron_generate_load():
         client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
     ) -> List[TextGenerationOutput]:
         futures = [
-            client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True)
+            client.text_generation(
+                prompt,
+                max_new_tokens=max_new_tokens,
+                details=True,
+                decoder_input_details=True,
+            )
             for _ in range(n)
         ]

@@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service):

     # Greedy bounded with input
     response = await tgi_service.client.text_generation(
-        "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 17
     assert response.generated_text == prompt + greedy_expectations[service_name]
@@ -1,7 +1,6 @@
 import os

 import pytest
-from huggingface_hub.errors import ValidationError


 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])