diff --git a/backends/neuron/server/text_generation_server/cli.py b/backends/neuron/server/text_generation_server/cli.py
index 409143a9..4a9c4734 100644
--- a/backends/neuron/server/text_generation_server/cli.py
+++ b/backends/neuron/server/text_generation_server/cli.py
@@ -61,7 +61,9 @@ def serve(
     )
 
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
 
     # Import here after the logger is added to log potential import exceptions
     from .server import serve
@@ -99,7 +101,9 @@ def download_weights(
     if extension is not None:
         logger.warning("'extension' argument is not supported and will be ignored.")
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
     if auto_convert is not None:
         logger.warning("'auto_convert' argument is not supported and will be ignored.")
     if merge_lora is not None:
diff --git a/backends/neuron/server/text_generation_server/generator.py b/backends/neuron/server/text_generation_server/generator.py
index 3ddee690..b3887e14 100644
--- a/backends/neuron/server/text_generation_server/generator.py
+++ b/backends/neuron/server/text_generation_server/generator.py
@@ -146,7 +146,9 @@ class Slot:
     def generated_tokens(self) -> int:
         return self._generated_tokens
 
-    def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
+    def assign(
+        self, batch_id: int, request: Request, generation_config: GenerationConfig
+    ):
         """Assign a request to a slot.
 
         Args:
@@ -174,15 +176,24 @@ class Slot:
             if request.parameters.typical_p != 0:
                 self._generation_config.typical_p = request.parameters.typical_p
         if request.parameters.repetition_penalty != 0:
-            self._generation_config.repetition_penalty = request.parameters.repetition_penalty
+            self._generation_config.repetition_penalty = (
+                request.parameters.repetition_penalty
+            )
         self.seed = request.parameters.seed
-        self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
+        self._generation_config.max_new_tokens = (
+            request.stopping_parameters.max_new_tokens
+        )
         self._max_new_tokens = self._generation_config.max_new_tokens
         stop_strings = request.stopping_parameters.stop_sequences
         if stop_strings:
             self._generation_config.stop_strings = stop_strings
 
-    def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
+    def reset(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        selector: TokenSelector,
+    ):
         """Reset the slot for the next generation.
 
         Args:
@@ -210,7 +221,9 @@ class Slot:
             self._generated_tokens -= 1
             # Since generated tokens are now part of the prefill, we need to reevaluate
             # max_new_tokens for the next generation
-            self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
+            self._generation_config.max_new_tokens = (
+                self._max_new_tokens - self._generated_tokens
+            )
         self._state = Slot.State.PAUSE
 
     def resume(self):
@@ -223,7 +236,9 @@ class Slot:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
         # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
+        new_text = self._tokenizer.decode(
+            self._tokens[self._next_text_token_start :], skip_special_tokens=False
+        )
         if new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
@@ -267,7 +282,9 @@ class Slot:
         self._next_text = next_text
         return next_text
 
-    def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
+    def select(
+        self, input_ids: torch.LongTensor, logits: torch.Tensor
+    ) -> torch.LongTensor:
         """Select the next token from the candidate logits.
 
         Args:
@@ -384,7 +401,9 @@ class NeuronGenerator(Generator):
                 f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
             )
         # Assign each request to an empty slot
-        logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
+        logger.debug(
+            f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)"
+        )
         new_slots = []
         for request in batch.requests:
             slot = empty_slots.pop()
@@ -417,7 +436,11 @@ class NeuronGenerator(Generator):
                 max_length = slot.truncate
         # Tokenize with padding and truncation
         padded_inputs = self.tokenizer(
-            inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_length,
         )
         input_ids = padded_inputs.input_ids
         attention_mask = padded_inputs.attention_mask
@@ -450,9 +473,13 @@ class NeuronGenerator(Generator):
                 slot.reset(slot_input_ids, slot_attention_mask, selector)
         # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
         # as they have already been generated and sent back in the last decode.
-        model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_prefill(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
-        generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
+        generation, next_batch = self._generate_token(
+            prefill_slots, self.batch_id, logits, input_ids
+        )
         self.batch_id += 1
         # Reactivate previously active slots for the next decode
         for i, slot in enumerate(active_slots):
@@ -462,10 +489,14 @@ class NeuronGenerator(Generator):
                 slot.append(next_tokens[i])
         logger.debug("Model ready for decoding")
         if next_batch is not None:
-            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
+            logger.debug(
+                f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}"
+            )
         return generation, next_batch
 
-    def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
+    def decode(
+        self, batches: List[CachedBatch]
+    ) -> Tuple[List[Generation], CachedBatch]:
         """Decode the specified prefilled requests.
 
         Args:
@@ -491,10 +522,14 @@ class NeuronGenerator(Generator):
                 cleared_request_ids.append(slot.request_id)
                 slot.clear()
         if len(cleared_request_ids) > 0:
-            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
+            logger.info(
+                f"Clearing slot for requests {cleared_request_ids} as they are not requested."
+            )
         active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
         if len(active_slots) < len(request_ids):
-            raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
+            raise ValueError(
+                "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
+            )
         if self.model.continuous_batching:
             decode_slots = active_slots
             seq_ids = torch.tensor([slot.id for slot in decode_slots])
@@ -503,7 +538,9 @@ class NeuronGenerator(Generator):
             seq_ids = None
         # Reconstruct input_ids and attention_mask from decode slots
         n_slots = len(decode_slots)
-        input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
+        input_ids = torch.full(
+            [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64
+        )
         max_length = 0
         for slot in decode_slots:
             max_length = max(max_length, slot.attention_mask.size(-1))
@@ -513,12 +550,18 @@ class NeuronGenerator(Generator):
                 # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
                 input_ids[i, 0] = slot.next_token
                 attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
-        model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_decode(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
         return self._generate_token(decode_slots, next_batch_id, logits, input_ids)
 
     def _generate_token(
-        self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
+        self,
+        slots: List[Slot],
+        next_batch_id: int,
+        logits: torch.Tensor,
+        input_ids: torch.LongTensor,
     ) -> Tuple[List[Generation], CachedBatch]:
         generations = []
         active_slots = False
@@ -542,9 +585,13 @@ class NeuronGenerator(Generator):
             if finish_reason is not None:
                 # We must include the generated text for each finished sequence in the response
                 generated_text = GeneratedText(
-                    text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
+                    text=slot.generated_text,
+                    generated_tokens=slot.generated_tokens,
+                    finish_reason=finish_reason,
+                )
+                logger.debug(
+                    f"Decode complete for request {request_id} with {slot.generated_tokens} tokens"
                 )
-                logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
                 # mark the slot as available
                 slot.clear()
             else:
@@ -565,7 +612,9 @@ class NeuronGenerator(Generator):
         batch = None
         if active_slots:
             # Whatever initial batch these requests came from, we always return all pending requests in a single batch
-            request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
+            request_ids = [
+                slot.request_id for slot in self.slots if slot.state == Slot.State.READY
+            ]
             batch = self._cached_batch(next_batch_id, request_ids)
         else:
             logger.debug("No more pending requests")
@@ -574,7 +623,9 @@ class NeuronGenerator(Generator):
     def _cached_batch(self, batch_id: int, request_ids: List):
         size = len(request_ids)
         max_tokens = size * self.model.max_length
-        return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)
+        return CachedBatch(
+            id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
+        )
 
     def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
         """Remove requests that are not listed from the specified batch
@@ -588,7 +639,9 @@ class NeuronGenerator(Generator):
         Return:
             A `CachedBatch` containing the pending requests.
         """
-        keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
+        keep_slot_ids = [
+            slot.id for slot in self.slots if slot.request_id in keep_request_ids
+        ]
         self._clear(keep_slot_ids)
         return self._cached_batch(batch_id, keep_request_ids)
 
@@ -625,11 +678,19 @@ class NeuronGenerator(Generator):
             export_kwargs = get_export_kwargs_from_env()
             logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
             model = NeuronModelForCausalLM.from_pretrained(
-                model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
+                model_id,
+                revision=revision,
+                low_cpu_mem_usage=True,
+                export=True,
+                **export_kwargs,
             )
         else:
-            logger.info("Loading model on neuron devices (this can take a few minutes).")
-            model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
+            logger.info(
+                "Loading model on neuron devices (this can take a few minutes)."
+            )
+            model = NeuronModelForCausalLM.from_pretrained(
+                model_id, low_cpu_mem_usage=True, revision=revision
+            )
         end = time.time()
         logger.info(f"Model successfully loaded in {end - start:.2f} s.")
         tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
diff --git a/backends/neuron/server/text_generation_server/interceptor.py b/backends/neuron/server/text_generation_server/interceptor.py
index ed29cdf2..301cafd8 100644
--- a/backends/neuron/server/text_generation_server/interceptor.py
+++ b/backends/neuron/server/text_generation_server/interceptor.py
@@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             logger.exception(f"Method {method_name} encountered an error.")
 
             await context.abort_with_status(
-                rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
+                rpc_status.to_status(
+                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
+                )
             )
diff --git a/backends/neuron/server/text_generation_server/model.py b/backends/neuron/server/text_generation_server/model.py
index e8cb34ee..2151a921 100644
--- a/backends/neuron/server/text_generation_server/model.py
+++ b/backends/neuron/server/text_generation_server/model.py
@@ -56,7 +56,9 @@ def log_cache_size():
     if os.path.exists(path):
         usage = shutil.disk_usage(path)
         gb = 2**30
-        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
+        logger.info(
+            f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G"
+        )
     else:
         raise ValueError(f"The cache directory ({path}) does not exist.")
 
@@ -79,7 +81,9 @@ def fetch_model(
     if not os.path.isdir("/sys/class/neuron_device/"):
         raise SystemError("No neuron cores detected on the host.")
     if os.path.isdir(model_id) and revision is not None:
-        logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
+        logger.warning(
+            "Revision {} ignored for local model at {}".format(revision, model_id)
+        )
         revision = None
     # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
     # Note that the model may already be present in the cache.
@@ -89,12 +93,16 @@ def fetch_model(
         if os.path.isdir(model_id):
             return model_id
         # Prefetch the neuron model from the Hub
-        logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
+        logger.info(
+            f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}"
+        )
         log_cache_size()
         return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
     # Model needs to be exported: look for compatible cached entries on the hub
     export_kwargs = get_export_kwargs_from_env()
-    export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
+    export_config = NeuronModelForCausalLM.get_export_config(
+        model_id, config, revision=revision, **export_kwargs
+    )
     neuron_config = export_config.neuron
     if not is_cached(model_id, neuron_config):
         hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
@@ -105,7 +113,9 @@ def fetch_model(
             f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
         )
         raise ValueError(error_msg)
-    logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
+    logger.warning(
+        f"{model_id} is not a neuron model: it will be exported using cached artifacts."
+    )
     if os.path.isdir(model_id):
         return model_id
     # Prefetch weights, tokenizer and generation config so that they are in cache
diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py
index 6fa63ce8..4b6a1375 100644
--- a/backends/neuron/tests/fixtures/model.py
+++ b/backends/neuron/tests/fixtures/model.py
@@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }
 
 
 def get_hub_neuron_model_id(config_name: str):
-    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    return (
+        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    )
 
 
 def export_model(model_id, export_kwargs, neuron_model_path):
-    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    export_command = [
+        "optimum-cli",
+        "export",
+        "neuron",
+        "-m",
+        model_id,
+        "--task",
+        "text-generation",
+    ]
     for kwarg, value in export_kwargs.items():
         export_command.append(f"--{kwarg}")
         export_command.append(str(value))
diff --git a/backends/neuron/tests/prune_test_models.py b/backends/neuron/tests/prune_test_models.py
index 592b1070..448962fb 100644
--- a/backends/neuron/tests/prune_test_models.py
+++ b/backends/neuron/tests/prune_test_models.py
@@ -1,5 +1,3 @@
-import os
-
 from argparse import ArgumentParser
 from huggingface_hub import HfApi
 
@@ -15,7 +13,7 @@ def main():
             delete = True
         else:
             answer = input(f"Do you want to delete {model.id} [y/N] ?")
-            delete = (answer == "y")
+            delete = answer == "y"
         if delete:
             api.delete_repo(model.id)
             print(f"Deleted {model.id}.")
diff --git a/backends/neuron/tests/server/helpers.py b/backends/neuron/tests/server/helpers.py
index 81547cb6..f0f81d06 100644
--- a/backends/neuron/tests/server/helpers.py
+++ b/backends/neuron/tests/server/helpers.py
@@ -29,22 +29,42 @@ def create_request(
     )
     stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens)
     return Request(
-        id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters
+        id=id,
+        inputs=inputs,
+        truncate=truncate,
+        parameters=parameters,
+        stopping_parameters=stopping_parameters,
     )
 
 
-def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
+def check_prefill(
+    input_text,
+    expected_token_id,
+    expected_token_text,
+    do_sample,
+    batch_size,
+    model_path,
+):
     """Verify that a prefill for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
     assert generator.model.batch_size >= batch_size
     requests = []
     max_new_tokens = 20
     for i in range(batch_size):
-        requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=0,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     batch_size * (len(input_text) + max_new_tokens)
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample,
         assert tokens.texts == [expected_token_text]
 
 
-def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path):
+def check_decode_single(
+    input_text, max_new_tokens, generated_text, do_sample, model_path
+):
     """Verify that a decoding for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py
index 2ab4c2da..9db5e3ab 100644
--- a/backends/neuron/tests/server/test_decode.py
+++ b/backends/neuron/tests/server/test_decode.py
@@ -16,9 +16,13 @@ def test_decode(neuron_model_config):
 
 
 def _test_decode(config_name, generator, do_sample):
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     max_new_tokens = 20
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
diff --git a/backends/neuron/tests/server/test_generator_slot.py b/backends/neuron/tests/server/test_generator_slot.py
index 459ee3e5..0c03e9d1 100644
--- a/backends/neuron/tests/server/test_generator_slot.py
+++ b/backends/neuron/tests/server/test_generator_slot.py
@@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text):
     slot.assign(0, request, GenerationConfig())
     assert slot.cached_text == input_text
 
-    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
+    inputs = tokenizer(
+        input_text,
+        padding="max_length",
+        max_length=len(input_text) + 1,
+        return_tensors="pt",
+    )
     input_ids = inputs["input_ids"][0]
     attention_mask = inputs["attention_mask"][0]
     generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]
diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py
index 2120e5c5..c0155b1a 100644
--- a/backends/neuron/tests/server/test_prefill.py
+++ b/backends/neuron/tests/server/test_prefill.py
@@ -21,12 +21,23 @@ def test_prefill(neuron_model_config):
 def _test_prefill(config_name, generator, batch_size, do_sample):
     requests = []
     max_new_tokens = 20
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     for i in range(batch_size):
-        requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=i,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config):
     for i in range(batch_size):
         requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, _ = generator.prefill(batch)
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
diff --git a/backends/neuron/tgi_env.py b/backends/neuron/tgi_env.py
index ff647c98..a7042130 100755
--- a/backends/neuron/tgi_env.py
+++ b/backends/neuron/tgi_env.py
@@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version
 
 logger = logging.getLogger(__name__)
 
-tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
+tgi_router_env_vars = [
+    "MAX_BATCH_SIZE",
+    "MAX_TOTAL_TOKENS",
+    "MAX_INPUT_TOKENS",
+    "MAX_BATCH_PREFILL_TOKENS",
+]
 tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]
 
 env_config_peering = [
@@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
         argv = sys.argv
     # All these are params passed to tgi and intercepted here
     parser.add_argument(
-        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+        "--max-input-tokens",
+        type=int,
+        default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)),
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)
+    )
+    parser.add_argument(
+        "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)
+    )
+    parser.add_argument(
+        "--max-batch-prefill-tokens",
+        type=int,
+        default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0),
     )
-    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
-    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
-    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
     parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
     parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))
 
     args = parser.parse_known_args(argv)[0]
 
     if not args.model_id:
-        raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
+        raise Exception(
+            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
+        )
 
     # Override env with cmdline params
     os.environ["MODEL_ID"] = args.model_id
@@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config):
         f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
         max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
         if not max_batch_prefill_tokens:
-            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
+            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(
+                max_input_tokens
+            )
         f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))
 
 
@@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary):
     return -dictionary["num_cores"], -dictionary["batch_size"]
 
 
-def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
+def lookup_compatible_cached_model(
+    model_id: str, revision: Optional[str]
+) -> Optional[Dict[str, Any]]:
     # Reuse the same mechanic as the one in use to configure the tgi server part
     # The only difference here is that we stay as flexible as possible on the compatibility part
     entries = get_hub_cached_entries(model_id, "inference")
 
-    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)
+    logger.debug(
+        "Found %d cached entries for model %s, revision %s",
+        len(entries),
+        model_id,
+        revision,
+    )
 
     all_compatible = []
     for entry in entries:
-        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
+        if check_env_and_neuron_config_compatibility(
+            entry, check_compiler_version=True
+        ):
             all_compatible.append(entry)
 
     if not all_compatible:
@@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op
     return entry
 
 
-def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
+def check_env_and_neuron_config_compatibility(
+    neuron_config: Dict[str, Any], check_compiler_version: bool
+) -> bool:
     logger.debug(
         "Checking the provided neuron config %s is compatible with the local setup and provided environment",
         neuron_config,
@@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
 
     # Local setup compat checks
     if neuron_config["num_cores"] > available_cores:
-        logger.debug("Not enough neuron cores available to run the provided neuron config")
+        logger.debug(
+            "Not enough neuron cores available to run the provided neuron config"
+        )
         return False
 
-    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
+    if (
+        check_compiler_version
+        and neuron_config["compiler_version"] != neuronxcc_version
+    ):
         logger.debug(
             "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
             neuronxcc_version,
@@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
             )
             return False
 
-    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
+    max_input_tokens = int(
+        os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+    )
     if max_input_tokens > 0:
         sequence_length = neuron_config["sequence_length"]
         if max_input_tokens >= sequence_length:
@@ -191,7 +228,10 @@ def main():
         if not os.getenv(env_var):
             break
     else:
-        logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars)
+        logger.info(
+            "All env vars %s already set, skipping, user know what they are doing",
+            env_vars,
+        )
         sys.exit(0)
 
     cache_dir = constants.HF_HUB_CACHE
@@ -201,7 +241,9 @@ def main():
     config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
     neuron_config = getattr(config, "neuron", None)
     if neuron_config is not None:
-        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
+        compatible = check_env_and_neuron_config_compatibility(
+            neuron_config, check_compiler_version=False
+        )
         if not compatible:
             env_dict = get_env_dict()
             msg = (
@@ -213,9 +255,9 @@ def main():
         neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
 
     if not neuron_config:
-        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
-            get_env_dict(), available_cores, neuronxcc_version
-        )
+        msg = (
+            "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
+        ).format(get_env_dict(), available_cores, neuronxcc_version)
         logger.error(msg)
         raise Exception(msg)
 
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index ed27e058..e0451052 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items):
         def skip_release(item):
             if "release" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="need --release option to run"))
+
         selectors.append(skip_release)
     if config.getoption("--neuron"):
+
         def skip_not_neuron(item):
             if "neuron" not in item.keywords:
-                item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option"))
+                item.add_marker(
+                    pytest.mark.skip(reason="incompatible with --neuron option")
+                )
+
         selectors.append(skip_not_neuron)
     else:
+
         def skip_neuron(item):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
+
         selectors.append(skip_neuron)
     for item in items:
         for selector in selectors:
diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py
index 10f1c84e..3345e2ea 100644
--- a/integration-tests/fixtures/neuron/model.py
+++ b/integration-tests/fixtures/neuron/model.py
@@ -30,44 +30,74 @@ logger = logging.getLogger(__file__)
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "unsloth/Llama-3.2-1B-Instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }
 
 
 def get_neuron_backend_hash():
     import subprocess
-    res = subprocess.run(["git", "rev-parse", "--show-toplevel"],
-                         capture_output=True,
-                         text=True)
-    root_dir = res.stdout.split('\n')[0]
+
+    res = subprocess.run(
+        ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
+    )
+    root_dir = res.stdout.split("\n")[0]
+
     def get_sha(path):
-        res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
-                             capture_output=True,
-                             text=True)
+        res = subprocess.run(
+            ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
+            capture_output=True,
+            text=True,
+        )
         # Output of the command is in the form '040000 tree|blob <SHA>\t<path>\n'
-        sha = res.stdout.split('\t')[0].split(' ')[-1]
+        sha = res.stdout.split("\t")[0].split(" ")[-1]
         return sha.encode()
+
     # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that
     m = hashlib.sha256()
-    m.update(get_sha('backends/neuron'))
-    m.update(get_sha('Dockerfile.neuron'))
+    m.update(get_sha("backends/neuron"))
+    m.update(get_sha("Dockerfile.neuron"))
     return m.hexdigest()[:10]
 
 
@@ -81,7 +111,9 @@ def get_tgi_docker_image():
         client = docker.from_env()
         images = client.images.list(filters={"reference": "text-generation-inference"})
         if not images:
-            raise ValueError("No text-generation-inference image found on this host to run tests.")
+            raise ValueError(
+                "No text-generation-inference image found on this host to run tests."
+            )
         docker_image = images[0].tags[0]
     return docker_image
 
@@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name):
     with tempfile.TemporaryDirectory() as context_dir:
         # Create entrypoint
         model_path = "/data/neuron_model"
-        export_command = f"optimum-cli export neuron -m {model_id} --task text-generation"
+        export_command = (
+            f"optimum-cli export neuron -m {model_id} --task text-generation"
+        )
         for kwarg, value in export_kwargs.items():
             export_command += f" --{kwarg} {str(value)}"
         export_command += f" {model_path}"
@@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-        image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image)
+        image, logs = client.images.build(
+            path=context_dir, dockerfile=f.name, tag=export_image
+        )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)
 
diff --git a/integration-tests/fixtures/neuron/service.py b/integration-tests/fixtures/neuron/service.py
index 927a35af..39241d6f 100644
--- a/integration-tests/fixtures/neuron/service.py
+++ b/integration-tests/fixtures/neuron/service.py
@@ -27,7 +27,9 @@ def get_tgi_docker_image():
         client = docker.from_env()
         images = client.images.list(filters={"reference": "text-generation-inference"})
         if not images:
-            raise ValueError("No text-generation-inference image found on this host to run tests.")
+            raise ValueError(
+                "No text-generation-inference image found on this host to run tests."
+            )
         docker_image = images[0].tags[0]
     return docker_image
 
@@ -131,13 +133,21 @@ def neuron_launcher(event_loop):
         except NotFound:
             pass
 
-        env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID}
+        env = {
+            "LOG_LEVEL": "info,text_generation_router=debug",
+            "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID,
+        }
 
         if HF_TOKEN is not None:
             env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
             env["HF_TOKEN"] = HF_TOKEN
 
-        for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]:
+        for var in [
+            "MAX_BATCH_SIZE",
+            "MAX_TOTAL_TOKENS",
+            "HF_AUTO_CAST_TYPE",
+            "HF_NUM_CORES",
+        ]:
             if var in os.environ:
                 env[var] = os.environ[var]
 
@@ -165,7 +175,9 @@ def neuron_launcher(event_loop):
                 with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
                     f.write(docker_content.encode("utf-8"))
                     f.flush()
-                image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
+                image, logs = client.images.build(
+                    path=context_dir, dockerfile=f.name, tag=test_image
+                )
             logger.info("Successfully built image %s", image.id)
             logger.debug("Build logs %s", logs)
         else:
@@ -204,7 +216,9 @@ def neuron_launcher(event_loop):
             try:
                 container.remove(force=True)
             except Exception as e:
-                logger.error("Error while removing container %s, skipping", container_name)
+                logger.error(
+                    "Error while removing container %s, skipping", container_name
+                )
                 logger.exception(e)
 
             # Cleanup the build image
@@ -243,7 +257,12 @@ def neuron_generate_load():
         client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
     ) -> List[TextGenerationOutput]:
         futures = [
-            client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True)
+            client.text_generation(
+                prompt,
+                max_new_tokens=max_new_tokens,
+                details=True,
+                decoder_input_details=True,
+            )
             for _ in range(n)
         ]
 
diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/integration/test_generate.py
index 96f09838..6a1b4990 100644
--- a/integration-tests/neuron/integration/test_generate.py
+++ b/integration-tests/neuron/integration/test_generate.py
@@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service):
 
     # Greedy bounded with input
     response = await tgi_service.client.text_generation(
-        "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 17
     assert response.generated_text == prompt + greedy_expectations[service_name]
diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/integration/test_implicit_env.py
index cf0c8062..7eee7b33 100644
--- a/integration-tests/neuron/integration/test_implicit_env.py
+++ b/integration-tests/neuron/integration/test_implicit_env.py
@@ -1,7 +1,6 @@
 import os
 
 import pytest
-from huggingface_hub.errors import ValidationError
 
 
 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])