diff --git a/backends/neuron/server/text_generation_server/cli.py b/backends/neuron/server/text_generation_server/cli.py index 409143a91..4a9c47345 100644 --- a/backends/neuron/server/text_generation_server/cli.py +++ b/backends/neuron/server/text_generation_server/cli.py @@ -61,7 +61,9 @@ def serve( ) if trust_remote_code is not None: - logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + logger.warning( + "'trust_remote_code' argument is not supported and will be ignored." + ) # Import here after the logger is added to log potential import exceptions from .server import serve @@ -99,7 +101,9 @@ def download_weights( if extension is not None: logger.warning("'extension' argument is not supported and will be ignored.") if trust_remote_code is not None: - logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + logger.warning( + "'trust_remote_code' argument is not supported and will be ignored." + ) if auto_convert is not None: logger.warning("'auto_convert' argument is not supported and will be ignored.") if merge_lora is not None: diff --git a/backends/neuron/server/text_generation_server/generator.py b/backends/neuron/server/text_generation_server/generator.py index 3ddee690c..b3887e14c 100644 --- a/backends/neuron/server/text_generation_server/generator.py +++ b/backends/neuron/server/text_generation_server/generator.py @@ -146,7 +146,9 @@ class Slot: def generated_tokens(self) -> int: return self._generated_tokens - def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig): + def assign( + self, batch_id: int, request: Request, generation_config: GenerationConfig + ): """Assign a request to a slot. Args: @@ -174,15 +176,24 @@ class Slot: if request.parameters.typical_p != 0: self._generation_config.typical_p = request.parameters.typical_p if request.parameters.repetition_penalty != 0: - self._generation_config.repetition_penalty = request.parameters.repetition_penalty + self._generation_config.repetition_penalty = ( + request.parameters.repetition_penalty + ) self.seed = request.parameters.seed - self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens + self._generation_config.max_new_tokens = ( + request.stopping_parameters.max_new_tokens + ) self._max_new_tokens = self._generation_config.max_new_tokens stop_strings = request.stopping_parameters.stop_sequences if stop_strings: self._generation_config.stop_strings = stop_strings - def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector): + def reset( + self, + input_ids: torch.LongTensor, + attention_mask: torch.LongTensor, + selector: TokenSelector, + ): """Reset the slot for the next generation. Args: @@ -210,7 +221,9 @@ class Slot: self._generated_tokens -= 1 # Since generated tokens are now part of the prefill, we need to reevaluate # max_new_tokens for the next generation - self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens + self._generation_config.max_new_tokens = ( + self._max_new_tokens - self._generated_tokens + ) self._state = Slot.State.PAUSE def resume(self): @@ -223,7 +236,9 @@ class Slot: """Hack to hopefully support generate_stream for the maximum number of tokenizers""" # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode # which decide to add a space or not depending on the surrounding ids. - new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False) + new_text = self._tokenizer.decode( + self._tokens[self._next_text_token_start :], skip_special_tokens=False + ) if new_text.endswith("�"): # utf-8 char at the end means it's a potential unfinished byte sequence # from byte fallback tokenization. @@ -267,7 +282,9 @@ class Slot: self._next_text = next_text return next_text - def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor: + def select( + self, input_ids: torch.LongTensor, logits: torch.Tensor + ) -> torch.LongTensor: """Select the next token from the candidate logits. Args: @@ -384,7 +401,9 @@ class NeuronGenerator(Generator): f" Please align max_batch_size with the static batch size: {self.model.batch_size}." ) # Assign each request to an empty slot - logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)") + logger.debug( + f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)" + ) new_slots = [] for request in batch.requests: slot = empty_slots.pop() @@ -417,7 +436,11 @@ class NeuronGenerator(Generator): max_length = slot.truncate # Tokenize with padding and truncation padded_inputs = self.tokenizer( - inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length + inputs, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length, ) input_ids = padded_inputs.input_ids attention_mask = padded_inputs.attention_mask @@ -450,9 +473,13 @@ class NeuronGenerator(Generator): slot.reset(slot_input_ids, slot_attention_mask, selector) # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored, # as they have already been generated and sent back in the last decode. - model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids) + model_inputs = self.model.prepare_inputs_for_prefill( + input_ids, attention_mask, seq_ids + ) logits = self.model(**model_inputs)[0] - generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids) + generation, next_batch = self._generate_token( + prefill_slots, self.batch_id, logits, input_ids + ) self.batch_id += 1 # Reactivate previously active slots for the next decode for i, slot in enumerate(active_slots): @@ -462,10 +489,14 @@ class NeuronGenerator(Generator): slot.append(next_tokens[i]) logger.debug("Model ready for decoding") if next_batch is not None: - logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}") + logger.debug( + f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}" + ) return generation, next_batch - def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]: + def decode( + self, batches: List[CachedBatch] + ) -> Tuple[List[Generation], CachedBatch]: """Decode the specified prefilled requests. Args: @@ -491,10 +522,14 @@ class NeuronGenerator(Generator): cleared_request_ids.append(slot.request_id) slot.clear() if len(cleared_request_ids) > 0: - logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.") + logger.info( + f"Clearing slot for requests {cleared_request_ids} as they are not requested." + ) active_slots = [slot for slot in self.slots if slot.state == slot.State.READY] if len(active_slots) < len(request_ids): - raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)") + raise ValueError( + "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)" + ) if self.model.continuous_batching: decode_slots = active_slots seq_ids = torch.tensor([slot.id for slot in decode_slots]) @@ -503,7 +538,9 @@ class NeuronGenerator(Generator): seq_ids = None # Reconstruct input_ids and attention_mask from decode slots n_slots = len(decode_slots) - input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64) + input_ids = torch.full( + [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64 + ) max_length = 0 for slot in decode_slots: max_length = max(max_length, slot.attention_mask.size(-1)) @@ -513,12 +550,18 @@ class NeuronGenerator(Generator): # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached) input_ids[i, 0] = slot.next_token attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask - model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids) + model_inputs = self.model.prepare_inputs_for_decode( + input_ids, attention_mask, seq_ids + ) logits = self.model(**model_inputs)[0] return self._generate_token(decode_slots, next_batch_id, logits, input_ids) def _generate_token( - self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor + self, + slots: List[Slot], + next_batch_id: int, + logits: torch.Tensor, + input_ids: torch.LongTensor, ) -> Tuple[List[Generation], CachedBatch]: generations = [] active_slots = False @@ -542,9 +585,13 @@ class NeuronGenerator(Generator): if finish_reason is not None: # We must include the generated text for each finished sequence in the response generated_text = GeneratedText( - text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason + text=slot.generated_text, + generated_tokens=slot.generated_tokens, + finish_reason=finish_reason, + ) + logger.debug( + f"Decode complete for request {request_id} with {slot.generated_tokens} tokens" ) - logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens") # mark the slot as available slot.clear() else: @@ -565,7 +612,9 @@ class NeuronGenerator(Generator): batch = None if active_slots: # Whatever initial batch these requests came from, we always return all pending requests in a single batch - request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY] + request_ids = [ + slot.request_id for slot in self.slots if slot.state == Slot.State.READY + ] batch = self._cached_batch(next_batch_id, request_ids) else: logger.debug("No more pending requests") @@ -574,7 +623,9 @@ class NeuronGenerator(Generator): def _cached_batch(self, batch_id: int, request_ids: List): size = len(request_ids) max_tokens = size * self.model.max_length - return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens) + return CachedBatch( + id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens + ) def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch: """Remove requests that are not listed from the specified batch @@ -588,7 +639,9 @@ class NeuronGenerator(Generator): Return: A `CachedBatch` containing the pending requests. """ - keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids] + keep_slot_ids = [ + slot.id for slot in self.slots if slot.request_id in keep_request_ids + ] self._clear(keep_slot_ids) return self._cached_batch(batch_id, keep_request_ids) @@ -625,11 +678,19 @@ class NeuronGenerator(Generator): export_kwargs = get_export_kwargs_from_env() logger.info(f"Exporting model to neuron with config: {export_kwargs}.") model = NeuronModelForCausalLM.from_pretrained( - model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs + model_id, + revision=revision, + low_cpu_mem_usage=True, + export=True, + **export_kwargs, ) else: - logger.info("Loading model on neuron devices (this can take a few minutes).") - model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision) + logger.info( + "Loading model on neuron devices (this can take a few minutes)." + ) + model = NeuronModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, revision=revision + ) end = time.time() logger.info(f"Model successfully loaded in {end - start:.2f} s.") tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) diff --git a/backends/neuron/server/text_generation_server/interceptor.py b/backends/neuron/server/text_generation_server/interceptor.py index ed29cdf2c..301cafd87 100644 --- a/backends/neuron/server/text_generation_server/interceptor.py +++ b/backends/neuron/server/text_generation_server/interceptor.py @@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor): logger.exception(f"Method {method_name} encountered an error.") await context.abort_with_status( - rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))) + rpc_status.to_status( + status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)) + ) ) diff --git a/backends/neuron/server/text_generation_server/model.py b/backends/neuron/server/text_generation_server/model.py index e8cb34ee1..2151a9218 100644 --- a/backends/neuron/server/text_generation_server/model.py +++ b/backends/neuron/server/text_generation_server/model.py @@ -56,7 +56,9 @@ def log_cache_size(): if os.path.exists(path): usage = shutil.disk_usage(path) gb = 2**30 - logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G") + logger.info( + f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G" + ) else: raise ValueError(f"The cache directory ({path}) does not exist.") @@ -79,7 +81,9 @@ def fetch_model( if not os.path.isdir("/sys/class/neuron_device/"): raise SystemError("No neuron cores detected on the host.") if os.path.isdir(model_id) and revision is not None: - logger.warning("Revision {} ignored for local model at {}".format(revision, model_id)) + logger.warning( + "Revision {} ignored for local model at {}".format(revision, model_id) + ) revision = None # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model) # Note that the model may already be present in the cache. @@ -89,12 +93,16 @@ def fetch_model( if os.path.isdir(model_id): return model_id # Prefetch the neuron model from the Hub - logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}") + logger.info( + f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}" + ) log_cache_size() return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin") # Model needs to be exported: look for compatible cached entries on the hub export_kwargs = get_export_kwargs_from_env() - export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs) + export_config = NeuronModelForCausalLM.get_export_config( + model_id, config, revision=revision, **export_kwargs + ) neuron_config = export_config.neuron if not is_cached(model_id, neuron_config): hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache" @@ -105,7 +113,9 @@ def fetch_model( f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}" ) raise ValueError(error_msg) - logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.") + logger.warning( + f"{model_id} is not a neuron model: it will be exported using cached artifacts." + ) if os.path.isdir(model_id): return model_id # Prefetch weights, tokenizer and generation config so that they are in cache diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py index 6fa63ce86..4b6a1375d 100644 --- a/backends/neuron/tests/fixtures/model.py +++ b/backends/neuron/tests/fixtures/model.py @@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache" MODEL_CONFIGURATIONS = { "gpt2": { "model_id": "gpt2", - "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 1024, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "llama": { "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B", - "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 2048, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "mistral": { "model_id": "optimum/mistral-1.1b-testing", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, "qwen2": { "model_id": "Qwen/Qwen2.5-0.5B", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "granite": { "model_id": "ibm-granite/granite-3.1-2b-instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, } def get_hub_neuron_model_id(config_name: str): - return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + return ( + f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + ) def export_model(model_id, export_kwargs, neuron_model_path): - export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"] + export_command = [ + "optimum-cli", + "export", + "neuron", + "-m", + model_id, + "--task", + "text-generation", + ] for kwarg, value in export_kwargs.items(): export_command.append(f"--{kwarg}") export_command.append(str(value)) diff --git a/backends/neuron/tests/prune_test_models.py b/backends/neuron/tests/prune_test_models.py index 592b1070e..448962fb6 100644 --- a/backends/neuron/tests/prune_test_models.py +++ b/backends/neuron/tests/prune_test_models.py @@ -1,5 +1,3 @@ -import os - from argparse import ArgumentParser from huggingface_hub import HfApi @@ -15,7 +13,7 @@ def main(): delete = True else: answer = input(f"Do you want to delete {model.id} [y/N] ?") - delete = (answer == "y") + delete = answer == "y" if delete: api.delete_repo(model.id) print(f"Deleted {model.id}.") diff --git a/backends/neuron/tests/server/helpers.py b/backends/neuron/tests/server/helpers.py index 81547cb6a..f0f81d06d 100644 --- a/backends/neuron/tests/server/helpers.py +++ b/backends/neuron/tests/server/helpers.py @@ -29,22 +29,42 @@ def create_request( ) stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens) return Request( - id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters + id=id, + inputs=inputs, + truncate=truncate, + parameters=parameters, + stopping_parameters=stopping_parameters, ) -def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path): +def check_prefill( + input_text, + expected_token_id, + expected_token_text, + do_sample, + batch_size, + model_path, +): """Verify that a prefill for a single request generates the expected output.""" generator = NeuronGenerator.from_pretrained(model_path) assert generator.model.batch_size >= batch_size requests = [] max_new_tokens = 20 for i in range(batch_size): - requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + requests.append( + create_request( + id=0, + inputs=input_text, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + ) + ) # Let's be pessimistic when estimating max_tokens batch_size * (len(input_text) + max_new_tokens) max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, next_batch = generator.prefill(batch) assert next_batch.size == batch_size # Whatever was passed as max_tokens, the server will correct it @@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, assert tokens.texts == [expected_token_text] -def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path): +def check_decode_single( + input_text, max_new_tokens, generated_text, do_sample, model_path +): """Verify that a decoding for a single request generates the expected output.""" generator = NeuronGenerator.from_pretrained(model_path) - request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + request = create_request( + id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample + ) max_length = generator.model.max_length batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) generations, next_batch = generator.prefill(batch) diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py index 2ab4c2da0..9db5e3abb 100644 --- a/backends/neuron/tests/server/test_decode.py +++ b/backends/neuron/tests/server/test_decode.py @@ -16,9 +16,13 @@ def test_decode(neuron_model_config): def _test_decode(config_name, generator, do_sample): - input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + input_text = ( + "It was a bright cold day in April, and the clocks were striking thirteen." + ) max_new_tokens = 20 - request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + request = create_request( + id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample + ) max_length = generator.model.max_length batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) generations, next_batch = generator.prefill(batch) diff --git a/backends/neuron/tests/server/test_generator_slot.py b/backends/neuron/tests/server/test_generator_slot.py index 459ee3e5b..0c03e9d1e 100644 --- a/backends/neuron/tests/server/test_generator_slot.py +++ b/backends/neuron/tests/server/test_generator_slot.py @@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text): slot.assign(0, request, GenerationConfig()) assert slot.cached_text == input_text - inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt") + inputs = tokenizer( + input_text, + padding="max_length", + max_length=len(input_text) + 1, + return_tensors="pt", + ) input_ids = inputs["input_ids"][0] attention_mask = inputs["attention_mask"][0] generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"] diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index 2120e5c59..c0155b1a1 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -21,12 +21,23 @@ def test_prefill(neuron_model_config): def _test_prefill(config_name, generator, batch_size, do_sample): requests = [] max_new_tokens = 20 - input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + input_text = ( + "It was a bright cold day in April, and the clocks were striking thirteen." + ) for i in range(batch_size): - requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + requests.append( + create_request( + id=i, + inputs=input_text, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + ) + ) # Let's be pessimistic when estimating max_tokens max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, next_batch = generator.prefill(batch) assert next_batch.size == batch_size # Whatever was passed as max_tokens, the server will correct it @@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config): for i in range(batch_size): requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i])) max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, _ = generator.prefill(batch) # Even if the input text is identical for all requests, the first generated token might # be different because of the truncation diff --git a/backends/neuron/tgi_env.py b/backends/neuron/tgi_env.py index ff647c988..a7042130b 100755 --- a/backends/neuron/tgi_env.py +++ b/backends/neuron/tgi_env.py @@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version logger = logging.getLogger(__name__) -tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"] +tgi_router_env_vars = [ + "MAX_BATCH_SIZE", + "MAX_TOTAL_TOKENS", + "MAX_INPUT_TOKENS", + "MAX_BATCH_PREFILL_TOKENS", +] tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] env_config_peering = [ @@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: argv = sys.argv # All these are params passed to tgi and intercepted here parser.add_argument( - "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + "--max-input-tokens", + type=int, + default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)), + ) + parser.add_argument( + "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0) + ) + parser.add_argument( + "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0) + ) + parser.add_argument( + "--max-batch-prefill-tokens", + type=int, + default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0), ) - parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)) - parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)) - parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0)) parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID")) parser.add_argument("--revision", type=str, default=os.getenv("REVISION")) args = parser.parse_known_args(argv)[0] if not args.model_id: - raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var") + raise Exception( + "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var" + ) # Override env with cmdline params os.environ["MODEL_ID"] = args.model_id @@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config): f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens)) max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS") if not max_batch_prefill_tokens: - max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens) + max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int( + max_input_tokens + ) f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens)) @@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary): return -dictionary["num_cores"], -dictionary["batch_size"] -def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]: +def lookup_compatible_cached_model( + model_id: str, revision: Optional[str] +) -> Optional[Dict[str, Any]]: # Reuse the same mechanic as the one in use to configure the tgi server part # The only difference here is that we stay as flexible as possible on the compatibility part entries = get_hub_cached_entries(model_id, "inference") - logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision) + logger.debug( + "Found %d cached entries for model %s, revision %s", + len(entries), + model_id, + revision, + ) all_compatible = [] for entry in entries: - if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True): + if check_env_and_neuron_config_compatibility( + entry, check_compiler_version=True + ): all_compatible.append(entry) if not all_compatible: @@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op return entry -def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool: +def check_env_and_neuron_config_compatibility( + neuron_config: Dict[str, Any], check_compiler_version: bool +) -> bool: logger.debug( "Checking the provided neuron config %s is compatible with the local setup and provided environment", neuron_config, @@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che # Local setup compat checks if neuron_config["num_cores"] > available_cores: - logger.debug("Not enough neuron cores available to run the provided neuron config") + logger.debug( + "Not enough neuron cores available to run the provided neuron config" + ) return False - if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version: + if ( + check_compiler_version + and neuron_config["compiler_version"] != neuronxcc_version + ): logger.debug( "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)", neuronxcc_version, @@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che ) return False - max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))) + max_input_tokens = int( + os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + ) if max_input_tokens > 0: sequence_length = neuron_config["sequence_length"] if max_input_tokens >= sequence_length: @@ -191,7 +228,10 @@ def main(): if not os.getenv(env_var): break else: - logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars) + logger.info( + "All env vars %s already set, skipping, user know what they are doing", + env_vars, + ) sys.exit(0) cache_dir = constants.HF_HUB_CACHE @@ -201,7 +241,9 @@ def main(): config = AutoConfig.from_pretrained(args.model_id, revision=args.revision) neuron_config = getattr(config, "neuron", None) if neuron_config is not None: - compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False) + compatible = check_env_and_neuron_config_compatibility( + neuron_config, check_compiler_version=False + ) if not compatible: env_dict = get_env_dict() msg = ( @@ -213,9 +255,9 @@ def main(): neuron_config = lookup_compatible_cached_model(args.model_id, args.revision) if not neuron_config: - msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format( - get_env_dict(), available_cores, neuronxcc_version - ) + msg = ( + "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}" + ).format(get_env_dict(), available_cores, neuronxcc_version) logger.error(msg) raise Exception(msg) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index ed27e0589..e04510521 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items): def skip_release(item): if "release" in item.keywords: item.add_marker(pytest.mark.skip(reason="need --release option to run")) + selectors.append(skip_release) if config.getoption("--neuron"): + def skip_not_neuron(item): if "neuron" not in item.keywords: - item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option")) + item.add_marker( + pytest.mark.skip(reason="incompatible with --neuron option") + ) + selectors.append(skip_not_neuron) else: + def skip_neuron(item): if "neuron" in item.keywords: item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) + selectors.append(skip_neuron) for item in items: for selector in selectors: diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py index 10f1c84eb..3345e2ea2 100644 --- a/integration-tests/fixtures/neuron/model.py +++ b/integration-tests/fixtures/neuron/model.py @@ -30,44 +30,74 @@ logger = logging.getLogger(__file__) MODEL_CONFIGURATIONS = { "gpt2": { "model_id": "gpt2", - "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 1024, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "llama": { "model_id": "unsloth/Llama-3.2-1B-Instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 2048, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "mistral": { "model_id": "optimum/mistral-1.1b-testing", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, "qwen2": { "model_id": "Qwen/Qwen2.5-0.5B", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "granite": { "model_id": "ibm-granite/granite-3.1-2b-instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, } def get_neuron_backend_hash(): import subprocess - res = subprocess.run(["git", "rev-parse", "--show-toplevel"], - capture_output=True, - text=True) - root_dir = res.stdout.split('\n')[0] + + res = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True + ) + root_dir = res.stdout.split("\n")[0] + def get_sha(path): - res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"], - capture_output=True, - text=True) + res = subprocess.run( + ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"], + capture_output=True, + text=True, + ) # Output of the command is in the form '040000 tree|blob \t\n' - sha = res.stdout.split('\t')[0].split(' ')[-1] + sha = res.stdout.split("\t")[0].split(" ")[-1] return sha.encode() + # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that m = hashlib.sha256() - m.update(get_sha('backends/neuron')) - m.update(get_sha('Dockerfile.neuron')) + m.update(get_sha("backends/neuron")) + m.update(get_sha("Dockerfile.neuron")) return m.hexdigest()[:10] @@ -81,7 +111,9 @@ def get_tgi_docker_image(): client = docker.from_env() images = client.images.list(filters={"reference": "text-generation-inference"}) if not images: - raise ValueError("No text-generation-inference image found on this host to run tests.") + raise ValueError( + "No text-generation-inference image found on this host to run tests." + ) docker_image = images[0].tags[0] return docker_image @@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name): with tempfile.TemporaryDirectory() as context_dir: # Create entrypoint model_path = "/data/neuron_model" - export_command = f"optimum-cli export neuron -m {model_id} --task text-generation" + export_command = ( + f"optimum-cli export neuron -m {model_id} --task text-generation" + ) for kwarg, value in export_kwargs.items(): export_command += f" --{kwarg} {str(value)}" export_command += f" {model_path}" @@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name): with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: f.write(docker_content.encode("utf-8")) f.flush() - image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image) + image, logs = client.images.build( + path=context_dir, dockerfile=f.name, tag=export_image + ) logger.info("Successfully built image %s", image.id) logger.debug("Build logs %s", logs) diff --git a/integration-tests/fixtures/neuron/service.py b/integration-tests/fixtures/neuron/service.py index 927a35af4..39241d6f5 100644 --- a/integration-tests/fixtures/neuron/service.py +++ b/integration-tests/fixtures/neuron/service.py @@ -27,7 +27,9 @@ def get_tgi_docker_image(): client = docker.from_env() images = client.images.list(filters={"reference": "text-generation-inference"}) if not images: - raise ValueError("No text-generation-inference image found on this host to run tests.") + raise ValueError( + "No text-generation-inference image found on this host to run tests." + ) docker_image = images[0].tags[0] return docker_image @@ -131,13 +133,21 @@ def neuron_launcher(event_loop): except NotFound: pass - env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID} + env = { + "LOG_LEVEL": "info,text_generation_router=debug", + "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID, + } if HF_TOKEN is not None: env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN env["HF_TOKEN"] = HF_TOKEN - for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: + for var in [ + "MAX_BATCH_SIZE", + "MAX_TOTAL_TOKENS", + "HF_AUTO_CAST_TYPE", + "HF_NUM_CORES", + ]: if var in os.environ: env[var] = os.environ[var] @@ -165,7 +175,9 @@ def neuron_launcher(event_loop): with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: f.write(docker_content.encode("utf-8")) f.flush() - image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image) + image, logs = client.images.build( + path=context_dir, dockerfile=f.name, tag=test_image + ) logger.info("Successfully built image %s", image.id) logger.debug("Build logs %s", logs) else: @@ -204,7 +216,9 @@ def neuron_launcher(event_loop): try: container.remove(force=True) except Exception as e: - logger.error("Error while removing container %s, skipping", container_name) + logger.error( + "Error while removing container %s, skipping", container_name + ) logger.exception(e) # Cleanup the build image @@ -243,7 +257,12 @@ def neuron_generate_load(): client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int ) -> List[TextGenerationOutput]: futures = [ - client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True) + client.text_generation( + prompt, + max_new_tokens=max_new_tokens, + details=True, + decoder_input_details=True, + ) for _ in range(n) ] diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/integration/test_generate.py index 96f09838b..6a1b49904 100644 --- a/integration-tests/neuron/integration/test_generate.py +++ b/integration-tests/neuron/integration/test_generate.py @@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service): # Greedy bounded with input response = await tgi_service.client.text_generation( - "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True + "What is Deep Learning?", + max_new_tokens=17, + return_full_text=True, + details=True, + decoder_input_details=True, ) assert response.details.generated_tokens == 17 assert response.generated_text == prompt + greedy_expectations[service_name] diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/integration/test_implicit_env.py index cf0c80620..7eee7b337 100644 --- a/integration-tests/neuron/integration/test_implicit_env.py +++ b/integration-tests/neuron/integration/test_implicit_env.py @@ -1,7 +1,6 @@ import os import pytest -from huggingface_hub.errors import ValidationError @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])