fix: run linters and fix formatting (#3057)

drbh 2025-02-25 16:11:34 -05:00 committed by GitHub
parent d7a24c03cf
commit b0069e0485
16 changed files with 366 additions and 103 deletions
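The reformatting throughout the diff below is consistent with Black's default 88-column style: long call arguments are split one per line with trailing commas, long assignments and string literals are wrapped in parentheses, and a couple of unused imports are dropped. A minimal sketch of reproducing such a check locally, assuming Black and Ruff are the tools behind the lint step (the repository may actually drive this through a Makefile target or pre-commit hooks, and the paths below are illustrative):

# Hypothetical helper: verify formatting and lint cleanliness without rewriting files.
# Assumes `black` and `ruff` are installed; the real lint entry point may differ.
import subprocess
import sys


def check_formatting(paths):
    # `black --check --diff` lists the reformatting it would apply, without writing it.
    black = subprocess.run(["black", "--check", "--diff", *paths])
    # `ruff check` flags lint issues such as the unused imports removed in this commit.
    ruff = subprocess.run(["ruff", "check", *paths])
    return black.returncode or ruff.returncode


if __name__ == "__main__":
    sys.exit(check_formatting(["backends/neuron", "integration-tests"]))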

View File

@@ -61,7 +61,9 @@ def serve(
     )
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
 
     # Import here after the logger is added to log potential import exceptions
     from .server import serve
@@ -99,7 +101,9 @@ def download_weights(
     if extension is not None:
         logger.warning("'extension' argument is not supported and will be ignored.")
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
     if auto_convert is not None:
         logger.warning("'auto_convert' argument is not supported and will be ignored.")
     if merge_lora is not None:

View File

@@ -146,7 +146,9 @@ class Slot:
     def generated_tokens(self) -> int:
         return self._generated_tokens
 
-    def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
+    def assign(
+        self, batch_id: int, request: Request, generation_config: GenerationConfig
+    ):
         """Assign a request to a slot.
 
         Args:
@@ -174,15 +176,24 @@ class Slot:
         if request.parameters.typical_p != 0:
             self._generation_config.typical_p = request.parameters.typical_p
         if request.parameters.repetition_penalty != 0:
-            self._generation_config.repetition_penalty = request.parameters.repetition_penalty
+            self._generation_config.repetition_penalty = (
+                request.parameters.repetition_penalty
+            )
         self.seed = request.parameters.seed
-        self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
+        self._generation_config.max_new_tokens = (
+            request.stopping_parameters.max_new_tokens
+        )
         self._max_new_tokens = self._generation_config.max_new_tokens
         stop_strings = request.stopping_parameters.stop_sequences
         if stop_strings:
             self._generation_config.stop_strings = stop_strings
 
-    def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
+    def reset(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        selector: TokenSelector,
+    ):
         """Reset the slot for the next generation.
 
         Args:
@@ -210,7 +221,9 @@ class Slot:
             self._generated_tokens -= 1
             # Since generated tokens are now part of the prefill, we need to reevaluate
            # max_new_tokens for the next generation
-            self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
+            self._generation_config.max_new_tokens = (
+                self._max_new_tokens - self._generated_tokens
+            )
         self._state = Slot.State.PAUSE
 
     def resume(self):
@@ -223,7 +236,9 @@ class Slot:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
         # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
+        new_text = self._tokenizer.decode(
+            self._tokens[self._next_text_token_start :], skip_special_tokens=False
+        )
         if new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
@@ -267,7 +282,9 @@ class Slot:
             self._next_text = next_text
         return next_text
 
-    def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
+    def select(
+        self, input_ids: torch.LongTensor, logits: torch.Tensor
+    ) -> torch.LongTensor:
         """Select the next token from the candidate logits.
 
         Args:
@@ -384,7 +401,9 @@ class NeuronGenerator(Generator):
                 f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
             )
         # Assign each request to an empty slot
-        logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
+        logger.debug(
+            f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)"
+        )
         new_slots = []
         for request in batch.requests:
             slot = empty_slots.pop()
@@ -417,7 +436,11 @@ class NeuronGenerator(Generator):
                 max_length = slot.truncate
         # Tokenize with padding and truncation
         padded_inputs = self.tokenizer(
-            inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_length,
         )
         input_ids = padded_inputs.input_ids
         attention_mask = padded_inputs.attention_mask
@@ -450,9 +473,13 @@ class NeuronGenerator(Generator):
             slot.reset(slot_input_ids, slot_attention_mask, selector)
         # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
         # as they have already been generated and sent back in the last decode.
-        model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_prefill(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
-        generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
+        generation, next_batch = self._generate_token(
+            prefill_slots, self.batch_id, logits, input_ids
+        )
         self.batch_id += 1
         # Reactivate previously active slots for the next decode
         for i, slot in enumerate(active_slots):
@@ -462,10 +489,14 @@ class NeuronGenerator(Generator):
                 slot.append(next_tokens[i])
         logger.debug("Model ready for decoding")
         if next_batch is not None:
-            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
+            logger.debug(
+                f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}"
+            )
         return generation, next_batch
 
-    def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
+    def decode(
+        self, batches: List[CachedBatch]
+    ) -> Tuple[List[Generation], CachedBatch]:
         """Decode the specified prefilled requests.
 
         Args:
@@ -491,10 +522,14 @@ class NeuronGenerator(Generator):
                 cleared_request_ids.append(slot.request_id)
                 slot.clear()
         if len(cleared_request_ids) > 0:
-            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
+            logger.info(
+                f"Clearing slot for requests {cleared_request_ids} as they are not requested."
+            )
         active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
         if len(active_slots) < len(request_ids):
-            raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
+            raise ValueError(
+                "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
+            )
         if self.model.continuous_batching:
             decode_slots = active_slots
             seq_ids = torch.tensor([slot.id for slot in decode_slots])
@@ -503,7 +538,9 @@ class NeuronGenerator(Generator):
             seq_ids = None
         # Reconstruct input_ids and attention_mask from decode slots
         n_slots = len(decode_slots)
-        input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
+        input_ids = torch.full(
+            [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64
+        )
         max_length = 0
         for slot in decode_slots:
             max_length = max(max_length, slot.attention_mask.size(-1))
@@ -513,12 +550,18 @@ class NeuronGenerator(Generator):
             # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
             input_ids[i, 0] = slot.next_token
             attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
-        model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_decode(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
         return self._generate_token(decode_slots, next_batch_id, logits, input_ids)
 
     def _generate_token(
-        self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
+        self,
+        slots: List[Slot],
+        next_batch_id: int,
+        logits: torch.Tensor,
+        input_ids: torch.LongTensor,
     ) -> Tuple[List[Generation], CachedBatch]:
         generations = []
         active_slots = False
@@ -542,9 +585,13 @@ class NeuronGenerator(Generator):
             if finish_reason is not None:
                 # We must include the generated text for each finished sequence in the response
                 generated_text = GeneratedText(
-                    text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
+                    text=slot.generated_text,
+                    generated_tokens=slot.generated_tokens,
+                    finish_reason=finish_reason,
                 )
-                logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
+                logger.debug(
+                    f"Decode complete for request {request_id} with {slot.generated_tokens} tokens"
+                )
                 # mark the slot as available
                 slot.clear()
             else:
@@ -565,7 +612,9 @@ class NeuronGenerator(Generator):
         batch = None
         if active_slots:
             # Whatever initial batch these requests came from, we always return all pending requests in a single batch
-            request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
+            request_ids = [
+                slot.request_id for slot in self.slots if slot.state == Slot.State.READY
+            ]
             batch = self._cached_batch(next_batch_id, request_ids)
         else:
             logger.debug("No more pending requests")
@@ -574,7 +623,9 @@ class NeuronGenerator(Generator):
     def _cached_batch(self, batch_id: int, request_ids: List):
         size = len(request_ids)
         max_tokens = size * self.model.max_length
-        return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)
+        return CachedBatch(
+            id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
+        )
 
     def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
         """Remove requests that are not listed from the specified batch
@@ -588,7 +639,9 @@ class NeuronGenerator(Generator):
         Return:
             A `CachedBatch` containing the pending requests.
         """
-        keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
+        keep_slot_ids = [
+            slot.id for slot in self.slots if slot.request_id in keep_request_ids
+        ]
         self._clear(keep_slot_ids)
         return self._cached_batch(batch_id, keep_request_ids)
@@ -625,11 +678,19 @@ class NeuronGenerator(Generator):
             export_kwargs = get_export_kwargs_from_env()
             logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
             model = NeuronModelForCausalLM.from_pretrained(
-                model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
+                model_id,
+                revision=revision,
+                low_cpu_mem_usage=True,
+                export=True,
+                **export_kwargs,
             )
         else:
-            logger.info("Loading model on neuron devices (this can take a few minutes).")
-            model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
+            logger.info(
+                "Loading model on neuron devices (this can take a few minutes)."
+            )
+            model = NeuronModelForCausalLM.from_pretrained(
+                model_id, low_cpu_mem_usage=True, revision=revision
+            )
         end = time.time()
         logger.info(f"Model successfully loaded in {end - start:.2f} s.")
         tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

View File

@@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             logger.exception(f"Method {method_name} encountered an error.")
 
             await context.abort_with_status(
-                rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
+                rpc_status.to_status(
+                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
+                )
             )

View File

@@ -56,7 +56,9 @@ def log_cache_size():
     if os.path.exists(path):
         usage = shutil.disk_usage(path)
         gb = 2**30
-        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
+        logger.info(
+            f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G"
+        )
     else:
         raise ValueError(f"The cache directory ({path}) does not exist.")
@@ -79,7 +81,9 @@ def fetch_model(
     if not os.path.isdir("/sys/class/neuron_device/"):
         raise SystemError("No neuron cores detected on the host.")
     if os.path.isdir(model_id) and revision is not None:
-        logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
+        logger.warning(
+            "Revision {} ignored for local model at {}".format(revision, model_id)
+        )
         revision = None
     # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
    # Note that the model may already be present in the cache.
@@ -89,12 +93,16 @@ def fetch_model(
         if os.path.isdir(model_id):
             return model_id
         # Prefetch the neuron model from the Hub
-        logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
+        logger.info(
+            f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}"
+        )
         log_cache_size()
         return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
     # Model needs to be exported: look for compatible cached entries on the hub
     export_kwargs = get_export_kwargs_from_env()
-    export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
+    export_config = NeuronModelForCausalLM.get_export_config(
+        model_id, config, revision=revision, **export_kwargs
+    )
     neuron_config = export_config.neuron
     if not is_cached(model_id, neuron_config):
         hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
@@ -105,7 +113,9 @@ def fetch_model(
             f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
         )
         raise ValueError(error_msg)
-    logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
+    logger.warning(
+        f"{model_id} is not a neuron model: it will be exported using cached artifacts."
+    )
     if os.path.isdir(model_id):
         return model_id
     # Prefetch weights, tokenizer and generation config so that they are in cache

View File

@@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }
 
 
 def get_hub_neuron_model_id(config_name: str):
-    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    return (
+        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    )
 
 
 def export_model(model_id, export_kwargs, neuron_model_path):
-    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    export_command = [
+        "optimum-cli",
+        "export",
+        "neuron",
+        "-m",
+        model_id,
+        "--task",
+        "text-generation",
+    ]
     for kwarg, value in export_kwargs.items():
         export_command.append(f"--{kwarg}")
         export_command.append(str(value))

View File

@@ -1,5 +1,3 @@
-import os
-
 from argparse import ArgumentParser
 
 from huggingface_hub import HfApi
@@ -15,7 +13,7 @@ def main():
             delete = True
         else:
             answer = input(f"Do you want to delete {model.id} [y/N] ?")
-            delete = (answer == "y")
+            delete = answer == "y"
         if delete:
             api.delete_repo(model.id)
             print(f"Deleted {model.id}.")

View File

@@ -29,22 +29,42 @@ def create_request(
     )
     stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens)
     return Request(
-        id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters
+        id=id,
+        inputs=inputs,
+        truncate=truncate,
+        parameters=parameters,
+        stopping_parameters=stopping_parameters,
     )
 
 
-def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
+def check_prefill(
+    input_text,
+    expected_token_id,
+    expected_token_text,
+    do_sample,
+    batch_size,
+    model_path,
+):
     """Verify that a prefill for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
     assert generator.model.batch_size >= batch_size
     requests = []
     max_new_tokens = 20
     for i in range(batch_size):
-        requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=0,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     batch_size * (len(input_text) + max_new_tokens)
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample,
     assert tokens.texts == [expected_token_text]
 
 
-def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path):
+def check_decode_single(
+    input_text, max_new_tokens, generated_text, do_sample, model_path
+):
     """Verify that a decoding for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)

View File

@@ -16,9 +16,13 @@ def test_decode(neuron_model_config):
 
 
 def _test_decode(config_name, generator, do_sample):
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     max_new_tokens = 20
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)

View File

@@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text):
     slot.assign(0, request, GenerationConfig())
     assert slot.cached_text == input_text
 
-    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
+    inputs = tokenizer(
+        input_text,
+        padding="max_length",
+        max_length=len(input_text) + 1,
+        return_tensors="pt",
+    )
     input_ids = inputs["input_ids"][0]
     attention_mask = inputs["attention_mask"][0]
     generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]

View File

@@ -21,12 +21,23 @@ def test_prefill(neuron_model_config):
 def _test_prefill(config_name, generator, batch_size, do_sample):
     requests = []
     max_new_tokens = 20
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     for i in range(batch_size):
-        requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=i,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config):
     for i in range(batch_size):
         requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, _ = generator.prefill(batch)
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation

View File

@@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version
 
 logger = logging.getLogger(__name__)
 
-tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
+tgi_router_env_vars = [
+    "MAX_BATCH_SIZE",
+    "MAX_TOTAL_TOKENS",
+    "MAX_INPUT_TOKENS",
+    "MAX_BATCH_PREFILL_TOKENS",
+]
 tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]
 
 env_config_peering = [
@@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
         argv = sys.argv
     # All these are params passed to tgi and intercepted here
     parser.add_argument(
-        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+        "--max-input-tokens",
+        type=int,
+        default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)),
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)
+    )
+    parser.add_argument(
+        "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)
+    )
+    parser.add_argument(
+        "--max-batch-prefill-tokens",
+        type=int,
+        default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0),
     )
-    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
-    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
-    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
     parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
     parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))
 
     args = parser.parse_known_args(argv)[0]
     if not args.model_id:
-        raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
+        raise Exception(
+            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
+        )
 
     # Override env with cmdline params
     os.environ["MODEL_ID"] = args.model_id
@@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config):
         f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
         max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
         if not max_batch_prefill_tokens:
-            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
+            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(
+                max_input_tokens
+            )
         f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))
@@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary):
     return -dictionary["num_cores"], -dictionary["batch_size"]
 
 
-def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
+def lookup_compatible_cached_model(
+    model_id: str, revision: Optional[str]
+) -> Optional[Dict[str, Any]]:
     # Reuse the same mechanic as the one in use to configure the tgi server part
     # The only difference here is that we stay as flexible as possible on the compatibility part
     entries = get_hub_cached_entries(model_id, "inference")
-    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)
+    logger.debug(
+        "Found %d cached entries for model %s, revision %s",
+        len(entries),
+        model_id,
+        revision,
+    )
     all_compatible = []
     for entry in entries:
-        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
+        if check_env_and_neuron_config_compatibility(
+            entry, check_compiler_version=True
+        ):
             all_compatible.append(entry)
 
     if not all_compatible:
@@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op
     return entry
 
 
-def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
+def check_env_and_neuron_config_compatibility(
+    neuron_config: Dict[str, Any], check_compiler_version: bool
+) -> bool:
     logger.debug(
         "Checking the provided neuron config %s is compatible with the local setup and provided environment",
         neuron_config,
@@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
 
     # Local setup compat checks
     if neuron_config["num_cores"] > available_cores:
-        logger.debug("Not enough neuron cores available to run the provided neuron config")
+        logger.debug(
+            "Not enough neuron cores available to run the provided neuron config"
+        )
         return False
-    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
+    if (
+        check_compiler_version
+        and neuron_config["compiler_version"] != neuronxcc_version
+    ):
         logger.debug(
             "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
             neuronxcc_version,
@@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
         )
         return False
 
-    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
+    max_input_tokens = int(
+        os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+    )
     if max_input_tokens > 0:
         sequence_length = neuron_config["sequence_length"]
         if max_input_tokens >= sequence_length:
@@ -191,7 +228,10 @@ def main():
         if not os.getenv(env_var):
             break
     else:
-        logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars)
+        logger.info(
+            "All env vars %s already set, skipping, user know what they are doing",
+            env_vars,
+        )
         sys.exit(0)
 
     cache_dir = constants.HF_HUB_CACHE
@@ -201,7 +241,9 @@ def main():
     config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
     neuron_config = getattr(config, "neuron", None)
     if neuron_config is not None:
-        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
+        compatible = check_env_and_neuron_config_compatibility(
+            neuron_config, check_compiler_version=False
+        )
         if not compatible:
             env_dict = get_env_dict()
             msg = (
@@ -213,9 +255,9 @@ def main():
         neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
 
     if not neuron_config:
-        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
-            get_env_dict(), available_cores, neuronxcc_version
-        )
+        msg = (
+            "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
+        ).format(get_env_dict(), available_cores, neuronxcc_version)
         logger.error(msg)
         raise Exception(msg)

View File

@@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items):
         def skip_release(item):
             if "release" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="need --release option to run"))
 
         selectors.append(skip_release)
     if config.getoption("--neuron"):
 
         def skip_not_neuron(item):
             if "neuron" not in item.keywords:
-                item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option"))
+                item.add_marker(
+                    pytest.mark.skip(reason="incompatible with --neuron option")
+                )
 
         selectors.append(skip_not_neuron)
     else:
 
         def skip_neuron(item):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
 
         selectors.append(skip_neuron)
     for item in items:
         for selector in selectors:

View File

@@ -30,44 +30,74 @@ logger = logging.getLogger(__file__)
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "unsloth/Llama-3.2-1B-Instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }
 
 
 def get_neuron_backend_hash():
     import subprocess
-    res = subprocess.run(["git", "rev-parse", "--show-toplevel"],
-                         capture_output=True,
-                         text=True)
-    root_dir = res.stdout.split('\n')[0]
+
+    res = subprocess.run(
+        ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
+    )
+    root_dir = res.stdout.split("\n")[0]
+
     def get_sha(path):
-        res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
-                             capture_output=True,
-                             text=True)
+        res = subprocess.run(
+            ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
+            capture_output=True,
+            text=True,
+        )
         # Output of the command is in the form '040000 tree|blob <SHA>\t<path>\n'
-        sha = res.stdout.split('\t')[0].split(' ')[-1]
+        sha = res.stdout.split("\t")[0].split(" ")[-1]
         return sha.encode()
 
     # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that
     m = hashlib.sha256()
-    m.update(get_sha('backends/neuron'))
-    m.update(get_sha('Dockerfile.neuron'))
+    m.update(get_sha("backends/neuron"))
+    m.update(get_sha("Dockerfile.neuron"))
     return m.hexdigest()[:10]
@@ -81,7 +111,9 @@ def get_tgi_docker_image():
         client = docker.from_env()
         images = client.images.list(filters={"reference": "text-generation-inference"})
         if not images:
-            raise ValueError("No text-generation-inference image found on this host to run tests.")
+            raise ValueError(
+                "No text-generation-inference image found on this host to run tests."
+            )
         docker_image = images[0].tags[0]
     return docker_image
@@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name):
     with tempfile.TemporaryDirectory() as context_dir:
         # Create entrypoint
         model_path = "/data/neuron_model"
-        export_command = f"optimum-cli export neuron -m {model_id} --task text-generation"
+        export_command = (
+            f"optimum-cli export neuron -m {model_id} --task text-generation"
+        )
         for kwarg, value in export_kwargs.items():
             export_command += f" --{kwarg} {str(value)}"
         export_command += f" {model_path}"
@@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-        image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image)
+        image, logs = client.images.build(
+            path=context_dir, dockerfile=f.name, tag=export_image
+        )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)

View File

@@ -27,7 +27,9 @@ def get_tgi_docker_image():
         client = docker.from_env()
         images = client.images.list(filters={"reference": "text-generation-inference"})
         if not images:
-            raise ValueError("No text-generation-inference image found on this host to run tests.")
+            raise ValueError(
+                "No text-generation-inference image found on this host to run tests."
+            )
         docker_image = images[0].tags[0]
     return docker_image
@@ -131,13 +133,21 @@ def neuron_launcher(event_loop):
         except NotFound:
             pass
 
-        env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID}
+        env = {
+            "LOG_LEVEL": "info,text_generation_router=debug",
+            "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID,
+        }
         if HF_TOKEN is not None:
             env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
             env["HF_TOKEN"] = HF_TOKEN
 
-        for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]:
+        for var in [
+            "MAX_BATCH_SIZE",
+            "MAX_TOTAL_TOKENS",
+            "HF_AUTO_CAST_TYPE",
+            "HF_NUM_CORES",
+        ]:
             if var in os.environ:
                 env[var] = os.environ[var]
@@ -165,7 +175,9 @@ def neuron_launcher(event_loop):
             with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
                 f.write(docker_content.encode("utf-8"))
                 f.flush()
-            image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
+            image, logs = client.images.build(
+                path=context_dir, dockerfile=f.name, tag=test_image
+            )
             logger.info("Successfully built image %s", image.id)
             logger.debug("Build logs %s", logs)
         else:
@@ -204,7 +216,9 @@ def neuron_launcher(event_loop):
             try:
                 container.remove(force=True)
             except Exception as e:
-                logger.error("Error while removing container %s, skipping", container_name)
+                logger.error(
+                    "Error while removing container %s, skipping", container_name
+                )
                 logger.exception(e)
 
         # Cleanup the build image
@@ -243,7 +257,12 @@ def neuron_generate_load():
         client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
     ) -> List[TextGenerationOutput]:
         futures = [
-            client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True)
+            client.text_generation(
+                prompt,
+                max_new_tokens=max_new_tokens,
+                details=True,
+                decoder_input_details=True,
+            )
             for _ in range(n)
         ]

View File

@@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service):
 
     # Greedy bounded with input
     response = await tgi_service.client.text_generation(
-        "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 17
     assert response.generated_text == prompt + greedy_expectations[service_name]

View File

@@ -1,7 +1,6 @@
 import os
 
 import pytest
-from huggingface_hub.errors import ValidationError
 
 
 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])