Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
fix: run linters and fix formatting (#3057)
Parent commit: d7a24c03cf
This commit: b0069e0485
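Every hunk in this commit is a mechanical re-formatting pass: a call, dict, or list whose single-line form exceeded the formatter's line-length limit is re-wrapped so each argument or element sits on its own line, and redundant parentheses and inconsistent quotes are normalized. Behavior is unchanged. As an illustrative sketch only (this snippet is not part of the diff; it simply mirrors the logging pattern that recurs throughout it, using the standard logging module to stay self-contained), the transformation looks like this:

import logging

logger = logging.getLogger(__name__)

# Before: one long call that overflows the formatter's line-length limit.
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")

# After: the same call re-wrapped by the formatter; only the layout changes,
# the emitted log message is identical.
logger.warning(
    "'trust_remote_code' argument is not supported and will be ignored."
)

The affected hunks follow.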
@@ -61,7 +61,9 @@ def serve(
     )

     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )

     # Import here after the logger is added to log potential import exceptions
     from .server import serve
@@ -99,7 +101,9 @@ def download_weights(
     if extension is not None:
         logger.warning("'extension' argument is not supported and will be ignored.")
     if trust_remote_code is not None:
-        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
+        logger.warning(
+            "'trust_remote_code' argument is not supported and will be ignored."
+        )
     if auto_convert is not None:
         logger.warning("'auto_convert' argument is not supported and will be ignored.")
     if merge_lora is not None:
@@ -146,7 +146,9 @@ class Slot:
     def generated_tokens(self) -> int:
         return self._generated_tokens

-    def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
+    def assign(
+        self, batch_id: int, request: Request, generation_config: GenerationConfig
+    ):
         """Assign a request to a slot.

         Args:
@@ -174,15 +176,24 @@ class Slot:
         if request.parameters.typical_p != 0:
             self._generation_config.typical_p = request.parameters.typical_p
         if request.parameters.repetition_penalty != 0:
-            self._generation_config.repetition_penalty = request.parameters.repetition_penalty
+            self._generation_config.repetition_penalty = (
+                request.parameters.repetition_penalty
+            )
         self.seed = request.parameters.seed
-        self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
+        self._generation_config.max_new_tokens = (
+            request.stopping_parameters.max_new_tokens
+        )
         self._max_new_tokens = self._generation_config.max_new_tokens
         stop_strings = request.stopping_parameters.stop_sequences
         if stop_strings:
             self._generation_config.stop_strings = stop_strings

-    def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
+    def reset(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        selector: TokenSelector,
+    ):
         """Reset the slot for the next generation.

         Args:
@@ -210,7 +221,9 @@ class Slot:
         self._generated_tokens -= 1
         # Since generated tokens are now part of the prefill, we need to reevaluate
         # max_new_tokens for the next generation
-        self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
+        self._generation_config.max_new_tokens = (
+            self._max_new_tokens - self._generated_tokens
+        )
         self._state = Slot.State.PAUSE

     def resume(self):
@@ -223,7 +236,9 @@ class Slot:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
         # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
-        new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
+        new_text = self._tokenizer.decode(
+            self._tokens[self._next_text_token_start :], skip_special_tokens=False
+        )
         if new_text.endswith("�"):
             # utf-8 char at the end means it's a potential unfinished byte sequence
             # from byte fallback tokenization.
@@ -267,7 +282,9 @@ class Slot:
         self._next_text = next_text
         return next_text

-    def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
+    def select(
+        self, input_ids: torch.LongTensor, logits: torch.Tensor
+    ) -> torch.LongTensor:
         """Select the next token from the candidate logits.

         Args:
@@ -384,7 +401,9 @@ class NeuronGenerator(Generator):
                 f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
             )
         # Assign each request to an empty slot
-        logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
+        logger.debug(
+            f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)"
+        )
         new_slots = []
         for request in batch.requests:
             slot = empty_slots.pop()
@@ -417,7 +436,11 @@ class NeuronGenerator(Generator):
                 max_length = slot.truncate
         # Tokenize with padding and truncation
         padded_inputs = self.tokenizer(
-            inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_length,
         )
         input_ids = padded_inputs.input_ids
         attention_mask = padded_inputs.attention_mask
@@ -450,9 +473,13 @@ class NeuronGenerator(Generator):
             slot.reset(slot_input_ids, slot_attention_mask, selector)
         # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
         # as they have already been generated and sent back in the last decode.
-        model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_prefill(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
-        generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
+        generation, next_batch = self._generate_token(
+            prefill_slots, self.batch_id, logits, input_ids
+        )
         self.batch_id += 1
         # Reactivate previously active slots for the next decode
         for i, slot in enumerate(active_slots):
@@ -462,10 +489,14 @@ class NeuronGenerator(Generator):
             slot.append(next_tokens[i])
         logger.debug("Model ready for decoding")
         if next_batch is not None:
-            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
+            logger.debug(
+                f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}"
+            )
         return generation, next_batch

-    def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
+    def decode(
+        self, batches: List[CachedBatch]
+    ) -> Tuple[List[Generation], CachedBatch]:
         """Decode the specified prefilled requests.

         Args:
@@ -491,10 +522,14 @@ class NeuronGenerator(Generator):
                 cleared_request_ids.append(slot.request_id)
                 slot.clear()
         if len(cleared_request_ids) > 0:
-            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
+            logger.info(
+                f"Clearing slot for requests {cleared_request_ids} as they are not requested."
+            )
         active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
         if len(active_slots) < len(request_ids):
-            raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
+            raise ValueError(
+                "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
+            )
         if self.model.continuous_batching:
             decode_slots = active_slots
             seq_ids = torch.tensor([slot.id for slot in decode_slots])
@@ -503,7 +538,9 @@ class NeuronGenerator(Generator):
             seq_ids = None
         # Reconstruct input_ids and attention_mask from decode slots
         n_slots = len(decode_slots)
-        input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
+        input_ids = torch.full(
+            [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64
+        )
         max_length = 0
         for slot in decode_slots:
             max_length = max(max_length, slot.attention_mask.size(-1))
@@ -513,12 +550,18 @@ class NeuronGenerator(Generator):
             # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
             input_ids[i, 0] = slot.next_token
             attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
-        model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
+        model_inputs = self.model.prepare_inputs_for_decode(
+            input_ids, attention_mask, seq_ids
+        )
         logits = self.model(**model_inputs)[0]
         return self._generate_token(decode_slots, next_batch_id, logits, input_ids)

     def _generate_token(
-        self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
+        self,
+        slots: List[Slot],
+        next_batch_id: int,
+        logits: torch.Tensor,
+        input_ids: torch.LongTensor,
     ) -> Tuple[List[Generation], CachedBatch]:
         generations = []
         active_slots = False
@@ -542,9 +585,13 @@ class NeuronGenerator(Generator):
             if finish_reason is not None:
                 # We must include the generated text for each finished sequence in the response
                 generated_text = GeneratedText(
-                    text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
+                    text=slot.generated_text,
+                    generated_tokens=slot.generated_tokens,
+                    finish_reason=finish_reason,
                 )
-                logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
+                logger.debug(
+                    f"Decode complete for request {request_id} with {slot.generated_tokens} tokens"
+                )
                 # mark the slot as available
                 slot.clear()
             else:
@@ -565,7 +612,9 @@ class NeuronGenerator(Generator):
         batch = None
         if active_slots:
             # Whatever initial batch these requests came from, we always return all pending requests in a single batch
-            request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
+            request_ids = [
+                slot.request_id for slot in self.slots if slot.state == Slot.State.READY
+            ]
             batch = self._cached_batch(next_batch_id, request_ids)
         else:
             logger.debug("No more pending requests")
@@ -574,7 +623,9 @@ class NeuronGenerator(Generator):
     def _cached_batch(self, batch_id: int, request_ids: List):
         size = len(request_ids)
         max_tokens = size * self.model.max_length
-        return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)
+        return CachedBatch(
+            id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
+        )

     def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
         """Remove requests that are not listed from the specified batch
@@ -588,7 +639,9 @@ class NeuronGenerator(Generator):
         Return:
             A `CachedBatch` containing the pending requests.
         """
-        keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
+        keep_slot_ids = [
+            slot.id for slot in self.slots if slot.request_id in keep_request_ids
+        ]
         self._clear(keep_slot_ids)
         return self._cached_batch(batch_id, keep_request_ids)

@@ -625,11 +678,19 @@ class NeuronGenerator(Generator):
             export_kwargs = get_export_kwargs_from_env()
             logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
             model = NeuronModelForCausalLM.from_pretrained(
-                model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
+                model_id,
+                revision=revision,
+                low_cpu_mem_usage=True,
+                export=True,
+                **export_kwargs,
             )
         else:
-            logger.info("Loading model on neuron devices (this can take a few minutes).")
-            model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
+            logger.info(
+                "Loading model on neuron devices (this can take a few minutes)."
+            )
+            model = NeuronModelForCausalLM.from_pretrained(
+                model_id, low_cpu_mem_usage=True, revision=revision
+            )
         end = time.time()
         logger.info(f"Model successfully loaded in {end - start:.2f} s.")
         tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
@@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor):
         logger.exception(f"Method {method_name} encountered an error.")

         await context.abort_with_status(
-            rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
+            rpc_status.to_status(
+                status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
+            )
         )
@@ -56,7 +56,9 @@ def log_cache_size():
     if os.path.exists(path):
         usage = shutil.disk_usage(path)
         gb = 2**30
-        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
+        logger.info(
+            f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G"
+        )
     else:
         raise ValueError(f"The cache directory ({path}) does not exist.")

@@ -79,7 +81,9 @@ def fetch_model(
     if not os.path.isdir("/sys/class/neuron_device/"):
         raise SystemError("No neuron cores detected on the host.")
     if os.path.isdir(model_id) and revision is not None:
-        logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
+        logger.warning(
+            "Revision {} ignored for local model at {}".format(revision, model_id)
+        )
         revision = None
     # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
    # Note that the model may already be present in the cache.
@@ -89,12 +93,16 @@ def fetch_model(
        if os.path.isdir(model_id):
            return model_id
        # Prefetch the neuron model from the Hub
-        logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
+        logger.info(
+            f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}"
+        )
        log_cache_size()
        return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
    # Model needs to be exported: look for compatible cached entries on the hub
    export_kwargs = get_export_kwargs_from_env()
-    export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
+    export_config = NeuronModelForCausalLM.get_export_config(
+        model_id, config, revision=revision, **export_kwargs
+    )
    neuron_config = export_config.neuron
    if not is_cached(model_id, neuron_config):
        hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
@@ -105,7 +113,9 @@ def fetch_model(
            f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
        )
        raise ValueError(error_msg)
-    logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
+    logger.warning(
+        f"{model_id} is not a neuron model: it will be exported using cached artifacts."
+    )
    if os.path.isdir(model_id):
        return model_id
    # Prefetch weights, tokenizer and generation config so that they are in cache
backends/neuron/tests/fixtures/model.py (vendored): 49 changed lines
@@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }


 def get_hub_neuron_model_id(config_name: str):
-    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    return (
+        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+    )


 def export_model(model_id, export_kwargs, neuron_model_path):
-    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    export_command = [
+        "optimum-cli",
+        "export",
+        "neuron",
+        "-m",
+        model_id,
+        "--task",
+        "text-generation",
+    ]
     for kwarg, value in export_kwargs.items():
         export_command.append(f"--{kwarg}")
         export_command.append(str(value))
@@ -1,5 +1,3 @@
-import os
-
 from argparse import ArgumentParser
 from huggingface_hub import HfApi

@@ -15,7 +13,7 @@ def main():
             delete = True
         else:
             answer = input(f"Do you want to delete {model.id} [y/N] ?")
-            delete = (answer == "y")
+            delete = answer == "y"
         if delete:
             api.delete_repo(model.id)
             print(f"Deleted {model.id}.")
@@ -29,22 +29,42 @@ def create_request(
     )
     stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens)
     return Request(
-        id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters
+        id=id,
+        inputs=inputs,
+        truncate=truncate,
+        parameters=parameters,
+        stopping_parameters=stopping_parameters,
     )


-def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
+def check_prefill(
+    input_text,
+    expected_token_id,
+    expected_token_text,
+    do_sample,
+    batch_size,
+    model_path,
+):
     """Verify that a prefill for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
     assert generator.model.batch_size >= batch_size
     requests = []
     max_new_tokens = 20
     for i in range(batch_size):
-        requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=0,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     batch_size * (len(input_text) + max_new_tokens)
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path):
     assert tokens.texts == [expected_token_text]


-def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path):
+def check_decode_single(
+    input_text, max_new_tokens, generated_text, do_sample, model_path
+):
     """Verify that a decoding for a single request generates the expected output."""
     generator = NeuronGenerator.from_pretrained(model_path)
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
@@ -16,9 +16,13 @@ def test_decode(neuron_model_config):


 def _test_decode(config_name, generator, do_sample):
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     max_new_tokens = 20
-    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
+    request = create_request(
+        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+    )
     max_length = generator.model.max_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
     generations, next_batch = generator.prefill(batch)
@@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text):
     slot.assign(0, request, GenerationConfig())
     assert slot.cached_text == input_text

-    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
+    inputs = tokenizer(
+        input_text,
+        padding="max_length",
+        max_length=len(input_text) + 1,
+        return_tensors="pt",
+    )
     input_ids = inputs["input_ids"][0]
     attention_mask = inputs["attention_mask"][0]
     generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]
@@ -21,12 +21,23 @@ def test_prefill(neuron_model_config):
 def _test_prefill(config_name, generator, batch_size, do_sample):
     requests = []
     max_new_tokens = 20
-    input_text = "It was a bright cold day in April, and the clocks were striking thirteen."
+    input_text = (
+        "It was a bright cold day in April, and the clocks were striking thirteen."
+    )
     for i in range(batch_size):
-        requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens))
+        requests.append(
+            create_request(
+                id=i,
+                inputs=input_text,
+                do_sample=do_sample,
+                max_new_tokens=max_new_tokens,
+            )
+        )
     # Let's be pessimistic when estimating max_tokens
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, next_batch = generator.prefill(batch)
     assert next_batch.size == batch_size
     # Whatever was passed as max_tokens, the server will correct it
@@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config):
     for i in range(batch_size):
         requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
     max_length = generator.model.max_length
-    batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length)
+    batch = Batch(
+        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
+    )
     generations, _ = generator.prefill(batch)
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
@@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version

 logger = logging.getLogger(__name__)

-tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
+tgi_router_env_vars = [
+    "MAX_BATCH_SIZE",
+    "MAX_TOTAL_TOKENS",
+    "MAX_INPUT_TOKENS",
+    "MAX_BATCH_PREFILL_TOKENS",
+]
 tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]

 env_config_peering = [
@@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
         argv = sys.argv
     # All these are params passed to tgi and intercepted here
     parser.add_argument(
-        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+        "--max-input-tokens",
+        type=int,
+        default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)),
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)
+    )
+    parser.add_argument(
+        "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)
+    )
+    parser.add_argument(
+        "--max-batch-prefill-tokens",
+        type=int,
+        default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0),
     )
-    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
-    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
-    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
     parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
     parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

     args = parser.parse_known_args(argv)[0]

     if not args.model_id:
-        raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var")
+        raise Exception(
+            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
+        )

     # Override env with cmdline params
     os.environ["MODEL_ID"] = args.model_id
@@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config):
         f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
         max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
         if not max_batch_prefill_tokens:
-            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
+            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(
+                max_input_tokens
+            )
         f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))


@@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary):
     return -dictionary["num_cores"], -dictionary["batch_size"]


-def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
+def lookup_compatible_cached_model(
+    model_id: str, revision: Optional[str]
+) -> Optional[Dict[str, Any]]:
     # Reuse the same mechanic as the one in use to configure the tgi server part
     # The only difference here is that we stay as flexible as possible on the compatibility part
     entries = get_hub_cached_entries(model_id, "inference")

-    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)
+    logger.debug(
+        "Found %d cached entries for model %s, revision %s",
+        len(entries),
+        model_id,
+        revision,
+    )

     all_compatible = []
     for entry in entries:
-        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
+        if check_env_and_neuron_config_compatibility(
+            entry, check_compiler_version=True
+        ):
             all_compatible.append(entry)

     if not all_compatible:
@@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
             return entry


-def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
+def check_env_and_neuron_config_compatibility(
+    neuron_config: Dict[str, Any], check_compiler_version: bool
+) -> bool:
     logger.debug(
         "Checking the provided neuron config %s is compatible with the local setup and provided environment",
         neuron_config,
@@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:

     # Local setup compat checks
     if neuron_config["num_cores"] > available_cores:
-        logger.debug("Not enough neuron cores available to run the provided neuron config")
+        logger.debug(
+            "Not enough neuron cores available to run the provided neuron config"
+        )
         return False

-    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
+    if (
+        check_compiler_version
+        and neuron_config["compiler_version"] != neuronxcc_version
+    ):
         logger.debug(
             "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
             neuronxcc_version,
@@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
         )
         return False

-    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
+    max_input_tokens = int(
+        os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+    )
     if max_input_tokens > 0:
         sequence_length = neuron_config["sequence_length"]
         if max_input_tokens >= sequence_length:
@@ -191,7 +228,10 @@ def main():
         if not os.getenv(env_var):
             break
     else:
-        logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars)
+        logger.info(
+            "All env vars %s already set, skipping, user know what they are doing",
+            env_vars,
+        )
         sys.exit(0)

     cache_dir = constants.HF_HUB_CACHE
@@ -201,7 +241,9 @@ def main():
     config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
     neuron_config = getattr(config, "neuron", None)
     if neuron_config is not None:
-        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
+        compatible = check_env_and_neuron_config_compatibility(
+            neuron_config, check_compiler_version=False
+        )
         if not compatible:
             env_dict = get_env_dict()
             msg = (
@@ -213,9 +255,9 @@ def main():
         neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

     if not neuron_config:
-        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
-            get_env_dict(), available_cores, neuronxcc_version
-        )
+        msg = (
+            "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
+        ).format(get_env_dict(), available_cores, neuronxcc_version)
         logger.error(msg)
         raise Exception(msg)

@@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items):
     def skip_release(item):
         if "release" in item.keywords:
             item.add_marker(pytest.mark.skip(reason="need --release option to run"))

     selectors.append(skip_release)
     if config.getoption("--neuron"):

         def skip_not_neuron(item):
             if "neuron" not in item.keywords:
-                item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option"))
+                item.add_marker(
+                    pytest.mark.skip(reason="incompatible with --neuron option")
+                )

         selectors.append(skip_not_neuron)
     else:

         def skip_neuron(item):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))

         selectors.append(skip_neuron)
     for item in items:
         for selector in selectors:
@@ -30,44 +30,74 @@ logger = logging.getLogger(__file__)
 MODEL_CONFIGURATIONS = {
     "gpt2": {
         "model_id": "gpt2",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 1024,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "llama": {
         "model_id": "unsloth/Llama-3.2-1B-Instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 2048,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
     "qwen2": {
         "model_id": "Qwen/Qwen2.5-0.5B",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "fp16",
+        },
     },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
-        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
     },
 }


 def get_neuron_backend_hash():
     import subprocess
-    res = subprocess.run(["git", "rev-parse", "--show-toplevel"],
-                         capture_output=True,
-                         text=True)
-    root_dir = res.stdout.split('\n')[0]
+    res = subprocess.run(
+        ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
+    )
+    root_dir = res.stdout.split("\n")[0]

     def get_sha(path):
-        res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
-                             capture_output=True,
-                             text=True)
+        res = subprocess.run(
+            ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
+            capture_output=True,
+            text=True,
+        )
         # Output of the command is in the form '040000 tree|blob <SHA>\t<path>\n'
-        sha = res.stdout.split('\t')[0].split(' ')[-1]
+        sha = res.stdout.split("\t")[0].split(" ")[-1]
         return sha.encode()

     # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that
     m = hashlib.sha256()
-    m.update(get_sha('backends/neuron'))
-    m.update(get_sha('Dockerfile.neuron'))
+    m.update(get_sha("backends/neuron"))
+    m.update(get_sha("Dockerfile.neuron"))
     return m.hexdigest()[:10]

@@ -81,7 +111,9 @@ def get_tgi_docker_image():
     client = docker.from_env()
     images = client.images.list(filters={"reference": "text-generation-inference"})
     if not images:
-        raise ValueError("No text-generation-inference image found on this host to run tests.")
+        raise ValueError(
+            "No text-generation-inference image found on this host to run tests."
+        )
     docker_image = images[0].tags[0]
     return docker_image

@@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name):
     with tempfile.TemporaryDirectory() as context_dir:
         # Create entrypoint
         model_path = "/data/neuron_model"
-        export_command = f"optimum-cli export neuron -m {model_id} --task text-generation"
+        export_command = (
+            f"optimum-cli export neuron -m {model_id} --task text-generation"
+        )
         for kwarg, value in export_kwargs.items():
             export_command += f" --{kwarg} {str(value)}"
         export_command += f" {model_path}"
@@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-            image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image)
+            image, logs = client.images.build(
+                path=context_dir, dockerfile=f.name, tag=export_image
+            )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)

@@ -27,7 +27,9 @@ def get_tgi_docker_image():
     client = docker.from_env()
     images = client.images.list(filters={"reference": "text-generation-inference"})
     if not images:
-        raise ValueError("No text-generation-inference image found on this host to run tests.")
+        raise ValueError(
+            "No text-generation-inference image found on this host to run tests."
+        )
     docker_image = images[0].tags[0]
     return docker_image

@@ -131,13 +133,21 @@ def neuron_launcher(event_loop):
         except NotFound:
             pass

-    env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID}
+    env = {
+        "LOG_LEVEL": "info,text_generation_router=debug",
+        "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID,
+    }

     if HF_TOKEN is not None:
         env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
         env["HF_TOKEN"] = HF_TOKEN

-    for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]:
+    for var in [
+        "MAX_BATCH_SIZE",
+        "MAX_TOTAL_TOKENS",
+        "HF_AUTO_CAST_TYPE",
+        "HF_NUM_CORES",
+    ]:
         if var in os.environ:
             env[var] = os.environ[var]

@@ -165,7 +175,9 @@ def neuron_launcher(event_loop):
         with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
             f.write(docker_content.encode("utf-8"))
             f.flush()
-            image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
+            image, logs = client.images.build(
+                path=context_dir, dockerfile=f.name, tag=test_image
+            )
         logger.info("Successfully built image %s", image.id)
         logger.debug("Build logs %s", logs)
     else:
@@ -204,7 +216,9 @@ def neuron_launcher(event_loop):
         try:
             container.remove(force=True)
         except Exception as e:
-            logger.error("Error while removing container %s, skipping", container_name)
+            logger.error(
+                "Error while removing container %s, skipping", container_name
+            )
             logger.exception(e)

     # Cleanup the build image
@@ -243,7 +257,12 @@ def neuron_generate_load():
         client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
     ) -> List[TextGenerationOutput]:
         futures = [
-            client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True)
+            client.text_generation(
+                prompt,
+                max_new_tokens=max_new_tokens,
+                details=True,
+                decoder_input_details=True,
+            )
             for _ in range(n)
         ]

@@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service):

     # Greedy bounded with input
     response = await tgi_service.client.text_generation(
-        "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True
+        "What is Deep Learning?",
+        max_new_tokens=17,
+        return_full_text=True,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 17
     assert response.generated_text == prompt + greedy_expectations[service_name]
@@ -1,7 +1,6 @@
 import os

 import pytest
-from huggingface_hub.errors import ValidationError


 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])