Bump neuron SDK version (#3260)

* chore(neuron): bump version to 0.2.0

* refactor(neuron): use named parameters in inputs helpers

This makes it possible to hide the differences between the two backends in
terms of input parameters.

* refactor(neuron): remove obsolete code paths

* fix(neuron): use neuron_config whenever possible

* fix(neuron): use new cache import path

* fix(neuron): neuron config is not stored in config anymore

* fix(nxd): adapt model retrieval to new APIs

* fix(generator): emulate greedy in sampling parameters

When on-device sampling is enabled, we need to emulate greedy decoding
through the sampling parameters, using top-k=1, top-p=1 and temperature=1
(see the sketch after this commit summary).

* test(neuron): update models and expectations

* feat(neuron): support on-device sampling

* fix(neuron): adapt entrypoint

* test(neuron): remove obsolete models

* fix(neuron): adjust test expectations for llama on nxd
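
The sketch below illustrates the greedy-emulation bullet above. It is a minimal, standalone example rather than the actual TGI Slot code: it assumes a transformers GenerationConfig-like object with do_sample, top_k, top_p and temperature fields, and builds the one-row-per-slot [top_k, top_p, temperature] tensor that on-device sampling consumes.

import torch
from transformers import GenerationConfig


def build_greedy_aware_sampling_params(configs: list) -> torch.Tensor:
    # One row per slot: [top_k, top_p, temperature].
    params = torch.zeros(len(configs), 3)
    for i, cfg in enumerate(configs):
        if not cfg.do_sample:
            # Emulate greedy decoding: keeping only the most likely token
            # (top_k=1) makes on-device sampling deterministic.
            cfg.top_k, cfg.top_p, cfg.temperature = 1, 1.0, 1.0
        params[i, 0] = cfg.top_k
        params[i, 1] = cfg.top_p
        params[i, 2] = cfg.temperature
    return params


# Example: one greedy request and one sampling request
greedy = GenerationConfig(do_sample=False)
sampled = GenerationConfig(do_sample=True, top_k=50, top_p=0.9, temperature=0.8)
print(build_greedy_aware_sampling_params([greedy, sampled]))
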
Authored by David Corvoysier on 2025-06-10 17:56:25 +02:00, committed by GitHub
parent 1ff9d185d5
commit 79183d1647
14 changed files with 393 additions and 264 deletions

View File

@ -5,7 +5,7 @@ RUN mkdir -p /tgi
# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.0.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
# Build cargo components (adapted from TGI original Dockerfile)
@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
# Install neuronx packages
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
aws-neuronx-dkms=2.19.64.0 \
aws-neuronx-collectives=2.23.135.0-3e70920f2 \
aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
aws-neuronx-tools=2.20.204.0 \
aws-neuronx-dkms=2.20.28.0 \
aws-neuronx-collectives=2.24.59.0-838c7fc8b \
aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
aws-neuronx-tools=2.22.61.0 \
libxml2 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
@ -125,11 +125,10 @@ RUN pip3 install \
--index-url https://download.pytorch.org/whl/cpu
RUN pip3 install \
neuronx-cc==2.16.372.0 \
torch-neuronx==2.5.1.2.4.0 \
transformers-neuronx==0.13.322 \
neuronx-distributed==0.10.1 \
libneuronxla==2.1.681.0 \
neuronx-cc==2.17.194.0 \
torch-neuronx==2.5.1.2.6.0 \
neuronx-distributed==0.11.0 \
libneuronxla==2.2.1630.0 \
--extra-index-url=https://pip.repos.neuron.amazonaws.com
# Install HuggingFace packages
@ -160,7 +159,7 @@ RUN pip install dist/text_generation_server*.tar.gz
# Final image
FROM neuron
COPY backends/neuron/tgi_env.py /tgi_env.py
COPY backends/neuron/tgi_entry_point.py /tgi_entry_point.py
COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

View File

@ -7,7 +7,8 @@ from typing import List, Optional, Tuple
import torch
from loguru import logger
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from optimum.neuron.configuration_utils import NeuronConfig
from transformers.generation import GenerationConfig
from optimum.neuron import NeuronModelForCausalLM
@ -175,6 +176,12 @@ class Slot:
self._generation_config.top_p = request.parameters.top_p
if request.parameters.typical_p != 0:
self._generation_config.typical_p = request.parameters.typical_p
else:
# Set the sampling parameters to emulate greedy decoding when using on-device sampling
self._generation_config.temperature = 1.0
self._generation_config.top_k = 1
self._generation_config.top_p = 1.0
self._generation_config.typical_p = 1.0
if request.parameters.repetition_penalty != 0:
self._generation_config.repetition_penalty = (
request.parameters.repetition_penalty
@ -211,19 +218,11 @@ class Slot:
self._mask = attention_mask.clone()
self._selector = selector
def pause(self, reset_on_pause: bool):
def pause(self):
"""Mark the current slot as paused for generation.
Note that the KV cache for this slot will still be filled.
"""
if reset_on_pause:
# Drop the last token as it will be added back when resuming the slot
self._generated_tokens -= 1
# Since generated tokens are now part of the prefill, we need to reevaluate
# max_new_tokens for the next generation
self._generation_config.max_new_tokens = (
self._max_new_tokens - self._generated_tokens
)
self._state = Slot.State.PAUSE
def resume(self):
@ -340,16 +339,27 @@ class NeuronGenerator(Generator):
tokenizer: PreTrainedTokenizerBase,
):
self.model = model
self.rebuild_cache_on_prefill = not self.model.continuous_batching
if not isinstance(self.model, NeuronModelForCausalLM):
raise ValueError("The model must be a NeuronModelForCausalLM.")
if not model.neuron_config.continuous_batching:
raise ValueError(
"The neuron model must be compiled with continuous_batching=True."
)
# Specify padding and truncation options for decoder-only architecture
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
self.tokenizer = tokenizer
self.special_tokens = self.tokenizer.all_special_ids
self.slots = [Slot(i, tokenizer) for i in range(self.model.batch_size)]
self.slots = [
Slot(i, tokenizer) for i in range(self.model.neuron_config.batch_size)
]
self.batch_id = 0
@property
def on_device_sampling(self) -> bool:
return getattr(self.model.neuron_config, "on_device_sampling", False)
@property
def info(self) -> InfoResponse:
"""Returns the expected InfoResponse."""
@ -371,14 +381,22 @@ class NeuronGenerator(Generator):
The maximum number of tokens the model supports.
"""
# Just check that the warmup request parameters match the model capacity
batch_size = self.model.batch_size
batch_size = self.model.neuron_config.batch_size
if len(batch.requests) > batch_size:
raise ValueError(
f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model.neuron_config.batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
)
self.prefill(batch)
self.clear()
return self.model.batch_size * self.model.max_length
return (
self.model.neuron_config.batch_size
* self.model.neuron_config.sequence_length
)
def max_prefill_length(self) -> int:
if hasattr(self.model.neuron_config, "max_context_length"):
return self.model.neuron_config.max_context_length
return self.model.neuron_config.sequence_length
def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
"""Prefill new requests.
@ -398,7 +416,7 @@ class NeuronGenerator(Generator):
if len(empty_slots) < len(batch.requests):
raise ValueError(
f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
f" Please align max_batch_size with the static batch size: {self.model.neuron_config.batch_size}."
)
# Assign each request to an empty slot
logger.debug(
@ -412,12 +430,6 @@ class NeuronGenerator(Generator):
logger.debug(
f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
)
if self.rebuild_cache_on_prefill:
# We will clear pending slots and prefill all slots
prefill_slots = self.slots
seq_ids = None
else:
# We only need to pass inputs for the new requests
prefill_slots = new_slots
seq_ids = torch.tensor([slot.id for slot in prefill_slots])
# Reconstruct the full inputs (without padding) as seen by the model.
@ -431,8 +443,10 @@ class NeuronGenerator(Generator):
inputs.append(slot.cached_text)
# Apply truncation, making sure we fit into static dimensions
if slot.truncate == 0:
max_length = self.model.max_length
elif slot.truncate > max_length and slot.truncate < self.model.max_length:
max_length = self.max_prefill_length()
elif (
slot.truncate > max_length and slot.truncate < self.max_prefill_length()
):
max_length = slot.truncate
# Tokenize with padding and truncation
padded_inputs = self.tokenizer(
@ -444,13 +458,12 @@ class NeuronGenerator(Generator):
)
input_ids = padded_inputs.input_ids
attention_mask = padded_inputs.attention_mask
sampling_params = (
torch.zeros(input_ids.shape[0], 3) if self.on_device_sampling else None
)
# Pause previously active slots during generation
next_tokens = []
for slot in active_slots:
slot.pause(reset_on_pause=self.rebuild_cache_on_prefill)
if self.rebuild_cache_on_prefill:
# The slot will be reset, so we need to store its next token
next_tokens.append(slot.next_token)
slot.pause()
# Each slot must be reset with the padded inputs and masks
for i, slot in enumerate(prefill_slots):
if slot.state != slot.state.EMPTY:
@ -464,29 +477,33 @@ class NeuronGenerator(Generator):
slot_input_ids,
slot.generation_config,
self.model,
self.model.max_length,
self.model.neuron_config.sequence_length,
tokenizer=self.tokenizer,
seed=slot.seed,
)
slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64)
slot_attention_mask = attention_mask[i]
slot.reset(slot_input_ids, slot_attention_mask, selector)
if sampling_params is not None:
sampling_params[i, 0] = slot.generation_config.top_k
sampling_params[i, 1] = slot.generation_config.top_p
sampling_params[i, 2] = slot.generation_config.temperature
# Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
# as they have already been generated and sent back in the last decode.
model_inputs = self.model.prepare_inputs_for_prefill(
input_ids, attention_mask, seq_ids
input_ids,
attention_mask=attention_mask,
seq_ids=seq_ids,
sampling_params=sampling_params,
)
logits = self.model(**model_inputs)[0]
tokens_or_logits = self.model(**model_inputs)[0]
generation, next_batch = self._generate_token(
prefill_slots, self.batch_id, logits, input_ids
prefill_slots, self.batch_id, tokens_or_logits, input_ids
)
self.batch_id += 1
# Reactivate previously active slots for the next decode
for i, slot in enumerate(active_slots):
slot.resume()
if self.rebuild_cache_on_prefill:
# Append back the next token
slot.append(next_tokens[i])
logger.debug("Model ready for decoding")
if next_batch is not None:
logger.debug(
@ -530,12 +547,8 @@ class NeuronGenerator(Generator):
raise ValueError(
"Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
)
if self.model.continuous_batching:
decode_slots = active_slots
seq_ids = torch.tensor([slot.id for slot in decode_slots])
else:
decode_slots = self.slots
seq_ids = None
# Reconstruct input_ids and attention_mask from decode slots
n_slots = len(decode_slots)
input_ids = torch.full(
@ -545,22 +558,32 @@ class NeuronGenerator(Generator):
for slot in decode_slots:
max_length = max(max_length, slot.attention_mask.size(-1))
attention_mask = torch.zeros([n_slots, max_length], dtype=torch.int64)
sampling_params = torch.zeros(n_slots, 3) if self.on_device_sampling else None
for i, slot in enumerate(decode_slots):
if slot.state != Slot.State.EMPTY:
# input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
input_ids[i, 0] = slot.next_token
attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
if sampling_params is not None:
sampling_params[i, 0] = slot.generation_config.top_k
sampling_params[i, 1] = slot.generation_config.top_p
sampling_params[i, 2] = slot.generation_config.temperature
model_inputs = self.model.prepare_inputs_for_decode(
input_ids, attention_mask, seq_ids
input_ids,
attention_mask=attention_mask,
seq_ids=seq_ids,
sampling_params=sampling_params,
)
tokens_or_logits = self.model(**model_inputs)[0]
return self._generate_token(
decode_slots, next_batch_id, tokens_or_logits, input_ids
)
logits = self.model(**model_inputs)[0]
return self._generate_token(decode_slots, next_batch_id, logits, input_ids)
def _generate_token(
self,
slots: List[Slot],
next_batch_id: int,
logits: torch.Tensor,
tokens_or_logits: torch.Tensor,
input_ids: torch.LongTensor,
) -> Tuple[List[Generation], CachedBatch]:
generations = []
@ -569,8 +592,11 @@ class NeuronGenerator(Generator):
if slot.state != Slot.State.READY:
continue
request_id = slot.request_id
next_token_logits = logits[i : i + 1, -1, :]
slot_input_ids = input_ids[i : i + 1, :]
if self.on_device_sampling:
next_token = tokens_or_logits[i]
else:
next_token_logits = tokens_or_logits[i : i + 1, -1, :]
next_token = slot.select(slot_input_ids, next_token_logits)
next_token_text = slot.append(next_token)
generated_text = None
@ -622,7 +648,7 @@ class NeuronGenerator(Generator):
def _cached_batch(self, batch_id: int, request_ids: List):
size = len(request_ids)
max_tokens = size * self.model.max_length
max_tokens = size * self.model.neuron_config.sequence_length
return CachedBatch(
id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
)
@ -671,8 +697,16 @@ class NeuronGenerator(Generator):
Returns:
A NeuronGenerator.
"""
config = AutoConfig.from_pretrained(model_id)
neuron_config = getattr(config, "neuron", None)
try:
neuron_config = NeuronConfig.from_pretrained(model_id, revision=revision)
except Exception as e:
logger.debug(
"NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
model_id,
revision,
e,
)
neuron_config = None
start = time.time()
if neuron_config is None:
export_kwargs = get_export_kwargs_from_env()
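
To summarize the on-device sampling contract introduced above, outside of the diff context: prepare_inputs_for_prefill and prepare_inputs_for_decode now accept an optional sampling_params tensor with one [top_k, top_p, temperature] row per slot, and when on_device_sampling is set in the neuron config the model forward already returns token ids instead of logits. The helper below is a simplified sketch of that dispatch, with the host-side selector passed in as a plain callable instead of the real TokenSelector; it is not the actual generator code.

import torch


def next_token_for_slot(on_device_sampling: bool, model_output: torch.Tensor,
                        i: int, select_fn, slot_input_ids: torch.Tensor) -> torch.Tensor:
    if on_device_sampling:
        # On-device path: the Neuron model already sampled, so model_output
        # holds one token id per slot.
        return model_output[i]
    # Host-side path: model_output holds logits of shape [batch, seq, vocab];
    # the slot's selector picks the next token from the last position.
    next_token_logits = model_output[i : i + 1, -1, :]
    return select_fn(slot_input_ids, next_token_logits)


# Greedy host-side selector for the demo
greedy_select = lambda input_ids, logits: torch.argmax(logits, dim=-1).squeeze()

logits = torch.randn(2, 5, 100)   # [batch, seq, vocab]
tokens = torch.tensor([7, 42])    # what an on-device-sampling model would return
print(next_token_for_slot(False, logits, 0, greedy_select, torch.zeros(1, 5, dtype=torch.int64)))
print(next_token_for_slot(True, tokens, 1, greedy_select, torch.zeros(1, 5, dtype=torch.int64)))
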

View File

@ -6,10 +6,12 @@ from typing import Optional
from huggingface_hub import snapshot_download
from huggingface_hub.constants import HF_HUB_CACHE
from loguru import logger
from transformers import AutoConfig
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.cache import get_hub_cached_entries
from optimum.neuron.configuration_utils import NeuronConfig
from .tgi_env import check_env_and_neuron_config_compatibility
def get_export_kwargs_from_env():
@ -24,7 +26,6 @@ def get_export_kwargs_from_env():
num_cores = int(num_cores)
auto_cast_type = os.environ.get("HF_AUTO_CAST_TYPE", None)
return {
"task": "text-generation",
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_cores": num_cores,
@ -32,20 +33,15 @@ def get_export_kwargs_from_env():
}
def is_cached(model_id, neuron_config):
def is_cached(model_id):
# Look for cached entries for the specified model
in_cache = False
entries = get_hub_cached_entries(model_id, "inference")
entries = get_hub_cached_entries(model_id)
# Look for compatible entries
for entry in entries:
compatible = True
for key, value in neuron_config.items():
# Only weights can be different
if key in ["checkpoint_id", "checkpoint_revision"]:
continue
if entry[key] != value:
compatible = False
if compatible:
if check_env_and_neuron_config_compatibility(
entry, check_compiler_version=True
):
in_cache = True
break
return in_cache
@ -87,8 +83,16 @@ def fetch_model(
revision = None
# Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
# Note that the model may already be present in the cache.
config = AutoConfig.from_pretrained(model_id, revision=revision)
neuron_config = getattr(config, "neuron", None)
try:
neuron_config = NeuronConfig.from_pretrained(model_id, revision=revision)
except Exception as e:
logger.debug(
"NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
model_id,
revision,
e,
)
neuron_config = None
if neuron_config is not None:
if os.path.isdir(model_id):
return model_id
@ -99,16 +103,11 @@ def fetch_model(
log_cache_size()
return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
# Model needs to be exported: look for compatible cached entries on the hub
export_kwargs = get_export_kwargs_from_env()
export_config = NeuronModelForCausalLM.get_export_config(
model_id, config, revision=revision, **export_kwargs
)
neuron_config = export_config.neuron
if not is_cached(model_id, neuron_config):
if not is_cached(model_id):
hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi"
error_msg = (
f"No cached version found for {model_id} with {neuron_config}."
f"No cached version found for {model_id} with {get_export_kwargs_from_env()}."
f"You can start a discussion to request it on {hub_cache_url}"
f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
)
@ -121,8 +120,10 @@ def fetch_model(
# Prefetch weights, tokenizer and generation config so that they are in cache
log_cache_size()
start = time.time()
snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
snapshot_path = snapshot_download(
model_id, revision=revision, ignore_patterns="*.bin"
)
end = time.time()
logger.info(f"Model weights fetched in {end - start:.2f} s.")
log_cache_size()
return model_id
return snapshot_path
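
As a usage note, the cache lookup no longer compares export kwargs against each hub entry; it reuses check_env_and_neuron_config_compatibility, so the TGI environment variables describe the deployment the cached entry must match. Below is a hedged usage sketch mirroring the integration test added later in this change; the environment variable values and the model id are examples only.

import os

from text_generation_server.model import fetch_model, is_cached

# Describe the target deployment through the TGI environment variables
os.environ["MAX_BATCH_SIZE"] = "4"
os.environ["MAX_TOTAL_TOKENS"] = "4096"
os.environ["HF_NUM_CORES"] = "2"
os.environ["HF_AUTO_CAST_TYPE"] = "bf16"

model_id = "unsloth/Llama-3.2-1B-Instruct"  # example model used in the tests
if is_cached(model_id):
    # fetch_model returns a local path (the snapshot path for hub models)
    local_path = fetch_model(model_id)
    print(f"Model available at {local_path}")
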

View File

@ -6,12 +6,11 @@ import os
import sys
from typing import Any, Dict, List, Optional
from huggingface_hub import constants
from transformers import AutoConfig
from optimum.neuron.modeling_decoder import get_available_cores
from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.cache import get_hub_cached_entries
from optimum.neuron.configuration_utils import NeuronConfig
from optimum.neuron.utils.version_utils import get_neuronxcc_version
from optimum.neuron.utils import map_torch_dtype
logger = logging.getLogger(__name__)
@ -24,15 +23,9 @@ tgi_router_env_vars = [
]
tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]
env_config_peering = [
("MAX_BATCH_SIZE", "batch_size"),
("MAX_TOTAL_TOKENS", "sequence_length"),
("HF_AUTO_CAST_TYPE", "auto_cast_type"),
("HF_NUM_CORES", "num_cores"),
]
# By the end of this script all env var should be specified properly
env_vars = tgi_server_env_vars + tgi_router_env_vars
tgi_env_vars = tgi_server_env_vars + tgi_router_env_vars
available_cores = get_available_cores()
neuronxcc_version = get_neuronxcc_version()
@ -93,9 +86,17 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
def neuron_config_to_env(neuron_config):
if isinstance(neuron_config, NeuronConfig):
neuron_config = neuron_config.to_dict()
with open(os.environ["ENV_FILEPATH"], "w") as f:
for env_var, config_key in env_config_peering:
f.write("export {}={}\n".format(env_var, neuron_config[config_key]))
f.write("export MAX_BATCH_SIZE={}\n".format(neuron_config["batch_size"]))
f.write("export MAX_TOTAL_TOKENS={}\n".format(neuron_config["sequence_length"]))
f.write("export HF_NUM_CORES={}\n".format(neuron_config["tp_degree"]))
config_key = (
"auto_cast_type" if "auto_cast_type" in neuron_config else "torch_dtype"
)
auto_cast_type = neuron_config[config_key]
f.write("export HF_AUTO_CAST_TYPE={}\n".format(auto_cast_type))
max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
if not max_input_tokens:
max_input_tokens = int(neuron_config["sequence_length"]) // 2
@ -111,7 +112,7 @@ def neuron_config_to_env(neuron_config):
def sort_neuron_configs(dictionary):
return -dictionary["num_cores"], -dictionary["batch_size"]
return -dictionary["tp_degree"], -dictionary["batch_size"]
def lookup_compatible_cached_model(
@ -119,7 +120,7 @@ def lookup_compatible_cached_model(
) -> Optional[Dict[str, Any]]:
# Reuse the same mechanic as the one in use to configure the tgi server part
# The only difference here is that we stay as flexible as possible on the compatibility part
entries = get_hub_cached_entries(model_id, "inference")
entries = get_hub_cached_entries(model_id)
logger.debug(
"Found %d cached entries for model %s, revision %s",
@ -155,15 +156,15 @@ def lookup_compatible_cached_model(
def check_env_and_neuron_config_compatibility(
neuron_config: Dict[str, Any], check_compiler_version: bool
neuron_config_dict: Dict[str, Any], check_compiler_version: bool
) -> bool:
logger.debug(
"Checking the provided neuron config %s is compatible with the local setup and provided environment",
neuron_config,
neuron_config_dict,
)
# Local setup compat checks
if neuron_config["num_cores"] > available_cores:
if neuron_config_dict["tp_degree"] > available_cores:
logger.debug(
"Not enough neuron cores available to run the provided neuron config"
)
@ -171,33 +172,65 @@ def check_env_and_neuron_config_compatibility(
if (
check_compiler_version
and neuron_config["compiler_version"] != neuronxcc_version
and neuron_config_dict["neuronxcc_version"] != neuronxcc_version
):
logger.debug(
"Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
neuronxcc_version,
neuron_config["compiler_version"],
neuron_config_dict["neuronxcc_version"],
)
return False
for env_var, config_key in env_config_peering:
neuron_config_value = str(neuron_config[config_key])
env_value = os.getenv(env_var, str(neuron_config_value))
batch_size = os.getenv("MAX_BATCH_SIZE", None)
if batch_size is not None and neuron_config_dict["batch_size"] < int(batch_size):
logger.debug(
"The provided MAX_BATCH_SIZE (%s) is higher than the neuron config batch size (%s)",
os.getenv("MAX_BATCH_SIZE"),
neuron_config_dict["batch_size"],
)
return False
max_total_tokens = os.getenv("MAX_TOTAL_TOKENS", None)
if max_total_tokens is not None and neuron_config_dict["sequence_length"] < int(
max_total_tokens
):
logger.debug(
"The provided MAX_TOTAL_TOKENS (%s) is higher than the neuron config sequence length (%s)",
max_total_tokens,
neuron_config_dict["sequence_length"],
)
return False
num_cores = os.getenv("HF_NUM_CORES", None)
if num_cores is not None and neuron_config_dict["tp_degree"] < int(num_cores):
logger.debug(
"The provided HF_NUM_CORES (%s) is higher than the neuron config tp degree (%s)",
num_cores,
neuron_config_dict["tp_degree"],
)
return False
auto_cast_type = os.getenv("HF_AUTO_CAST_TYPE", None)
if auto_cast_type is not None:
config_key = (
"auto_cast_type"
if "auto_cast_type" in neuron_config_dict
else "torch_dtype"
)
neuron_config_value = map_torch_dtype(str(neuron_config_dict[config_key]))
env_value = map_torch_dtype(auto_cast_type)
if env_value != neuron_config_value:
logger.debug(
"The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
env_var,
config_key,
"The provided auto cast type and the neuron config param differ (%s != %s)",
env_value,
neuron_config_value,
)
return False
max_input_tokens = int(
os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
)
if max_input_tokens > 0:
sequence_length = neuron_config["sequence_length"]
if "max_context_length" in neuron_config_dict:
sequence_length = neuron_config_dict["max_context_length"]
else:
sequence_length = neuron_config_dict["sequence_length"]
if max_input_tokens >= sequence_length:
logger.debug(
"Specified max input tokens is not compatible with config sequence length ( %s >= %s)",
@ -211,38 +244,29 @@ def check_env_and_neuron_config_compatibility(
def get_env_dict() -> Dict[str, str]:
d = {}
for k in env_vars:
for k in tgi_env_vars:
d[k] = os.getenv(k)
return d
def main():
"""
This script determines proper default TGI env variables for the neuron precompiled models to
work properly
:return:
"""
args = parse_cmdline_and_set_env()
for env_var in env_vars:
if not os.getenv(env_var):
break
else:
logger.info(
"All env vars %s already set, skipping, user know what they are doing",
env_vars,
def get_neuron_config_for_model(
model_name_or_path: str, revision: Optional[str] = None
) -> NeuronConfig:
try:
neuron_config = NeuronConfig.from_pretrained(
model_name_or_path, revision=revision
)
sys.exit(0)
cache_dir = constants.HF_HUB_CACHE
logger.info("Cache dir %s, model %s", cache_dir, args.model_id)
config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
neuron_config = getattr(config, "neuron", None)
except Exception as e:
logger.debug(
"NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
model_name_or_path,
revision,
e,
)
neuron_config = None
if neuron_config is not None:
compatible = check_env_and_neuron_config_compatibility(
neuron_config, check_compiler_version=False
neuron_config.to_dict(), check_compiler_version=False
)
if not compatible:
env_dict = get_env_dict()
@ -252,17 +276,6 @@ def main():
logger.error(msg)
raise Exception(msg)
else:
neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
neuron_config = lookup_compatible_cached_model(model_name_or_path, revision)
if not neuron_config:
msg = (
"No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
).format(get_env_dict(), available_cores, neuronxcc_version)
logger.error(msg)
raise Exception(msg)
neuron_config_to_env(neuron_config)
if __name__ == "__main__":
main()
return neuron_config
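
The refactor above turns tgi_env.py into a library: main() moves to the new tgi_entry_point.py shown further down, and callers are expected to resolve a NeuronConfig (from the model itself, or from a compatible cached entry) and then serialize it to the env file sourced by the shell entrypoint. A minimal, hypothetical wrapper (write_tgi_env is not part of this change) could look like this:

import os
from typing import Optional

from text_generation_server.tgi_env import (
    get_neuron_config_for_model,
    neuron_config_to_env,
)


def write_tgi_env(model_id: str, env_filepath: str, revision: Optional[str] = None) -> None:
    # neuron_config_to_env writes "export ..." lines to the file pointed to by
    # ENV_FILEPATH (MAX_BATCH_SIZE, MAX_TOTAL_TOKENS, HF_NUM_CORES taken from
    # tp_degree, HF_AUTO_CAST_TYPE, and a MAX_INPUT_TOKENS default of
    # sequence_length // 2).
    os.environ["ENV_FILEPATH"] = env_filepath
    neuron_config = get_neuron_config_for_model(model_id, revision)
    if neuron_config is None:
        raise RuntimeError(f"No compatible neuron config found for {model_id}")
    neuron_config_to_env(neuron_config)
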

View File

@ -4,14 +4,12 @@ import subprocess
import sys
from tempfile import TemporaryDirectory
import huggingface_hub
import os
import pytest
from transformers import AutoTokenizer
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.utils import synchronize_hub_cache
from optimum.neuron.version import __sdk_version__ as sdk_version
from optimum.neuron.version import __version__ as version
from optimum.neuron.cache import synchronize_hub_cache
logging.basicConfig(
@ -21,30 +19,14 @@ logging.basicConfig(
)
logger = logging.getLogger(__file__)
OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
# All model configurations below will be added to the neuron_model_config fixture
MODEL_CONFIGURATIONS = {
"gpt2": {
"model_id": "gpt2",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 1024,
"num_cores": 2,
"auto_cast_type": "fp16",
},
},
"llama": {
"model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 2048,
"num_cores": 2,
"auto_cast_type": "fp16",
},
},
"mistral": {
"model_id": "optimum/mistral-1.1b-testing",
"model_id": "unsloth/Llama-3.2-1B-Instruct",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 4096,
@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
"batch_size": 4,
"sequence_length": 4096,
"num_cores": 2,
"auto_cast_type": "fp16",
"auto_cast_type": "bf16",
},
},
"granite": {
@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
}
def get_hub_neuron_model_id(config_name: str):
return (
f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
)
def export_model(model_id, export_kwargs, neuron_model_path):
export_command = [
"optimum-cli",
@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
def neuron_model_config(request):
"""Expose a pre-trained neuron model
The fixture first makes sure the following model artifacts are present on the hub:
- exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
- cached artifacts under optimum-internal-testing/neuron-testing-cache.
If not, it will export the model and push it to the hub.
It then fetches the model locally and return a dictionary containing:
The fixture exports a model locally and returns a dictionary containing:
- a configuration name,
- the original model id,
- the export parameters,
- the neuron model id,
- the neuron model local path.
For each exposed model, the local directory is maintained for the duration of the
test session and cleaned up afterwards.
The hub model artifacts are never cleaned up and persist accross sessions.
They must be cleaned up manually when the optimum-neuron version changes.
"""
config_name = request.param
model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
model_id = model_config["model_id"]
export_kwargs = model_config["export_kwargs"]
neuron_model_id = get_hub_neuron_model_id(config_name)
with TemporaryDirectory() as neuron_model_path:
hub = huggingface_hub.HfApi()
if hub.repo_exists(neuron_model_id):
logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
else:
export_model(model_id, export_kwargs, neuron_model_path)
synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(neuron_model_path)
del tokenizer
# Create the test model on the hub
hub.create_repo(neuron_model_id, private=True)
hub.upload_folder(
folder_path=neuron_model_path,
repo_id=neuron_model_id,
ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
)
# Make sure it is cached
synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
# Add dynamic parameters to the model configuration
model_config["neuron_model_path"] = neuron_model_path
model_config["neuron_model_id"] = neuron_model_id
# Also add model configuration name to allow tests to adapt their expectations
model_config["name"] = config_name
# Yield instead of returning to keep a reference to the temporary directory.
# It will go out of scope and be released only once all tests needing the fixture
# have been completed.
logger.info(f"{config_name} ready for testing ...")
os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
yield model_config
logger.info(f"Done with {config_name}")

View File

@ -0,0 +1,42 @@
import os
import pytest
from text_generation_server.generator import NeuronGenerator
from text_generation_server.model import fetch_model, is_cached
@pytest.fixture(scope="module")
def cached_model_id(neuron_model_config) -> str:
"""
Fixture to provide a cached model ID for testing.
This assumes the model is already cached in the local environment.
"""
export_kwargs = neuron_model_config["export_kwargs"]
os.environ["MAX_BATCH_SIZE"] = str(export_kwargs["batch_size"])
os.environ["MAX_TOTAL_TOKENS"] = str(export_kwargs["sequence_length"])
os.environ["HF_AUTO_CAST_TYPE"] = export_kwargs["auto_cast_type"]
os.environ["HF_NUM_CORES"] = str(export_kwargs["num_cores"])
yield neuron_model_config["model_id"]
os.environ.pop("MAX_BATCH_SIZE", None)
os.environ.pop("MAX_TOTAL_TOKENS", None)
os.environ.pop("HF_AUTO_CAST_TYPE", None)
os.environ.pop("HF_NUM_CORES", None)
def test_model_is_cached(cached_model_id):
assert is_cached(cached_model_id), f"Model {cached_model_id} is not cached"
def test_fetch_cached_model(cached_model_id: str):
model_path = fetch_model(cached_model_id)
assert os.path.exists(
model_path
), f"Model {cached_model_id} was not fetched successfully"
assert os.path.isdir(model_path), f"Model {cached_model_id} is not a directory"
def test_generator_from_cached_model(cached_model_id: str):
generator = NeuronGenerator.from_pretrained(model_id=cached_model_id)
assert generator is not None, "Generator could not be created from cached model"
assert generator.model is not None, "Generator model is not initialized"
assert generator.tokenizer is not None, "Generator tokenizer is not initialized"

View File

@ -9,13 +9,13 @@ def test_continuous_batching_two_requests(neuron_model_config):
"""
neuron_model_path = neuron_model_config["neuron_model_path"]
generator = NeuronGenerator.from_pretrained(neuron_model_path)
assert generator.model.batch_size > 1
assert generator.model.neuron_config.batch_size > 1
input_text = "Once upon a time"
max_new_tokens = 20
# Prefill a single request, remembering the generated token
tokens = {0: [], 1: []}
request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens)
max_length = generator.model.max_length
max_length = generator.model.neuron_config.sequence_length
batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
generations, next_batch = generator.prefill(batch)
assert next_batch.size == 1

View File

@ -23,7 +23,7 @@ def _test_decode(config_name, generator, do_sample):
request = create_request(
id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
)
max_length = generator.model.max_length
max_length = generator.model.neuron_config.sequence_length
batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
generations, next_batch = generator.prefill(batch)
# We already generated one token: call decode max_new_tokens - 1 times
@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
assert output.finish_reason == 0
if do_sample:
expected_text = {
"gpt2": " The sun was set",
"llama": "George Orwell, 1984",
"mistral": "The sky was",
"qwen2": " A young woman with",
"llama": " I sat alone in the café",
"qwen2": " The air was so still",
"granite": "1984, George Orwell",
}[config_name]
assert expected_text in output.text
else:
print(output.text)
expected_text = {
"gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
"llama": " George Orwells classic dystopian novel, 1984, begins with this ominous sentence. The story",
"mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
"llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
"qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
"granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
}[config_name]

View File

@ -9,7 +9,7 @@ def test_prefill(neuron_model_config):
neuron_model_path = neuron_model_config["neuron_model_path"]
generator = NeuronGenerator.from_pretrained(neuron_model_path)
max_batch_size = 4
assert generator.model.batch_size >= max_batch_size
assert generator.model.neuron_config.batch_size >= max_batch_size
for num_requests in [1, max_batch_size]:
for do_sample in [True, False]:
mode = "sample" if do_sample else "greedy"
@ -34,7 +34,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
)
)
# Let's be pessimistic when estimating max_tokens
max_length = generator.model.max_length
max_length = generator.max_prefill_length()
batch = Batch(
id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
)
@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
assert len(generations) == batch_size
if do_sample:
expectations = {
"gpt2": [383, " The"],
"llama": [10058, " George"],
"mistral": [450, " The"],
"qwen2": [362, " A"],
"llama": [358, " I"],
"qwen2": [576, " The"],
"granite": [308, " ("],
}[config_name]
else:
expectations = {
"gpt2": [198, "\n"],
"llama": [10058, " George"],
"mistral": [13, "\n"],
"llama": [578, " The"],
"qwen2": [358, " I"],
"granite": [203, "\n"],
}[config_name]
@ -70,7 +66,7 @@ def test_prefill_truncate(neuron_model_config):
config_name = neuron_model_config["name"]
neuron_model_path = neuron_model_config["neuron_model_path"]
generator = NeuronGenerator.from_pretrained(neuron_model_path)
batch_size = generator.model.batch_size
batch_size = generator.model.neuron_config.batch_size
# We apply truncation to all requests but the first one
truncate = [
None,
@ -83,7 +79,7 @@ def test_prefill_truncate(neuron_model_config):
requests = []
for i in range(batch_size):
requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
max_length = generator.model.max_length
max_length = generator.max_prefill_length()
batch = Batch(
id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
)
@ -91,12 +87,12 @@ def test_prefill_truncate(neuron_model_config):
# Even if the input text is identical for all requests, the first generated token might
# be different because of the truncation
expectations = {
"gpt2": [" He", " He", "\n", " He"],
"llama": ["", " The", " He", " He"],
"mistral": [" He", "\n", " He", " He"],
"llama": [" He", "iens", "\x08", " He"],
"qwen2": [" He", " The", " He", " He"],
"granite": ["\n", "\n", " I", " He"],
}[config_name]
for i, g in enumerate(generations):
tokens = g.tokens
assert tokens.texts[0] == expectations[i]
assert (
tokens.texts[0] == expectations[i]
), f"Request {i} expected [{expectations[i]}], got [{tokens.texts[0]}]"

View File

@ -0,0 +1,63 @@
import os
import pytest
from tempfile import TemporaryDirectory
from optimum.neuron.models.inference.nxd.backend.config import NxDNeuronConfig
from optimum.neuron.utils import map_torch_dtype
from text_generation_server.tgi_env import (
get_neuron_config_for_model,
lookup_compatible_cached_model,
neuron_config_to_env,
)
def test_get_neuron_config_for_model(neuron_model_config):
neuron_model_path = neuron_model_config["neuron_model_path"]
export_kwargs = neuron_model_config["export_kwargs"]
os.environ["MAX_BATCH_SIZE"] = str(export_kwargs["batch_size"])
os.environ["MAX_TOTAL_TOKENS"] = str(export_kwargs["sequence_length"])
os.environ["HF_AUTO_CAST_TYPE"] = export_kwargs["auto_cast_type"]
os.environ["HF_NUM_CORES"] = str(export_kwargs["num_cores"])
neuron_config = get_neuron_config_for_model(neuron_model_path)
assert neuron_config is not None
assert neuron_config.batch_size == export_kwargs["batch_size"]
assert neuron_config.sequence_length == export_kwargs["sequence_length"]
assert neuron_config.tp_degree == export_kwargs["num_cores"]
if isinstance(neuron_config, NxDNeuronConfig):
assert map_torch_dtype(neuron_config.torch_dtype) == map_torch_dtype(
export_kwargs["auto_cast_type"]
)
else:
assert map_torch_dtype(neuron_config.auto_cast_type) == map_torch_dtype(
export_kwargs["auto_cast_type"]
)
@pytest.mark.parametrize("model_id", ["unsloth/Llama-3.2-1B-Instruct"])
def test_lookup_compatible_cached_model(model_id: str):
neuron_config = lookup_compatible_cached_model(model_id, None)
assert neuron_config is not None
def test_neuron_config_to_env(neuron_model_config) -> None:
neuron_model_path = neuron_model_config["neuron_model_path"]
neuron_config = get_neuron_config_for_model(neuron_model_path)
with TemporaryDirectory() as temp_dir:
os.environ["ENV_FILEPATH"] = os.path.join(temp_dir, "env.sh")
neuron_config_to_env(neuron_config)
with open(os.environ["ENV_FILEPATH"], "r") as env_file:
env_content = env_file.read()
assert f"export MAX_BATCH_SIZE={neuron_config.batch_size}" in env_content
assert (
f"export MAX_TOTAL_TOKENS={neuron_config.sequence_length}"
in env_content
)
assert f"export HF_NUM_CORES={neuron_config.tp_degree}" in env_content
if hasattr(neuron_config, "torch_dtype"):
auto_cast_type = str(map_torch_dtype(neuron_config.torch_dtype)).split(
"."
)[-1]
else:
auto_cast_type = neuron_config.auto_cast_type
assert f"export HF_AUTO_CAST_TYPE={auto_cast_type}" in env_content

View File

@ -9,7 +9,7 @@ touch $ENV_FILEPATH
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
${SCRIPT_DIR}/tgi_env.py $@
${SCRIPT_DIR}/tgi_entry_point.py $@
source $ENV_FILEPATH

View File

@ -0,0 +1,53 @@
#!/usr/bin/env python
import logging
import os
import sys
from text_generation_server.tgi_env import (
available_cores,
get_env_dict,
get_neuron_config_for_model,
neuron_config_to_env,
neuronxcc_version,
parse_cmdline_and_set_env,
tgi_env_vars,
)
logger = logging.getLogger(__name__)
def main():
"""
Determine the proper default TGI environment variables for precompiled neuron models.
"""
args = parse_cmdline_and_set_env()
for env_var in tgi_env_vars:
if not os.getenv(env_var):
break
else:
logger.info(
"All env vars %s already set, skipping, user know what they are doing",
tgi_env_vars,
)
sys.exit(0)
neuron_config = get_neuron_config_for_model(args.model_id, args.revision)
if not neuron_config:
msg = (
"No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
).format(get_env_dict(), available_cores, neuronxcc_version)
logger.error(msg)
raise Exception(msg)
neuron_config_to_env(neuron_config)
if __name__ == "__main__":
main()

View File

@ -28,15 +28,6 @@ logger = logging.getLogger(__file__)
# All model configurations below will be added to the neuron_model_config fixture
MODEL_CONFIGURATIONS = {
"gpt2": {
"model_id": "gpt2",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 1024,
"num_cores": 2,
"auto_cast_type": "fp16",
},
},
"llama": {
"model_id": "unsloth/Llama-3.2-1B-Instruct",
"export_kwargs": {
@ -46,15 +37,6 @@ MODEL_CONFIGURATIONS = {
"auto_cast_type": "fp16",
},
},
"mistral": {
"model_id": "optimum/mistral-1.1b-testing",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 4096,
"num_cores": 2,
"auto_cast_type": "bf16",
},
},
"qwen2": {
"model_id": "Qwen/Qwen2.5-0.5B",
"export_kwargs": {

View File

@ -20,9 +20,7 @@ async def test_model_single_request(tgi_service):
)
assert response.details.generated_tokens == 17
greedy_expectations = {
"gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
"llama": " and How Does it Work?\nDeep learning is a subset of machine learning that uses artificial",
"mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
"llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
"qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on",
"granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art",
}
@ -79,9 +77,7 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
assert len(responses) == 4
expectations = {
"gpt2": "Deep learning is a new field of research that has been around for a while",
"llama": "Deep learning is a subset of machine learning that uses artificial",
"mistral": "Deep Learning is a type of machine learning that",
"qwen2": "Deep Learning is a subset of Machine Learning that is based on",
"granite": "Deep Learning is a subset of Machine Learning, which is a branch of Art",
}