#!/usr/bin/env python

import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from huggingface_hub import constants
from transformers import AutoConfig

from optimum.neuron.modeling_decoder import get_available_cores
from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.utils.version_utils import get_neuronxcc_version


logger = logging.getLogger(__name__)

tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]

# Pairs of (env var, neuron config key) that must agree for a cached model to be usable
env_config_peering = [
    ("MAX_BATCH_SIZE", "batch_size"),
    ("MAX_TOTAL_TOKENS", "sequence_length"),
    ("HF_AUTO_CAST_TYPE", "auto_cast_type"),
    ("HF_NUM_CORES", "num_cores"),
]

# By the end of this script all env vars should be specified properly
env_vars = tgi_server_env_vars + tgi_router_env_vars

available_cores = get_available_cores()
neuronxcc_version = get_neuronxcc_version()

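# E.g. a model exported with batch_size=4 can only be served with MAX_BATCH_SIZE=4
# (or with MAX_BATCH_SIZE unset, in which case it is deduced from the config).
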
def parse_cmdline_and_set_env(argv: Optional[List[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    if not argv:
        argv = sys.argv
    # All these are params passed to tgi and intercepted here
    parser.add_argument(
        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
    )
    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
    parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
    parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

    args = parser.parse_known_args(argv)[0]

    if not args.model_id:
        raise Exception("No model id provided! Specify it either with the --model-id cmdline option or the MODEL_ID env var")

    # Override env with cmdline params
    os.environ["MODEL_ID"] = args.model_id

    # Set all tgi router and tgi server values to consistent values as early as possible.
    # Given the order of the parser defaults, the tgi router values can override the tgi server ones.
    if args.max_total_tokens > 0:
        os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens)

    if args.max_input_tokens > 0:
        os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens)

    if args.max_batch_size > 0:
        os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size)

    if args.max_batch_prefill_tokens > 0:
        os.environ["MAX_BATCH_PREFILL_TOKENS"] = str(args.max_batch_prefill_tokens)

    if args.revision:
        os.environ["REVISION"] = str(args.revision)

    return args


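# A minimal usage sketch (hypothetical values): calling
#   parse_cmdline_and_set_env(["--model-id", "my-org/my-model", "--max-total-tokens", "4096"])
# would set MODEL_ID=my-org/my-model and MAX_TOTAL_TOKENS=4096 in os.environ,
# leaving the remaining variables to be deduced from the neuron config below.

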
def neuron_config_to_env(neuron_config):
    # Write the resolved values as shell export statements to the env file
    with open(os.environ["ENV_FILEPATH"], "w") as f:
        for env_var, config_key in env_config_peering:
            f.write("export {}={}\n".format(env_var, neuron_config[config_key]))
        max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
        if not max_input_tokens:
            max_input_tokens = int(neuron_config["sequence_length"]) // 2
            if max_input_tokens == 0:
                raise Exception("Model sequence length should be greater than 1")
        f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
        max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
        if not max_batch_prefill_tokens:
            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
        f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))


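# Worked example (hypothetical config): for a neuron config
#   {"batch_size": 4, "sequence_length": 4096, "auto_cast_type": "fp16", "num_cores": 2}
# and no pre-set env vars, the generated file would contain:
#   export MAX_BATCH_SIZE=4
#   export MAX_TOTAL_TOKENS=4096
#   export HF_AUTO_CAST_TYPE=fp16
#   export HF_NUM_CORES=2
#   export MAX_INPUT_TOKENS=2048
#   export MAX_BATCH_PREFILL_TOKENS=8192

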
def sort_neuron_configs(dictionary):
    # Sort key: negating both values sorts descending, so configs using more
    # cores (with a larger batch size as tie-breaker) come first.
    return -dictionary["num_cores"], -dictionary["batch_size"]


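# For example (hypothetical entries), sorting
#   [{"num_cores": 2, "batch_size": 8}, {"num_cores": 8, "batch_size": 1}]
# with this key ranks the 8-core config first.

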
def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
    # Reuse the same mechanism as the one used to configure the tgi server part.
    # The only difference here is that we stay as flexible as possible on the compatibility part.
    entries = get_hub_cached_entries(model_id, "inference")

    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)

    all_compatible = []
    for entry in entries:
        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
            all_compatible.append(entry)

    if not all_compatible:
        logger.debug(
            "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s",
            model_id,
            get_env_dict(),
            available_cores,
            neuronxcc_version,
        )
        return None

    logger.info("%d compatible neuron cached models found", len(all_compatible))

    # Prefer the entry that uses the most cores, then the largest batch size
    all_compatible = sorted(all_compatible, key=sort_neuron_configs)

    entry = all_compatible[0]

    return entry


def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
    logger.debug(
        "Checking that the provided neuron config %s is compatible with the local setup and provided environment",
        neuron_config,
    )

    # Local setup compatibility checks
    if neuron_config["num_cores"] > available_cores:
        logger.debug("Not enough neuron cores available to run the provided neuron config")
        return False

    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
        logger.debug(
            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
            neuronxcc_version,
            neuron_config["compiler_version"],
        )
        return False

    # Env var compatibility checks: an unset env var is considered compatible
    for env_var, config_key in env_config_peering:
        neuron_config_value = str(neuron_config[config_key])
        env_value = os.getenv(env_var, neuron_config_value)
        if env_value != neuron_config_value:
            logger.debug(
                "The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
                env_var,
                config_key,
                env_value,
                neuron_config_value,
            )
            return False

    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
    if max_input_tokens > 0:
        sequence_length = neuron_config["sequence_length"]
        if max_input_tokens >= sequence_length:
            logger.debug(
                "The specified max input tokens is not compatible with the config sequence length (%s >= %s)",
                max_input_tokens,
                sequence_length,
            )
            return False

    return True


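# For instance (hypothetical values), a cached entry compiled for 8 cores is
# rejected on a host where get_available_cores() reports only 2, and an entry
# with sequence_length=2048 is rejected when MAX_INPUT_TOKENS=4096.

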
def get_env_dict() -> Dict[str, str]:
    d = {}
    for k in env_vars:
        d[k] = os.getenv(k)
    return d


def main():
    """
    This script determines the proper default TGI env variables for neuron
    precompiled models to work properly.
    """
    args = parse_cmdline_and_set_env()

    # If every env var is already set, trust the user and exit early
    for env_var in env_vars:
        if not os.getenv(env_var):
            break
    else:
        logger.info("All env vars %s already set, skipping: the user knows what they are doing", env_vars)
        sys.exit(0)

    cache_dir = constants.HF_HUB_CACHE

    logger.info("Cache dir %s, model %s", cache_dir, args.model_id)

    config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
    neuron_config = getattr(config, "neuron", None)
    if neuron_config is not None:
        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
        if not compatible:
            env_dict = get_env_dict()
            msg = (
                "Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}"
            ).format(neuron_config, env_dict, available_cores, neuronxcc_version)
            logger.error(msg)
            raise Exception(msg)
    else:
        neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

    if not neuron_config:
        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
            get_env_dict(), available_cores, neuronxcc_version
        )
        logger.error(msg)
        raise Exception(msg)

    neuron_config_to_env(neuron_config)


if __name__ == "__main__":
    main()