Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 06:42:10 +00:00)
* feat: add neuron backend
* feat(neuron): add server standalone installation
* feat(neuron): add server and integration tests
* fix(neuron): increase ulimit when building image
  The base image used to compile the Rust components seems to have a low ulimit for open files, which leads to errors during compilation.
* test(neuron): merge integration tests and fixtures
* test: add --neuron option
* review: do not use latest tag
* review: remove ureq pinned version
* review: --privileged should be the exception
* feat: add neuron case to build ci
* fix(neuron): export models from container in test fixtures
  The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are run for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker-in-docker (dind), it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required.
* refactor: remove sagemaker entry-point
  The SageMaker image is built differently anyway.
* fix(neuron): avoid using Levenshtein
* test(neuron): use smaller llama model
* feat(neuron): avoid installing CUDA in image
* test(neuron): no error anymore when requesting too many tokens
* ci: add a precompilation step (with a different token)
* test(neuron): avoid using image sha when exporting models
  We now manually evaluate the apparent hash of the neuron backend by combining the hashes of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. This has two benefits:
  - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub,
  - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them (see the sketch below).
* test(neuron): added a small script to prune test models

Co-authored-by: drbh <david.richard.holtz@gmail.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
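To make the last point concrete, here is a minimal sketch of how such an "apparent hash" could be computed locally by hashing the backend sources together with the Dockerfile. The paths (`backends/neuron`, `Dockerfile.neuron`) and the function name are illustrative assumptions, not the actual CI implementation.

```python
# Illustrative sketch only: derives a stable fingerprint from the neuron
# backend directory and its Dockerfile, so exported models can be keyed on
# backend changes rather than on the image sha. Paths are assumptions.
import hashlib
from pathlib import Path


def neuron_backend_hash(
    backend_dir: str = "backends/neuron", dockerfile: str = "Dockerfile.neuron"
) -> str:
    h = hashlib.sha256()
    # Hash every file under the backend directory in a deterministic order.
    for path in sorted(Path(backend_dir).rglob("*")):
        if path.is_file():
            h.update(path.relative_to(backend_dir).as_posix().encode())
            h.update(path.read_bytes())
    # Combine with the Dockerfile contents.
    h.update(Path(dockerfile).read_bytes())
    return h.hexdigest()


if __name__ == "__main__":
    print(neuron_backend_hash())
```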
112 lines
3.7 KiB
Python
import sys
from typing import Optional

import typer
from loguru import logger

app = typer.Typer()


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    trust_remote_code: Optional[bool] = None,
    uds_path: str = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
    otlp_service_name: str = "text-generation-inference.server",
    max_input_tokens: Optional[int] = None,
):
    """This is the main entry-point for the server CLI.

    Args:
        model_id (`str`):
            The *model_id* of a model on the HuggingFace hub or the path to a local model.
        revision (`Optional[str]`, defaults to `None`):
            The revision of the model on the HuggingFace hub.
        sharded (`bool`):
            Whether the model must be sharded or not. Kept for compatibility with the
            text-generation-launcher, but must be set to False.
        trust_remote_code (`Optional[bool]`, defaults to `None`):
            Kept for compatibility with text-generation-launcher. Ignored.
        uds_path (`str`):
            The local path on which the server will expose its gRPC services.
        logger_level (`str`):
            The server logger level. Defaults to *INFO*.
        json_output (`bool`):
            Use JSON format for log serialization.
        otlp_endpoint (`Optional[str]`, defaults to `None`):
            The OpenTelemetry endpoint to use.
        otlp_service_name (`str`, defaults to `"text-generation-inference.server"`):
            The name to use when pushing data to the OpenTelemetry endpoint.
        max_input_tokens (`Optional[int]`, defaults to `None`):
            The maximum number of input tokens each request should contain.
    """
    if sharded:
        raise ValueError("Sharding is not supported.")
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if trust_remote_code is not None:
        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")

    # Import here after the logger is added to log potential import exceptions
    from .server import serve

    serve(model_id, revision, uds_path)


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    logger_level: str = "INFO",
    json_output: bool = False,
    auto_convert: Optional[bool] = None,
    extension: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
    merge_lora: Optional[bool] = None,
):
    """Download the model weights.

    This command will be called by text-generation-launcher before serving the model.
    """
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if extension is not None:
        logger.warning("'extension' argument is not supported and will be ignored.")
    if trust_remote_code is not None:
        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
    if auto_convert is not None:
        logger.warning("'auto_convert' argument is not supported and will be ignored.")
    if merge_lora is not None:
        logger.warning("'merge_lora' argument is not supported and will be ignored.")

    # Import here after the logger is added to log potential import exceptions
    from .model import fetch_model

    fetch_model(model_id, revision)
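For context on how this file is used: `download_weights` is invoked by text-generation-launcher before `serve`, and the typer app is typically exposed as the `text-generation-server` console script. The snippet below is a minimal usage sketch, assuming the neuron `text_generation_server` package (and typer) is installed so that `text_generation_server.cli` is importable; it only exercises the auto-generated `--help` output of both commands.

```python
# Usage sketch, not part of the file above. Assumes the neuron
# text_generation_server package is installed in the current environment.
from typer.testing import CliRunner

from text_generation_server.cli import app

runner = CliRunner()

# typer derives the CLI names from the function names: serve and download-weights.
for command in ("serve", "download-weights"):
    result = runner.invoke(app, [command, "--help"])
    assert result.exit_code == 0
    print(result.output)
```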