text-generation-inference/backends/neuron/server/text_generation_server/server.py

import asyncio
from pathlib import Path
from typing import List

from grpc import aio
from grpc_reflection.v1alpha import reflection
from loguru import logger

from .generator import Generator, NeuronGenerator
from .interceptor import ExceptionInterceptor
from .pb import generate_pb2, generate_pb2_grpc


class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    """gRPC servicer that forwards each RPC to the wrapped generator.

    Every handler is a thin adapter: it reads the fields it needs from the
    request message, calls the matching generator method and wraps the
    result in the corresponding protobuf response message.
    """

    def __init__(self, generator: Generator, server_urls: List[str]):
        """Keep the generator and the URL list served by ``ServiceDiscovery``."""
        self.server_urls = server_urls
        self.generator = generator

    async def Info(self, request, context):
        """Return the generator's ``info`` attribute as-is."""
        return self.generator.info

    async def Health(self, request, context):
        """Answer with a default-constructed ``HealthResponse``."""
        return generate_pb2.HealthResponse()

    async def ServiceDiscovery(self, request, context):
        """Report the server URLs given at construction time."""
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        """Call ``generator.clear``, passing ``request.id`` only when it is set."""
        # An unset "id" field means clear() is called without arguments.
        clear_args = (request.id,) if request.HasField("id") else ()
        self.generator.clear(*clear_args)
        return generate_pb2.ClearCacheResponse()

    async def FilterBatch(self, request, context):
        """Return the batch produced by ``generator.filter`` for the request ids."""
        kept = self.generator.filter(request.batch_id, request.request_ids)
        return generate_pb2.FilterBatchResponse(batch=kept)

    async def Warmup(self, request, context):
        """Run ``generator.warmup`` and return its result as the token limit."""
        supported_total_tokens = self.generator.warmup(request.batch)
        return generate_pb2.WarmupResponse(
            max_supported_total_tokens=supported_total_tokens
        )

    async def Prefill(self, request, context):
        """Run ``generator.prefill`` on the request batch and wrap both results."""
        prefill_result = self.generator.prefill(request.batch)
        # The generator returns a (generations, batch) pair.
        generations, next_batch = prefill_result
        return generate_pb2.PrefillResponse(generations=generations, batch=next_batch)

    async def Decode(self, request, context):
        """Run ``generator.decode`` on the request batches and wrap both results."""
        decode_result = self.generator.decode(request.batches)
        # The generator returns a (generations, batch) pair.
        generations, next_batch = decode_result
        return generate_pb2.DecodeResponse(generations=generations, batch=next_batch)


def serve(
    model_id: str,
    revision: str,
    uds_path: Path,
):
    """Start the gRPC text-generation server and run it until termination.

    A ``NeuronGenerator`` is built with ``from_pretrained(model_id, revision)``
    and exposed through a ``TextGenerationService`` on the insecure address
    ``unix://<uds_path>-0``. The call blocks inside ``asyncio.run`` until the
    server terminates.

    Args:
        model_id: Forwarded to ``NeuronGenerator.from_pretrained``.
        revision: Forwarded to ``NeuronGenerator.from_pretrained``.
        uds_path: Prefix of the unix socket address; ``-0`` is appended.

    Raises:
        Exception: Whatever ``NeuronGenerator.from_pretrained`` raises is
            logged and re-raised.
    """

    async def serve_inner(model_id: str, revision: str):
        # Only one endpoint is served, hence the fixed index 0 in the address.
        local_url = "unix://{}-{}".format(uds_path, 0)
        server_urls = [local_url]

        try:
            generator = NeuronGenerator.from_pretrained(model_id, revision)
        except Exception:
            # Log with traceback, then let the failure propagate to the caller.
            logger.exception("Error when initializing model")
            raise

        grpc_server = aio.server(interceptors=[ExceptionInterceptor()])
        servicer = TextGenerationService(generator, server_urls)
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            servicer, grpc_server
        )

        # Register both the service itself and the reflection service.
        service_descriptor = generate_pb2.DESCRIPTOR.services_by_name[
            "TextGenerationService"
        ]
        reflected_names = (service_descriptor.full_name, reflection.SERVICE_NAME)
        reflection.enable_server_reflection(reflected_names, grpc_server)

        grpc_server.add_insecure_port(local_url)
        await grpc_server.start()
        logger.info("Server started at {}".format(local_url))

        # NOTE(review): whether KeyboardInterrupt actually reaches this handler
        # under asyncio.run may depend on the Python version — confirm.
        try:
            await grpc_server.wait_for_termination()
        except KeyboardInterrupt:
            logger.info("Signal received. Shutting down")
            await grpc_server.stop(0)

    asyncio.run(serve_inner(model_id, revision))
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`import asyncio`
			`from pathlib import Path`
			`from typing import List`

			`from grpc import aio`
			`from grpc_reflection.v1alpha import reflection`
			`from loguru import logger`

			`from .generator import Generator, NeuronGenerator`
			`from .interceptor import ExceptionInterceptor`
			`from .pb import generate_pb2, generate_pb2_grpc`


			`class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):`
			`def __init__(self, generator: Generator, server_urls: List[str]):`
			`self.generator = generator`
			`self.server_urls = server_urls`

			`async def Info(self, request, context):`
			`return self.generator.info`

			`async def Health(self, request, context):`
			`return generate_pb2.HealthResponse()`

			`async def ServiceDiscovery(self, request, context):`
			`return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)`

			`async def ClearCache(self, request, context):`
			`if request.HasField("id"):`
			`self.generator.clear(request.id)`
			`else:`
			`self.generator.clear()`
			`return generate_pb2.ClearCacheResponse()`

			`async def FilterBatch(self, request, context):`
			`filtered_batch = self.generator.filter(request.batch_id, request.request_ids)`
			`return generate_pb2.FilterBatchResponse(batch=filtered_batch)`

			`async def Warmup(self, request, context):`
			`max_tokens = self.generator.warmup(request.batch)`
			`return generate_pb2.WarmupResponse(max_supported_total_tokens=max_tokens)`

			`async def Prefill(self, request, context):`
			`generations, batch = self.generator.prefill(request.batch)`
			`return generate_pb2.PrefillResponse(generations=generations, batch=batch)`

			`async def Decode(self, request, context):`
			`generations, batch = self.generator.decode(request.batches)`
			`return generate_pb2.DecodeResponse(generations=generations, batch=batch)`


			`def serve(`
			`model_id: str,`
			`revision: str,`
			`uds_path: Path,`
			`):`
			`async def serve_inner(model_id: str, revision: str):`
			`unix_socket_template = "unix://{}-{}"`
			`local_url = unix_socket_template.format(uds_path, 0)`
			`server_urls = [local_url]`

			`try:`
			`generator = NeuronGenerator.from_pretrained(model_id, revision)`
			`except Exception:`
			`logger.exception("Error when initializing model")`
			`raise`

			`server = aio.server(interceptors=[ExceptionInterceptor()])`
			`generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(`
			`TextGenerationService(generator, server_urls), server`
			`)`
			`SERVICE_NAMES = (`
			`generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,`
			`reflection.SERVICE_NAME,`
			`)`
			`reflection.enable_server_reflection(SERVICE_NAMES, server)`
			`server.add_insecure_port(local_url)`

			`await server.start()`

			`logger.info("Server started at {}".format(local_url))`

			`try:`
			`await server.wait_for_termination()`
			`except KeyboardInterrupt:`
			`logger.info("Signal received. Shutting down")`
			`await server.stop(0)`

			`asyncio.run(serve_inner(model_id, revision))`