Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-10 20:04:52 +00:00
moved files
This commit is contained in:
parent
cd3349f53b
commit
a973cf4922
@@ -61,4 +61,11 @@ python3 server/text_generation_server/cli.py serve bigscience/bloom-560m
Launch Router

```shell
make router-dev
```

Install FastAPI/Uvicorn

```shell
pip install fastapi
pip install "uvicorn[standard]"
```
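Once FastAPI and Uvicorn are installed, the new deepsparse/main.py added below can be served with Uvicorn. A minimal launcher sketch, assuming the server is started from inside the deepsparse/ directory (so that the file's `from router import ...` style imports resolve); the module string, host, and port are assumptions, not values from this commit:

```python
# run_server.py -- illustrative sketch only; "main:app", the host, and the port
# are assumptions rather than anything specified in this commit.
import uvicorn

if __name__ == "__main__":
    # equivalent to the CLI invocation: uvicorn main:app --host 0.0.0.0 --port 8000
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
```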
42  deepsparse/main.py  Normal file
@@ -0,0 +1,42 @@
import fastapi
import uvicorn
from threading import Thread
from queue import Queue

from router import DeepSparseRouter, batching_task
from utils import GenerateRequest

TOKENIZER_PATH = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
MODEL_PATH = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"

# setup router
router = DeepSparseRouter(
    model_path=MODEL_PATH,
    tokenizer_path=TOKENIZER_PATH
)

# start background routing task
batching_thread = Thread(target=batching_task, args=[router])
batching_thread.start()

app = fastapi.FastAPI()


@app.post("/generate")
def generate(prompt: str, max_generated_tokens: int):
    # per-request queue on which the router delivers Generation objects
    response_stream = Queue()

    # submit request to the router
    router.submit_request(
        generate_request=GenerateRequest(
            prompt=prompt,
            max_generated_tokens=max_generated_tokens,
            response_stream=response_stream
        )
    )

    # accumulate tokens until the router signals that generation has stopped
    response_string = prompt
    generation = response_stream.get()
    while not generation.stopped:
        response_string += generation.token
        generation = response_stream.get()

    # return the accumulated text
    return response_string
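For reference, exercising the /generate endpoint above might look like the sketch below. Because `prompt` and `max_generated_tokens` are declared as bare str/int parameters, FastAPI reads them from the query string; the server address and the example prompt are assumptions.

```python
# query_generate.py -- illustrative sketch; the address, port, and prompt are assumptions.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    # bare str/int parameters on a FastAPI route are taken from the query string
    params={"prompt": "def fib(n):", "max_generated_tokens": 64},
)
print(resp.status_code, resp.json())
```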
@@ -1,6 +1,7 @@
from queue import Queue
from typing import List, Dict, Optional, Tuple
from server.deepsparse.service.service import DeepSparseService
from server.deepsparse.service.causal_lm import DeepSparseCausalLM
from server.deepsparse.utils import CachedBatch, Batch, Generation, GenerateRequest, Request

# TODO: implement logic for maximum size of the queue based on memory usage
@@ -47,11 +48,30 @@ class DeepSparseQueue:
        return (batch, generate_requests)

class DeepSparseRouter:
    # previously the router could only be constructed from an existing service
    def __init__(
        self,
        service: Optional[DeepSparseService] = None,
        model_path: Optional[str] = None,
        tokenizer_path: Optional[str] = None
    ):
        # require either a prebuilt service or both model and tokenizer paths
        assert (
            service is not None or
            (model_path is not None and tokenizer_path is not None)
        )

        if service is not None:
            self.service = service
        else:
            self.service = DeepSparseService(
                model=DeepSparseCausalLM(
                    model_path=model_path,
                    tokenizer_path=tokenizer_path
                )
            )

        self.queue: DeepSparseQueue = DeepSparseQueue()

    # renamed from generate()
    def submit_request(self, generate_request: GenerateRequest):
        self.queue.append(generate_request)

    def prefill(
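The GenerateRequest and Generation types used above come from the utils module imported at the top of this file and are not part of this diff. Purely as an illustration of the shape implied by the call sites in main.py and the router, they might look roughly like this (any fields beyond those actually referenced are omitted):

```python
# Illustrative only -- the real definitions live in the imported utils module
# and are not shown in this diff; fields are inferred from the call sites above.
from dataclasses import dataclass
from queue import Queue


@dataclass
class Generation:
    token: str      # text produced by one decode step
    stopped: bool   # True once generation has finished


@dataclass
class GenerateRequest:
    prompt: str
    max_generated_tokens: int
    response_stream: Queue  # the batching task pushes Generation objects here
```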