mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 20:04:52 +00:00)

moved files

This commit is contained in:
parent cd3349f53b
commit a973cf4922
@@ -62,3 +62,10 @@ Launch Router
 ```shell
 make router-dev
 ```
+
+Install FastAPI/Uvicorn
+
+```shell
+pip install fastapi
+pip install "uvicorn[standard]"
+```
deepsparse/main.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+import uvicorn, fastapi
+from threading import Thread
+from queue import Queue
+
+from router import DeepSparseRouter, batching_task
+from utils import GenerateRequest
+
+TOKENIZER_PATH = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/deployment"
+MODEL_PATH = "/home/robertgshaw/.cache/sparsezoo/neuralmagic/codegen_mono-350m-bigpython_bigquery_thepile-base/model.onnx/model.onnx"
+
+# setup router
+router = DeepSparseRouter(
+    model_path=MODEL_PATH,
+    tokenizer_path=TOKENIZER_PATH
+)
+
+# start background routing task
+batching_thread = Thread(target=batching_task, args=[router])
+batching_thread.start()
+
+app = fastapi.FastAPI()
+
+@app.post("/generate")
+def generate(prompt: str, max_generated_tokens: int):
+    response_stream = Queue()
+
+    # submit request to the router
+    router.submit_request(
+        generate_request=GenerateRequest(
+            prompt=prompt,
+            max_generated_tokens=max_generated_tokens,
+            response_stream=response_stream
+        )
+    )
+
+    response_string = prompt
+    generation = response_stream.get()
+    while not generation.stopped:
+        response_string += generation.token
+        generation = response_stream.get()
+
+    return generation
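Because /generate is declared with plain scalar arguments, FastAPI exposes prompt and max_generated_tokens as query parameters on the POST request. A minimal client-side sketch, assuming the app above is served locally on port 8000 and that the requests library is available (both are assumptions, not part of the commit):

```python
# Hypothetical client for the /generate route defined in deepsparse/main.py.
import requests

resp = requests.post(
    "http://localhost:8000/generate",   # assumed host/port
    params={                            # scalar args become query parameters
        "prompt": "def fibonacci(",
        "max_generated_tokens": 64,
    },
)
print(resp.json())
```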
@@ -1,6 +1,7 @@
 from queue import Queue
 from typing import List, Dict, Optional, Tuple
 from server.deepsparse.service.service import DeepSparseService
+from server.deepsparse.service.causal_lm import DeepSparseCausalLM
 from server.deepsparse.utils import CachedBatch, Batch, Generation, GenerateRequest, Request
 
 # TODO: implement logic for maximum size of the queue based on memory usage
@@ -47,11 +48,30 @@ class DeepSparseQueue:
         return (batch, generate_requests)
 
 class DeepSparseRouter:
-    def __init__(self, service: DeepSparseService):
-        self.service: DeepSparseService = service
+    def __init__(
+        self,
+        service: Optional[DeepSparseService] = None,
+        model_path: Optional[str] = None,
+        tokenizer_path: Optional[str] = None
+    ):
+        assert (
+            service is not None or
+            (model_path is not None and tokenizer_path is not None)
+        )
+
+        if service is not None:
+            self.service = service
+        else:
+            self.service = DeepSparseService(
+                model=DeepSparseCausalLM(
+                    model_path=model_path,
+                    tokenizer_path=tokenizer_path
+                )
+            )
+
         self.queue: DeepSparseQueue = DeepSparseQueue()
 
-    def generate(self, generate_request: GenerateRequest):
+    def submit_request(self, generate_request: GenerateRequest):
         self.queue.append(generate_request)
 
     def prefill(
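The constructor change above lets callers either inject an existing DeepSparseService or have the router build one from model and tokenizer paths. A minimal sketch of both construction paths, using only names from this diff; the file paths are placeholders:

```python
# Path 1: inject a pre-built service.
service = DeepSparseService(
    model=DeepSparseCausalLM(
        model_path="/path/to/model.onnx",      # placeholder
        tokenizer_path="/path/to/deployment",  # placeholder
    )
)
router = DeepSparseRouter(service=service)

# Path 2: let the router build the service itself (as deepsparse/main.py does).
router = DeepSparseRouter(
    model_path="/path/to/model.onnx",
    tokenizer_path="/path/to/deployment",
)
```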