update readme

OlivierDehaene 2023-03-07 18:43:00 +01:00
parent 478d5c1403
commit a3e57d3c5d
2 changed files with 18 additions and 23 deletions

Makefile

@@ -13,7 +13,7 @@ server-dev:
 	cd server && make run-dev
 
 router-dev:
-	cd router && cargo run
+	cd router && cargo run -- --port 8080
 
 integration-tests: install-router install-launcher
 	cargo test
@@ -22,16 +22,16 @@ python-tests:
 	cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests
 
 run-bloom-560m:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080
 
 run-bloom-560m-quantize:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize --port 8080
 
 download-bloom:
 	HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom
 
 run-bloom:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080
 
 run-bloom-quantize:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
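Every launcher target (and `router-dev`'s `cargo run`) now passes `--port 8080` explicitly, so the README examples below can assume that address. A minimal sketch, not part of this commit, of waiting for the server to come up before querying it; the host, port, and timeout values are illustrative assumptions:

```python
import socket
import time

# Sketch: block until something accepts TCP connections on the port the
# Makefile targets now pin to 8080. Host/port/timeout are assumptions.
def wait_for_server(host="127.0.0.1", port=8080, timeout=120.0):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True  # the launcher's HTTP server is listening
        except OSError:
            time.sleep(1.0)  # still starting (or refused); retry
    return False

if __name__ == "__main__":
    assert wait_for_server(), "server never came up on :8080"
```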

README.md

@@ -89,40 +89,35 @@ You can then query the model using either the `/generate` or `/generate_stream` routes
 
 ```shell
 curl 127.0.0.1:8080/generate \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 ```shell
 curl 127.0.0.1:8080/generate_stream \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 or from Python:
 
-```python
-import requests
-
-result = requests.post("http://127.0.0.1:8080/generate", json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-print(result.json())
-```
-
 ```shell
-pip install sseclient-py
+pip install text-generation
 ```
 
-````python
-import sseclient
-import requests
+```python
+from text_generation import Client
 
-r = requests.post("http://127.0.0.1:8080/generate_stream", stream=True, json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-sse_client = sseclient.SSEClient(r)
+client = Client("http://127.0.0.1:8080")
+print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text)
 
-for i, event in enumerate(sse_client.events()):
-    print(i, event.data)
-````
+text = ""
+for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17):
+    if not response.token.special:
+        text += response.token.text
+print(text)
+```
 
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
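The new `text-generation` client shown in the diff above replaces the raw `sseclient` loop this commit deletes. For readers without the client installed, here is a minimal sketch of what `generate_stream` does over the wire, using only `requests`; the `data:` line framing is standard server-sent events, and the token payload shape is an assumption based on the fields the client loop reads (`token.text`, `token.special`):

```python
import json
import requests

# Sketch (not part of this commit): stream tokens from the raw
# /generate_stream SSE route, mirroring the client loop above.
resp = requests.post(
    "http://127.0.0.1:8080/generate_stream",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}},
    stream=True,
)
text = ""
for line in resp.iter_lines():
    # each SSE event arrives as a line of the form: data: {...json...}
    if line.startswith(b"data:"):
        payload = json.loads(line[len(b"data:"):])
        token = payload.get("token", {})
        if not token.get("special"):  # skip special tokens, as above
            text += token.get("text", "")
print(text)
```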