Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 03:44:54 +00:00)
update readme

Commit: a3e57d3c5d
Parent: 478d5c1403
Makefile (10 lines changed)
```diff
@@ -13,7 +13,7 @@ server-dev:
 	cd server && make run-dev
 
 router-dev:
-	cd router && cargo run
+	cd router && cargo run -- --port 8080
 
 integration-tests: install-router install-launcher
 	cargo test
@@ -22,16 +22,16 @@ python-tests:
 	cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests
 
 run-bloom-560m:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080
 
 run-bloom-560m-quantize:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize --port 8080
 
 download-bloom:
 	HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom
 
 run-bloom:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080
 
 run-bloom-quantize:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
```
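With this change every launch target binds to port 8080, matching the curl examples in the README diff below. A minimal sketch of querying a server started via one of these targets from Python (assuming `requests` is installed and the model has finished loading; the payload mirrors the README's curl examples):

```python
# Smoke test for a server started with e.g. `make run-bloom-560m`,
# now listening on the default port 8080 set by the Makefile targets.
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}},
)
response.raise_for_status()
print(response.json())  # e.g. {"generated_text": "..."}
```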
README.md (31 lines changed)
`````diff
@@ -89,40 +89,35 @@ You can then query the model using either the `/generate` or `/generate_stream`
 
 ```shell
 curl 127.0.0.1:8080/generate \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 ```shell
 curl 127.0.0.1:8080/generate_stream \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 or from Python:
 
-```python
-import requests
-
-result = requests.post("http://127.0.0.1:8080/generate", json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-print(result.json())
-```
-
 ```shell
-pip install sseclient-py
+pip install text-generation
 ```
 
-````python
-import sseclient
-import requests
+```python
+from text_generation import Client
 
-r = requests.post("http://127.0.0.1:8080/generate_stream", stream=True, json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-sse_client = sseclient.SSEClient(r)
+client = Client("http://127.0.0.1:8080")
+print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text)
 
-for i, event in enumerate(sse_client.events()):
-    print(i, event.data)
-````
+text = ""
+for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17):
+    if not response.token.special:
+        text += response.token.text
+print(text)
+```
 
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
`````
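The new README example uses the `text-generation` client's synchronous `Client`. For completeness, a sketch of the same streaming loop with the package's async variant; this assumes `AsyncClient` mirrors the sync API shown in the diff (same constructor and token objects), which should be checked against the client package's docs:

```python
# Sketch only: assumes `AsyncClient` from the text-generation package mirrors
# the sync Client used in the README (same constructor and stream semantics).
import asyncio

from text_generation import AsyncClient


async def main() -> None:
    client = AsyncClient("http://127.0.0.1:8080")
    text = ""
    # Stream tokens, skipping special tokens, as in the sync example.
    async for response in client.generate_stream(
        "What is Deep Learning?", max_new_tokens=17
    ):
        if not response.token.special:
            text += response.token.text
    print(text)


asyncio.run(main())
```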