mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 11:54:52 +00:00)
update readme
commit a3e57d3c5d (parent 478d5c1403)
Makefile (10 lines changed)
@@ -13,7 +13,7 @@ server-dev:
 	cd server && make run-dev
 
 router-dev:
-	cd router && cargo run
+	cd router && cargo run -- --port 8080
 
 integration-tests: install-router install-launcher
 	cargo test
@@ -22,16 +22,16 @@ python-tests:
 	cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests
 
 run-bloom-560m:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080
 
 run-bloom-560m-quantize:
-	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize --port 8080
 
 download-bloom:
 	HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom
 
 run-bloom:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080
 
 run-bloom-quantize:
-	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
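With every run target now pinned to `--port 8080`, the Makefile and the README examples agree on where the router listens. A quick way to confirm a launched server is reachable before trying the README snippets is a one-shot request against `/generate` (a minimal sketch, not part of the commit; it assumes a launcher started via one of the `run-*` targets above and reuses the request shape from the README):

```python
import requests

# Smoke test: POST one prompt to the router that the Makefile targets
# now all start on port 8080 (assumes e.g. `make run-bloom-560m` is running).
resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```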
README.md (31 lines changed)
@@ -89,40 +89,35 @@ You can then query the model using either the `/generate` or `/generate_stream`
 ```shell
 curl 127.0.0.1:8080/generate \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 ```shell
 curl 127.0.0.1:8080/generate_stream \
     -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'
 ```
 
 or from Python:
 
-```python
-import requests
-
-result = requests.post("http://127.0.0.1:8080/generate", json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-print(result.json())
-```
-
 ```shell
-pip install sseclient-py
+pip install text-generation
 ```
 
-````python
-import sseclient
-import requests
+```python
+from text_generation import Client
 
-r = requests.post("http://127.0.0.1:8080/generate_stream", stream=True, json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
-sse_client = sseclient.SSEClient(r)
+client = Client("http://127.0.0.1:8080")
+print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text)
 
-for i, event in enumerate(sse_client.events()):
-    print(i, event.data)
-````
+text = ""
+for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17):
+    if not response.token.special:
+        text += response.token.text
+print(text)
+```
 
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
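For comparison with the `sseclient` snippet the commit removes, the stream the new `text_generation` client consumes is ordinary server-sent events from `/generate_stream`. A rough sketch of what `client.generate_stream(...)` wraps, assuming each event arrives as a `data:{json}` line carrying a `token` object with the `text` and `special` fields used in the diff:

```python
import json
import requests

# Read /generate_stream as raw server-sent events and reassemble the text,
# mirroring the client-side loop in the new README example.
with requests.post(
    "http://127.0.0.1:8080/generate_stream",
    json={"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}},
    stream=True,
) as r:
    text = ""
    for line in r.iter_lines():
        if line.startswith(b"data:"):
            token = json.loads(line[len(b"data:"):]).get("token", {})
            if not token.get("special"):
                text += token.get("text", "")
print(text)
```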