From 98d66f0534c76f11af5373890f67532ce2089cc3 Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Tue, 13 Aug 2024 16:57:17 +0200
Subject: [PATCH] More updates.

---
 docs/source/basic_tutorials/consuming_tgi.md  | 46 ++++------
 docs/source/basic_tutorials/using_guidance.md |  2 +-
 docs/source/conceptual/streaming.md           | 92 +++++++++++--------
 3 files changed, 69 insertions(+), 71 deletions(-)

diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md
index 1a9a4e8d..ce75cb64 100644
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@@ -101,22 +101,6 @@ print(next(iter(output)))
 
 You can check out the details of the function [here](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation). There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient)
-
-## ChatUI
-
-ChatUI is an open-source interface built for LLM serving. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.
-
-To serve both ChatUI and TGI in same environment, simply add your own endpoints to the `MODELS` variable in `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.
-
-```
-{
-// rest of the model config here
-"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
-}
-```
-
-![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)
-
 ## Gradio
 
 Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot with streaming mode using TGI and Gradio. Let's install Gradio and Hub Python library first.
 
@@ -152,20 +136,7 @@ gr.ChatInterface(
 ).queue().launch()
 ```
 
-The UI looks like this 👇
-
-[screenshot of the Gradio chat UI]
-
-You can try the demo directly here 👇
+You can check out the UI and try the demo directly here 👇
 
 [embedded Gradio demo]
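For readers following along, the pattern this last hunk edits — a Gradio `ChatInterface` that streams tokens from TGI through `InferenceClient` — looks roughly like the sketch below. It is not part of the patch: the endpoint URL, title, and `max_new_tokens` value are placeholders, assuming a TGI server is already listening at `http://127.0.0.1:8080`.

```python
import gradio as gr
from huggingface_hub import InferenceClient

# Placeholder endpoint: point this at wherever your TGI server is listening.
client = InferenceClient(model="http://127.0.0.1:8080")

def inference(message, history):
    partial_message = ""
    # With stream=True, text_generation yields tokens as TGI generates them;
    # yielding the growing string lets ChatInterface render the reply incrementally.
    for token in client.text_generation(message, max_new_tokens=256, stream=True):
        partial_message += token
        yield partial_message

# Passing a generator function makes ChatInterface stream partial replies to the UI.
gr.ChatInterface(inference, title="TGI chatbot (sketch)").queue().launch()
```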