From 8de10acdcfb640b7db44cf0595146877f4a8594d Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Tue, 13 Aug 2024 15:47:25 +0200
Subject: [PATCH] Improve the Consuming TGI docs.

---
 docs/openapi.json                            | 18 ++-------------
 docs/source/basic_tutorials/consuming_tgi.md | 23 ++++++++++++++++----
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/docs/openapi.json b/docs/openapi.json
index df21e19d..9d281a48 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -819,13 +819,6 @@
             "example": "1.0",
             "nullable": true
           },
-          "guideline": {
-            "type": "string",
-            "description": "A guideline to be used in the chat_template",
-            "default": "null",
-            "example": "null",
-            "nullable": true
-          },
           "logit_bias": {
             "type": "array",
             "items": {
@@ -1824,8 +1817,7 @@
         "type": "object",
         "required": [
           "finish_reason",
-          "generated_tokens",
-          "input_length"
+          "generated_tokens"
         ],
         "properties": {
           "finish_reason": {
@@ -1837,12 +1829,6 @@
             "example": 1,
             "minimum": 0
           },
-          "input_length": {
-            "type": "integer",
-            "format": "int32",
-            "example": 1,
-            "minimum": 0
-          },
           "seed": {
             "type": "integer",
             "format": "int64",
@@ -2094,4 +2080,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md
index 4829ec7c..cda1f5ab 100644
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@@ -1,18 +1,33 @@
 # Consuming Text Generation Inference
 
-There are many ways you can consume Text Generation Inference server in your applications. After launching, you can use the `/generate` route and make a `POST` request to get results from the server. You can also use the `/generate_stream` route if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python or TypeScrpt. For a final end-to-end experience, we also open-sourced ChatUI, a chat interface for open-source models.
+There are many ways to consume the Text Generation Inference (TGI) server in your applications. After launching the server, you can use the [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) `/v1/chat/completions` route and make a `POST` request to get results from the server. You can also pass `"stream": true` to the call if you want TGI to return a stream of tokens. You can make the requests using the tool of your preference, such as curl, Python or TypeScript. For a final end-to-end experience, we have also open-sourced ChatUI, a chat interface for open-source models.
 
 ## curl
 
-After the launch, you can query the model using either the `/generate` or `/generate_stream` routes:
+After a successful server launch, you can query the model using the `/v1/chat/completions` route to get responses that comply with the OpenAI Chat Completions API spec:
 
 ```bash
-curl 127.0.0.1:8080/generate \
+curl localhost:3000/v1/chat/completions \
     -X POST \
-    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
    -H 'Content-Type: application/json'
 ```
 
+You can update the `stream` parameter to `false` to get a non-streaming response.
 
 ## Inference Client
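
For reference, the streaming request the patch documents can also be exercised from Python. The sketch below is illustrative only and not part of the commit: it assumes a TGI server listening on `localhost:3000` as in the curl example, uses the third-party `openai` client against TGI's OpenAI-compatible route, and passes a dummy `api_key`, since a local TGI server does not check it by default.

```python
from openai import OpenAI

# Point the OpenAI client at the local TGI server's
# OpenAI-compatible endpoint (port 3000 assumed, matching
# the curl example in the patch above).
client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")

chat = client.chat.completions.create(
    model="tgi",  # TGI serves a single model; "tgi" is a placeholder name
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is deep learning?"},
    ],
    stream=True,
    max_tokens=20,
)

# With stream=True, the client yields chunks as tokens arrive;
# delta.content can be None on boundary chunks, hence the `or ""`.
for chunk in chat:
    print(chunk.choices[0].delta.content or "", end="", flush=True)
```

Setting `stream=False` instead returns a single completed response object, mirroring the `"stream": false` behavior described in the new docs text.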