/// Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
use axum::http::HeaderValue;
use clap::Parser;
use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
use hf_hub::{Repo, RepoType};
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::sdk::trace;
use opentelemetry::sdk::trace::Sampler;
use opentelemetry::sdk::Resource;
use opentelemetry::{global, KeyValue};
use opentelemetry_otlp::WithExportConfig;
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::Path;
use text_generation_client::{ClientError, ShardedClient};
use text_generation_router::{server, HubModelInfo, HubTokenizerConfig};
use thiserror::Error;
use tokenizers::Tokenizer;
use tower_http::cors::AllowOrigin;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{EnvFilter, Layer};

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
    #[clap(default_value = "1024", long, env)]
    max_input_length: usize,
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
    #[clap(default_value = "4096", long, env)]
    max_batch_prefill_tokens: u32,
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    #[clap(long, env)]
    max_batch_size: Option<usize>,
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
    master_shard_uds_path: String,
    #[clap(default_value = "bigscience/bloom", long, env)]
    tokenizer_name: String,
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,
    #[clap(long, env)]
    revision: Option<String>,
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,
    #[clap(long, env)]
    ngrok: bool,
    #[clap(long, env)]
    ngrok_authtoken: Option<String>,
    #[clap(long, env)]
    ngrok_edge: Option<String>,
    #[clap(long, env, default_value_t = false)]
    messages_api_enabled: bool,
    #[clap(long, env, default_value_t = false)]
    disable_grammar_support: bool,
}

#[tokio::main]
async fn main() -> Result<(), RouterError> {
    // Get args
    let args = Args::parse();

    // Pattern match configuration
    let Args {
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_length,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        hostname,
        port,
        master_shard_uds_path,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        validation_workers,
        json_output,
        otlp_endpoint,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        messages_api_enabled,
        disable_grammar_support,
    } = args;

    // Initialize logging and tracing
    init_logging(otlp_endpoint, json_output);

    // Validate args
    if max_input_length >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_length` must be < `max_total_tokens`".to_string(),
        ));
    }
    if max_input_length as u32 > max_batch_prefill_tokens {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_length`. Given: {max_batch_prefill_tokens} and {max_input_length}")));
    }

    if validation_workers == 0 {
        return Err(RouterError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }

    if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
        if max_batch_prefill_tokens > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
        }
        if max_total_tokens as u32 > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
        }
    }

    let (max_batch_size, max_batch_total_tokens) = match (max_batch_size, max_batch_total_tokens) {
        (Some(_max_batch_size), Some(_max_batch_total_tokens)) => {
            if (_max_batch_total_tokens as usize / max_total_tokens) != _max_batch_size {
                tracing::warn!("max_batch_size was set to {_max_batch_size} while max_batch_total_tokens to {_max_batch_total_tokens}");
                tracing::warn!("These values do not match, so max_batch_size will be preferred");
                (Some(_max_batch_size), Some((_max_batch_size * max_total_tokens) as u32))
            } else {
                (Some(_max_batch_size), Some(_max_batch_total_tokens))
            }
        },
        (Some(_max_batch_size), None) => (
            Some(_max_batch_size), Some((_max_batch_size * max_total_tokens) as u32)
        ),
        (None, Some(_max_batch_total_tokens)) => (
            Some(_max_batch_total_tokens as usize / max_total_tokens), Some(_max_batch_total_tokens)
        ),
        (None, None) => (None, None),
    };
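    // Illustrative derivation (example values, not defaults enforced here): with
    // max_total_tokens = 2048, passing only --max-batch-size 8 derives
    // max_batch_total_tokens = 8 * 2048 = 16384, while passing only
    // --max-batch-total-tokens 16384 derives max_batch_size = 16384 / 2048 = 8.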

    // CORS allowed origins
    // Map into the Option, parse each origin String into a HeaderValue,
    // and finally convert the list into an AllowOrigin
    let cors_allow_origin: Option<AllowOrigin> = cors_allow_origin.map(|cors_allow_origin| {
        AllowOrigin::list(
            cors_allow_origin
                .iter()
                .map(|origin| origin.parse::<HeaderValue>().unwrap()),
        )
    });
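    // e.g. repeating the flag, `--cors-allow-origin https://foo.example --cors-allow-origin https://bar.example`
    // (illustrative origins), produces an AllowOrigin::list over the two parsed header values.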

    // Parse Huggingface hub token
    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();

    // Tokenizer instance
    // This will only be used to validate payloads
    let local_path = Path::new(&tokenizer_name);
    let local_model = local_path.exists() && local_path.is_dir();

    // Shared API builder initialization
    let api_builder = || {
        let mut builder = ApiBuilder::new()
            .with_progress(false)
            .with_token(authorization_token);

        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
            builder = builder.with_cache_dir(cache_dir.into());
        }

        builder
    };

    // Decide if we need to use the API based on the revision and local path
    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
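    // e.g. passing `--tokenizer-name ./my-local-model` (illustrative path to an existing directory)
    // with no --revision keeps everything local, while a Hub repo id such as the default
    // `bigscience/bloom` enables the Hugging Face API.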

    // Initialize API if needed
    let api = if use_api {
        tracing::info!("Using the Hugging Face API");
        match api_builder().build() {
            Ok(api) => Some(api),
            Err(_) => {
                tracing::warn!("Unable to build the Hugging Face API");
                None
            }
        }
    } else {
        None
    };

    // Load tokenizer and model info
    let skip_tokenizer_in_tgi = env::var("SKIP_TOKENIZER_IN_TGI")
        .ok()
        .map_or(false, |value| value.to_lowercase() == "true");
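    // e.g. launching with SKIP_TOKENIZER_IN_TGI=true (case-insensitive) skips loading tokenizer.json
    // for a local model, which also disables the router's Rust-side input validation (see the warning below).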
    let (tokenizer, model_info) = if local_model {
        let tokenizer = if skip_tokenizer_in_tgi {
            None
        } else {
            Tokenizer::from_file(local_path.join("tokenizer.json")).ok()
        };
        let model_info = HubModelInfo {
            model_id: tokenizer_name.to_string(),
            sha: None,
            pipeline_tag: None,
        };
        (tokenizer, model_info)
    } else if let Some(api) = api.clone() {
        let api_repo = api.repo(Repo::with_revision(
            tokenizer_name.to_string(),
            RepoType::Model,
            revision.clone().unwrap_or_else(|| "main".to_string()),
        ));

        let tokenizer = match api_repo.get("tokenizer.json").await {
            Ok(tokenizer_filename) => Tokenizer::from_file(tokenizer_filename).ok(),
            Err(_) => get_base_tokenizer(&api, &api_repo).await,
        };

        let model_info = get_model_info(&api_repo).await.unwrap_or_else(|| {
            tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
            HubModelInfo {
                model_id: tokenizer_name.to_string(),
                sha: None,
                pipeline_tag: None,
            }
        });

        (tokenizer, model_info)
    } else {
        // No API and no local model
        return Err(RouterError::ArgumentValidation(
            "No local model found and no revision specified".to_string(),
        ));
    };

    // Load tokenizer config if found locally, or check if we can get it from the API if needed
    let tokenizer_config = if let Some(path) = tokenizer_config_path {
        tracing::info!("Using local tokenizer config from user specified path");
        HubTokenizerConfig::from_file(&std::path::PathBuf::from(path))
    } else if local_model {
        tracing::info!("Using local tokenizer config");
        HubTokenizerConfig::from_file(&local_path.join("tokenizer_config.json"))
    } else {
        match api {
            Some(api) => {
                tracing::info!("Using the Hugging Face API to retrieve tokenizer config");
                let repo = Repo::with_revision(
                    tokenizer_name.to_string(),
                    RepoType::Model,
                    revision.unwrap_or("main".to_string()),
                );
                get_tokenizer_config(&api.repo(repo))
                    .await
                    .unwrap_or_else(|| {
                        tracing::warn!(
                            "Could not retrieve tokenizer config from the Hugging Face hub."
                        );
                        HubTokenizerConfig::default()
                    })
            }
            None => {
                tracing::warn!("Could not find tokenizer config locally and no API specified");
                HubTokenizerConfig::default()
            }
        }
    };

    if tokenizer.is_none() {
        tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}");
        tracing::warn!("Rust input length validation and truncation is disabled");
    }

    // if pipeline-tag == text-generation we default to return_full_text = true
    let compat_return_full_text = match &model_info.pipeline_tag {
        None => {
            tracing::warn!("no pipeline tag found for model {tokenizer_name}");
            true
        }
        Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
    };
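    // Illustrative reading (assuming this flag feeds the `return_full_text` default of the
    // compatibility route): a missing pipeline tag or a `text-generation` tag defaults to true,
    // while any other pipeline tag defaults to false.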

    // Instantiate sharded client from the master unix socket
    let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
        .await
        .map_err(RouterError::Connection)?;
    // Clear the cache; useful if the webserver rebooted
    sharded_client
        .clear_cache(None)
        .await
        .map_err(RouterError::Cache)?;
    // Get info from the shard
    let shard_info = sharded_client.info().await.map_err(RouterError::Info)?;

    // Warmup model
    tracing::info!("Warming up model");
    let max_supported_batch_total_tokens = match sharded_client
        .warmup(
            max_input_length as u32,
            max_batch_prefill_tokens,
            max_total_tokens as u32,
            max_batch_size,
        )
        .await
        .map_err(RouterError::Warmup)?
    {
        // Older models do not support automatic max-batch-total-tokens
        None => {
            let max_batch_total_tokens = max_batch_total_tokens
                .unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
            tracing::warn!("Model does not support automatic max batch total tokens");
            max_batch_total_tokens
        }
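        // Illustrative fallback: with the CLI defaults above (max_total_tokens = 2048,
        // max_batch_prefill_tokens = 4096) and no --max-batch-total-tokens flag,
        // this resolves to max(16000, max(2048, 4096)) = 16000 tokens.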
        // Flash attention models return their max supported total tokens
        Some(max_supported_batch_total_tokens) => {
            // Warn if the user set their own max-batch-total-tokens, as we will ignore it
            if max_batch_total_tokens.is_some() {
                tracing::warn!(
                    "`--max-batch-total-tokens` is deprecated for Flash \
                     Attention models."
                );
                tracing::warn!(
                    "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
                );
            }
            if max_total_tokens as u32 > max_supported_batch_total_tokens {
                return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_supported_batch_total_tokens}")));
            }

            max_supported_batch_total_tokens
        }
    };
    tracing::info!("Setting max batch total tokens to {max_supported_batch_total_tokens}");
    tracing::info!("Connected");

    // Determine the server port based on the feature and environment variable.
    let port = if cfg!(feature = "google") {
        std::env::var("AIP_HTTP_PORT")
            .map(|aip_http_port| aip_http_port.parse::<u16>().unwrap_or(port))
            .unwrap_or(port)
    } else {
        port
    };
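    // e.g. a build with the `google` feature and AIP_HTTP_PORT=8080 (illustrative value) listens on
    // 8080; without the feature, or if the variable is unset or unparsable, the --port flag applies.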

    let addr = match hostname.parse() {
        Ok(ip) => SocketAddr::new(ip, port),
        Err(_) => {
            tracing::warn!("Invalid hostname, defaulting to 0.0.0.0");
            SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port)
        }
    };

    // Run server
    server::run(
        model_info,
        shard_info,
        compat_return_full_text,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_length,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_supported_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        sharded_client,
        tokenizer,
        validation_workers,
        addr,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
    tokenizer_config,
    messages_api_enabled,
    disable_grammar_support,
)
.await?;

Ok(())
}
/// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
/// - otlp_endpoint is an optional URL to an Open Telemetry collector
/// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
/// - LOG_FORMAT may be TEXT or JSON (default to TEXT)
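///
/// A minimal usage sketch (the OTLP endpoint below is a placeholder, not a value this crate defaults to):
/// ```ignore
/// // Plain-text logs to stdout, spans exported to a local OTLP collector:
/// init_logging(Some("http://localhost:4317".to_string()), false);
/// // ...or JSON logs with no OpenTelemetry export:
/// // init_logging(None, true);
/// ```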
fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
    let mut layers = Vec::new();

    // STDOUT/STDERR layer
    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_file(true)
        .with_line_number(true);

    let fmt_layer = match json_output {
        true => fmt_layer.json().flatten_event(true).boxed(),
        false => fmt_layer.boxed(),
    };
    layers.push(fmt_layer);

    // OpenTelemetry tracing layer
    if let Some(otlp_endpoint) = otlp_endpoint {
        global::set_text_map_propagator(TraceContextPropagator::new());

        let tracer = opentelemetry_otlp::new_pipeline()
            .tracing()
            .with_exporter(
                opentelemetry_otlp::new_exporter()
                    .tonic()
                    .with_endpoint(otlp_endpoint),
            )
            .with_trace_config(
                trace::config()
                    .with_resource(Resource::new(vec![KeyValue::new(
                        "service.name",
                        "text-generation-inference.router",
                    )]))
                    .with_sampler(Sampler::AlwaysOn),
            )
            .install_batch(opentelemetry::runtime::Tokio);

        if let Ok(tracer) = tracer {
            layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
            init_tracing_opentelemetry::init_propagator().unwrap();
        };
    }

    // Filter events with LOG_LEVEL
    let env_filter =
        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));

    tracing_subscriber::registry()
        .with(env_filter)
        .with(layers)
        .init();
}
/// get model info from the Huggingface Hub
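///
/// Illustrative sketch (the model id is a placeholder; any failure surfaces as `None`):
/// ```ignore
/// let api = ApiBuilder::new().build().unwrap();
/// let repo = api.repo(Repo::with_revision(
///     "bigscience/bloom-560m".to_string(),
///     RepoType::Model,
///     "main".to_string(),
/// ));
/// let model_info = get_model_info(&repo).await;
/// ```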
pub async fn get_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
    let response = api.info_request().send().await.ok()?;

    if response.status().is_success() {
        let hub_model_info: HubModelInfo =
            serde_json::from_str(&response.text().await.ok()?).ok()?;
        if let Some(sha) = &hub_model_info.sha {
            tracing::info!(
                "Serving revision {sha} of model {}",
                hub_model_info.model_id
            );
        }
        Some(hub_model_info)
    } else {
        None
    }
}
/// get the tokenizer of the base model referenced by `base_model_name_or_path` in
/// `config.json` (e.g. for fine-tuned repos that do not ship their own tokenizer)
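///
/// Illustrative sketch (the repo name is a placeholder for a fine-tune whose `config.json`
/// contains `base_model_name_or_path`):
/// ```ignore
/// let api = ApiBuilder::new().build().unwrap();
/// let repo = api.repo(Repo::with_revision(
///     "some-user/some-finetune".to_string(),
///     RepoType::Model,
///     "main".to_string(),
/// ));
/// // Returns the base model's tokenizer, or `None` if the field is absent
/// let tokenizer = get_base_tokenizer(&api, &repo).await;
/// ```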
pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option<Tokenizer> {
    let config_filename = api_repo.get("config.json").await.ok()?;

    // Open the file in read-only mode with buffer.
    let file = File::open(config_filename).ok()?;
    let reader = BufReader::new(file);

    // Read the JSON contents of the file as a generic JSON value.
    let config: serde_json::Value = serde_json::from_reader(reader).ok()?;

    if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") {
        let api_base_repo = api.repo(Repo::with_revision(
            base_model_id.to_string(),
            RepoType::Model,
            "main".to_string(),
        ));

        let tokenizer_filename = api_base_repo.get("tokenizer.json").await.ok()?;
        Tokenizer::from_file(tokenizer_filename).ok()
    } else {
        None
    }
}
/// get tokenizer_config from the Huggingface Hub
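///
/// Illustrative sketch (the model id is a placeholder; `None` is returned when the file
/// is missing or fails to parse):
/// ```ignore
/// let api = ApiBuilder::new().build().unwrap();
/// let repo = api.repo(Repo::with_revision(
///     "upstage/SOLAR-10.7B-Instruct-v1.0".to_string(),
///     RepoType::Model,
///     "main".to_string(),
/// ));
/// let tokenizer_config = get_tokenizer_config(&repo).await;
/// ```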
pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
    let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;

    // Open the file in read-only mode with buffer.
    let file = File::open(tokenizer_config_filename).ok()?;
    let reader = BufReader::new(file);

    // Read the JSON contents of the file as an instance of `HubTokenizerConfig`.
    let tokenizer_config: HubTokenizerConfig = serde_json::from_reader(reader)
        .map_err(|e| {
            tracing::warn!("Unable to parse tokenizer config: {}", e);
            e
        })
        .ok()?;
    Some(tokenizer_config)
}
#[derive(Debug, Error)]
enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Unable to connect to the Python model shards: {0}")]
    Connection(ClientError),
    #[error("Unable to clear the Python model shards cache: {0}")]
    Cache(ClientError),
    #[error("Unable to get the Python model shards info: {0}")]
    Info(ClientError),
    #[error("Unable to warmup the Python model shards: {0}")]
    Warmup(ClientError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
    #[error("Axum webserver failed: {0}")]
    Axum(#[from] axum::BoxError),
}