import pytest
import requests
from openai import OpenAI
from huggingface_hub import InferenceClient


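# Launches a TGI server for Llama 3.1 8B Instruct once per test module.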
@pytest.fixture(scope="module")
|
|
|
|
|
def flash_llama_completion_handle(launcher):
|
|
|
|
|
with launcher(
|
2024-09-11 16:10:40 +00:00
|
|
|
|
"meta-llama/Meta-Llama-3.1-8B-Instruct",
|
2024-04-17 08:41:12 +00:00
|
|
|
|
) as handle:
|
|
|
|
|
yield handle


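# Waits for the launched server to report healthy, then exposes its client.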
@pytest.fixture(scope="module")
|
|
|
|
|
async def flash_llama_completion(flash_llama_completion_handle):
|
|
|
|
|
await flash_llama_completion_handle.health(300)
|
|
|
|
|
return flash_llama_completion_handle.client


# NOTE: since `v1/completions` is a deprecated interface/endpoint, we do not provide a convenience
# method for it. Instead, we use the `requests` library to make the HTTP request directly.


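# Single prompt with greedy decoding (temperature 0): expect exactly one deterministic choice.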
@pytest.mark.release
def test_flash_llama_completion_single_prompt(
    flash_llama_completion, response_snapshot
):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": "What is Deep Learning?",
            "max_tokens": 10,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 1
    assert (
        response["choices"][0]["text"]
        == " A Beginner’s Guide\nDeep learning is a subset"
    )
    assert response == response_snapshot


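# With include_usage set, exactly one streamed chunk carries usage stats; without it, none do.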
@pytest.mark.release
async def test_flash_llama_completion_stream_usage(
    flash_llama_completion, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    had_usage = False
    for chunk in stream:
        # chunks arrive as parsed objects; no "data:" SSE prefix to strip
        chunks.append(chunk)
        if len(chunk.choices) == 1:
            index = chunk.choices[0].index
            assert index == 0
            string += chunk.choices[0].delta.content
        if chunk.usage:
            assert not had_usage
            had_usage = True

    assert had_usage
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )
    assert chunks == response_snapshot

    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        # No usage
        # stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        assert chunk.usage is None
        assert len(chunk.choices) == 1
        assert chunk.choices[0].index == 0
        string += chunk.choices[0].delta.content

    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )


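# Batch of four prompts: the response must cover choice indices 0-3 exactly once.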
@pytest.mark.release
def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": [
                "What is Deep Learning?",
                "Is water wet?",
                "What is the capital of France?",
                "def mai",
            ],
            "max_tokens": 10,
            "seed": 0,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 4

    all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]]
    all_indexes.sort()
    all_indices, all_strings = zip(*all_indexes)
    assert list(all_indices) == [0, 1, 2, 3]
    assert list(all_strings) == [
        " A Beginner’s Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n """\n',
    ]

    assert response == response_snapshot


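# Streamed batch: each chunk is routed back to its prompt via the choice index.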
@pytest.mark.release
async def test_flash_llama_completion_many_prompts_stream(
    flash_llama_completion, response_snapshot
):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.completions.create(
        model="tgi",
        prompt=[
            "What is Deep Learning?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
        max_tokens=10,
        seed=0,
        temperature=0.0,
        stream=True,
    )

    strings = [""] * 4
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        index = chunk.choices[0].index
        # four prompts, so valid indices are 0..3
        assert 0 <= index < 4
        strings[index] += chunk.choices[0].text

    assert list(strings) == [
        " A Beginner’s Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n """\n',
    ]
    assert chunks == response_snapshot


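# OpenAI client, chat endpoint: only the final chunk reports usage.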
@pytest.mark.release
async def test_chat_openai_usage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")

    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in stream:
        chunks.append(chunk)
    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None

    assert chunks == response_snapshot


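# OpenAI client with include_usage disabled: no chunk may report usage.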
@pytest.mark.release
async def test_chat_openai_nousage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")

    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )

    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)

    assert chunks == response_snapshot


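# Same usage contract exercised through the huggingface_hub InferenceClient.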
@pytest.mark.release
async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in stream:
        chunks.append(chunk)

    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None

    assert chunks == response_snapshot


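# InferenceClient with include_usage disabled: every chunk must report usage as None.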
@pytest.mark.release
async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )

    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)

    assert chunks == response_snapshot