add parallelization

OlivierDehaene 2023-05-16 21:14:29 +02:00
parent 8ddbdea45b
commit f08a1a50b7
16 changed files with 173 additions and 193 deletions

View File

@@ -213,12 +213,13 @@ jobs:
sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
- name: Install
run: |
pip install pytest-xdist
make install-integration-tests
- name: Run tests
run: |
export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv integration-tests
pytest -s -vv -n 2 --dist loadfile integration-tests
stop-runner:
name: Stop self-hosted EC2 runner
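
The workflow now installs pytest-xdist and runs the suite with -n 2 --dist loadfile: two worker processes, with every test from a given file pinned to the same worker. A hypothetical sketch (fixture and test names are invented, not from this repository) of why that grouping matters for expensive, module-scoped fixtures:

import pytest

@pytest.fixture(scope="module")
def model_server():
    # Stand-in for the real launcher fixture: imagine this starts a
    # text-generation-inference server and yields its endpoint.
    handle = {"endpoint": "http://127.0.0.1:8080"}
    yield handle  # torn down once, after all tests in this file finish

def test_greedy_generation(model_server):
    assert model_server["endpoint"].startswith("http")

def test_sampled_generation(model_server):
    assert model_server["endpoint"].startswith("http")

Because --dist loadfile ships whole files to one worker, the fixture above is created once per file rather than once per test or once per worker.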

View File

@@ -1,3 +1,4 @@
import sys
import subprocess
import contextlib
import pytest
@@ -7,6 +8,7 @@ import docker
import json
import math
import time
import random
from docker.errors import NotFound
from typing import Optional, List, Dict
@@ -205,10 +207,12 @@ def launcher(event_loop):
def local_launcher(
model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None
):
port = 9999
master_port = 19999
port = random.randint(8000, 10_000)
master_port = random.randint(10_000, 20_000)
shard_uds_path = f"/tmp/{model_id.replace('/', '--')}-server"
shard_uds_path = (
f"/tmp/tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}-server"
)
args = [
"text-generation-launcher",
@@ -236,7 +240,7 @@ def launcher(event_loop):
process.wait(60)
launcher_output = process.stdout.read().decode("utf-8")
print(launcher_output)
print(launcher_output, file=sys.stderr)
process.stdout.close()
process.stderr.close()
@@ -245,7 +249,7 @@ def launcher(event_loop):
def docker_launcher(
model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None
):
port = 9999
port = random.randint(8000, 10_000)
args = ["--model-id", model_id, "--env"]
@@ -298,7 +302,7 @@ def launcher(event_loop):
pass
container_output = container.logs().decode("utf-8")
print(container_output)
print(container_output, file=sys.stderr)
container.remove()
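
Because two xdist workers can now start launchers at the same time, the fixture no longer hard-codes port 9999, master port 19999, or a shared socket path: ports are drawn from disjoint random ranges, the Unix socket path encodes the model, shard count, and quantization mode, and launcher logs go to stderr. As an aside, a small sketch of another way to obtain a collision-free port, by letting the OS pick one (illustration only, not what this commit does):

import socket

def free_port() -> int:
    # Binding to port 0 asks the kernel for any unused port; we read back
    # the port it chose and release the socket immediately.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))
        return sock.getsockname()[1]

if __name__ == "__main__":
    print(free_port())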

View File

@@ -1,92 +1,4 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.5546875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 363,
"logprob": -1.5322266,
"special": false,
"text": " for"
},
{
"id": 847,
"logprob": -2.5585938,
"special": false,
"text": " /"
},
{
"id": 2754,
"logprob": -2.265625,
"special": false,
"text": "api"
},
{
"id": 29914,
"logprob": -0.034088135,
"special": false,
"text": "/"
},
{
"id": 29894,
"logprob": -0.96240234,
"special": false,
"text": "v"
},
{
"id": 29896,
"logprob": -0.36816406,
"special": false,
"text": "1"
},
{
"id": 29914,
"logprob": -0.013191223,
"special": false,
"text": "/"
},
{
"id": 16418,
"logprob": -3.15625,
"special": false,
"text": "projects"
},
{
"id": 29914,
"logprob": -0.43774414,
"special": false,
"text": "/"
},
{
"id": 29896,
"logprob": -1.9443359,
"special": false,
"text": "1"
}
]
},
"generated_text": "for /api/v1/projects/1"
},
{
"details": {
"best_of_sequences": null,
@@ -263,6 +175,94 @@
},
"generated_text": "for /api/v1/projects/1"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.5546875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 363,
"logprob": -1.5322266,
"special": false,
"text": " for"
},
{
"id": 847,
"logprob": -2.5585938,
"special": false,
"text": " /"
},
{
"id": 2754,
"logprob": -2.265625,
"special": false,
"text": "api"
},
{
"id": 29914,
"logprob": -0.034088135,
"special": false,
"text": "/"
},
{
"id": 29894,
"logprob": -0.96240234,
"special": false,
"text": "v"
},
{
"id": 29896,
"logprob": -0.36816406,
"special": false,
"text": "1"
},
{
"id": 29914,
"logprob": -0.013191223,
"special": false,
"text": "/"
},
{
"id": 16418,
"logprob": -3.15625,
"special": false,
"text": "projects"
},
{
"id": 29914,
"logprob": -0.43774414,
"special": false,
"text": "/"
},
{
"id": 29896,
"logprob": -1.9443359,
"special": false,
"text": "1"
}
]
},
"generated_text": "for /api/v1/projects/1"
},
{
"details": {
"best_of_sequences": null,

View File

@@ -1,58 +1,4 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 6,
"prefill": [
{
"id": 0,
"logprob": null,
"text": "<pad>"
}
],
"seed": null,
"tokens": [
{
"id": 259,
"logprob": -1.3789062,
"special": false,
"text": ""
},
{
"id": 39261,
"logprob": -0.36279297,
"special": false,
"text": "Because"
},
{
"id": 609,
"logprob": -1.0966797,
"special": false,
"text": " it"
},
{
"id": 339,
"logprob": -0.8276367,
"special": false,
"text": " is"
},
{
"id": 16017,
"logprob": -1.6845703,
"special": false,
"text": " blue"
},
{
"id": 1,
"logprob": -0.72753906,
"special": true,
"text": "</s>"
}
]
},
"generated_text": "Because it is blue"
},
{
"details": {
"best_of_sequences": null,
@@ -161,6 +107,60 @@
},
"generated_text": "Because it is blue"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 6,
"prefill": [
{
"id": 0,
"logprob": null,
"text": "<pad>"
}
],
"seed": null,
"tokens": [
{
"id": 259,
"logprob": -1.3789062,
"special": false,
"text": " "
},
{
"id": 39261,
"logprob": -0.36279297,
"special": false,
"text": "Because"
},
{
"id": 609,
"logprob": -1.0966797,
"special": false,
"text": " it"
},
{
"id": 339,
"logprob": -0.8276367,
"special": false,
"text": " is"
},
{
"id": 16017,
"logprob": -1.6845703,
"special": false,
"text": " blue"
},
{
"id": 1,
"logprob": -0.72753906,
"special": true,
"text": "</s>"
}
]
},
"generated_text": "Because it is blue"
},
{
"details": {
"best_of_sequences": null,

View File

@@ -146,7 +146,7 @@ fn main() -> Result<(), std::io::Error> {
sha: None,
pipeline_tag: None,
},
false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or({
false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or_else(|| {
tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
HubModelInfo { model_id: tokenizer_name.to_string(), sha: None, pipeline_tag: None }
}),
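
The router change above swaps unwrap_or for unwrap_or_else so the fallback becomes lazy: the block that logs the warning and builds a default HubModelInfo now only runs when the hub lookup actually fails, instead of being evaluated unconditionally as an eager argument. A rough Python analogue (names below are invented for illustration):

import logging

def get_or_else(value, default_factory):
    # default_factory is only called on the miss path, like unwrap_or_else.
    return value if value is not None else default_factory()

def fallback_info():
    logging.warning("Could not retrieve model info from the Hugging Face hub.")
    return {"model_id": "unknown", "sha": None, "pipeline_tag": None}

info = get_or_else({"model_id": "some/model", "sha": None, "pipeline_tag": None}, fallback_info)
# No warning is logged here; the eager form, get_or_else(value, fallback_info()),
# would log it even when a value was found.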

View File

@@ -56,7 +56,7 @@ class BLOOM(CausalLM):
quantize: Optional[str] = None,
):
super(BLOOM, self).__init__(
model_id=model_id, revision=revision, quantize=quantize, decode_buffer=1
model_id=model_id, revision=revision, quantize=quantize
)
@property
@@ -111,7 +111,6 @@ class BLOOMSharded(BLOOM):
requires_padding=True,
dtype=dtype,
device=device,
decode_buffer=1,
rank=rank,
world_size=world_size,
)

View File

@@ -81,8 +81,6 @@ class CausalLMBatch(Batch):
for i, r in enumerate(pb.requests):
requests_idx_mapping[r.id] = i
inputs.append(r.inputs)
# offsets.append(None)
# token_offsets.append(None)
next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer
@@ -102,7 +100,7 @@ class CausalLMBatch(Batch):
truncation=True,
max_length=max_truncation,
).to(device)
for i, r in enumerate(pb.requests):
for _ in pb.requests:
input_len = tokenized_inputs["input_ids"].shape[1]
offsets.append(0)
token_offsets.append(input_len)
@@ -452,7 +450,6 @@ class CausalLM(Model):
model_id: str,
revision: Optional[str] = None,
quantize: Optional[str] = None,
decode_buffer: int = 4,
):
if torch.cuda.is_available():
device = torch.device("cuda")
@@ -486,7 +483,6 @@ class CausalLM(Model):
requires_padding=True,
dtype=dtype,
device=device,
decode_buffer=decode_buffer,
)
@property

View File

@@ -108,8 +108,8 @@ class FlashCausalLMBatch(Batch):
max_seqlen = max(max_seqlen, input_length)
input_lengths.append(input_length)
offsets.append(None)
token_offsets.append(None)
offsets.append(0)
token_offsets.append(input_length)
all_input_ids.append(tokenized_input)
@@ -394,7 +394,6 @@ class FlashCausalLM(Model):
model_id: str,
revision: Optional[str] = None,
quantize: Optional[str] = None,
decode_buffer: int = 4,
):
if torch.cuda.is_available():
device = torch.device("cuda")
@@ -410,7 +409,7 @@ class FlashCausalLM(Model):
revision=revision,
torch_dtype=dtype,
load_in_8bit=quantize == "bitsandbytes",
)
).to(device)
super(FlashCausalLM, self).__init__(
model=model,
@@ -418,7 +417,6 @@ class FlashCausalLM(Model):
requires_padding=False,
dtype=dtype,
device=device,
decode_buffer=decode_buffer,
)
@property

View File

@@ -66,7 +66,7 @@ class FlashLlama(FlashCausalLM):
self.load_weights(model, filenames, quantize, device, dtype)
super(FlashCausalLM, self).__init__(
model=model,
model=model.to(device),
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,
@@ -191,7 +191,7 @@ class FlashLlamaSharded(FlashLlama):
)
torch.distributed.barrier(group=self.process_group)
super(FlashCausalLM, self).__init__(
model=model,
model=model.to(device),
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,

View File

@@ -75,7 +75,7 @@ class FlashNeoXSharded(FlashNeoX):
)
torch.distributed.barrier(group=self.process_group)
super(FlashCausalLM, self).__init__(
model=model,
model=model.to(device),
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,

View File

@@ -69,12 +69,11 @@ class FlashSantacoder(FlashCausalLM):
)
super(FlashCausalLM, self).__init__(
model=model,
model=model.to(device),
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,
device=device,
decode_buffer=1,
)
@staticmethod
@@ -215,14 +214,13 @@ class FlashSantacoderSharded(FlashSantacoder):
)
torch.distributed.barrier(group=self.process_group)
super(FlashCausalLM, self).__init__(
model=model,
model=model.to(device),
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,
device=device,
rank=rank,
world_size=world_size,
decode_buffer=1,
)
@staticmethod

View File

@@ -18,20 +18,15 @@ class Model(ABC):
requires_padding: bool,
dtype: torch.dtype,
device: torch.device,
decode_buffer: int = 4,
rank: int = 0,
world_size: int = 1,
):
if decode_buffer < 1:
raise ValueError("decode_buffer must be >= 1")
self.model = model.eval().to(device)
self.model = model.eval()
self.tokenizer = tokenizer
self.all_special_ids = set(tokenizer.all_special_ids)
self.requires_padding = requires_padding
self.dtype = dtype
self.device = device
self.decode_buffer = decode_buffer
self.rank = rank
self.world_size = world_size
self.check_initialized()
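
The removal of .to(device) here is mirrored across this commit: each concrete model (FlashLlama, FlashNeoX, FlashSantacoder, SantaCoder, FlashCausalLM, ...) now moves its own weights before calling the base constructor, presumably so loaders that already place or shard weights keep control over placement. A simplified sketch of the resulting pattern (assumed classes, not the real ones):

import torch

class Base:
    def __init__(self, model: torch.nn.Module, device: torch.device):
        self.model = model.eval()  # device placement is no longer forced here
        self.device = device

class Concrete(Base):
    def __init__(self, device: torch.device = torch.device("cpu")):
        model = torch.nn.Linear(4, 4)  # stand-in for a loaded checkpoint
        super().__init__(model.to(device), device)  # the subclass decides placement

print(next(Concrete().model.parameters()).device)
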
@@ -61,12 +56,6 @@
) -> Tuple[str, int, int]:
"""Hack to hopefully support generate_stream for the maximum number of tokenizers"""
# Compatibility layer for old None values.
if prefix_offset is None:
prefix_offset = 0
if read_offset is None:
read_offset = 0
# The prefix text is necessary only to defeat cleanup algorithms in the decode
# which decide to add a space or not depending on the surrounding ids.
prefix_text = self.tokenizer.decode(
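
The compatibility layer removed above is no longer needed because batches now seed their offsets with real integers (offsets.append(0) / token_offsets.append(input_len) earlier in this commit) rather than None. For context, a minimal sketch of the incremental detokenization this method performs; the body is truncated in the hunk above, so treat this as an approximation rather than the exact implementation:

from typing import List, Tuple

def decode_token(tokenizer, all_input_ids: List[int],
                 prefix_offset: int = 0, read_offset: int = 0) -> Tuple[str, int, int]:
    # Decode a small prefix window plus the new tail; the prefix exists only to
    # defeat the tokenizer's cleanup heuristics (leading-space decisions).
    prefix_text = tokenizer.decode(all_input_ids[prefix_offset:read_offset],
                                   skip_special_tokens=False)
    new_text = tokenizer.decode(all_input_ids[prefix_offset:],
                                skip_special_tokens=False)
    if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
        # A complete printable chunk: return only the delta and advance the offsets.
        return new_text[len(prefix_text):], read_offset, len(all_input_ids)
    # Otherwise the tail ends in an unfinished byte sequence; emit nothing yet.
    return "", prefix_offset, read_offset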

View File

@@ -52,7 +52,7 @@ class SantaCoder(CausalLM):
torch_dtype=dtype,
load_in_8bit=quantize == "bitsandbytes",
trust_remote_code=True, # required
)
).to(device)
super(CausalLM, self).__init__(
model=model,
@@ -60,7 +60,6 @@ class SantaCoder(CausalLM):
requires_padding=True,
dtype=dtype,
device=device,
decode_buffer=1,
)
def decode(self, generated_ids: List[int]) -> str:

View File

@@ -91,8 +91,6 @@ class Seq2SeqLMBatch(Batch):
inputs.append(r.inputs)
requests_idx_mapping[r.id] = i
decoder_input_lengths.append(1)
# offsets.append(None)
# token_offsets.append(None)
next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer
@@ -123,7 +121,7 @@ class Seq2SeqLMBatch(Batch):
.repeat(len(pb.requests))
.view(-1, 1)
)
for i, r in enumerate(pb.requests):
for _ in pb.requests:
offsets.append(0)
token_offsets.append(1)
all_decoder_input_ids = decoder_input_ids.view(-1).split(1)
@@ -505,7 +503,6 @@ class Seq2SeqLM(Model):
model_id: str,
revision: Optional[str] = None,
quantize: Optional[str] = None,
decode_buffer: int = 4,
):
if torch.cuda.is_available():
device = torch.device("cuda")
@@ -535,7 +532,6 @@ class Seq2SeqLM(Model):
requires_padding=True,
dtype=dtype,
device=device,
decode_buffer=decode_buffer,
)
@property