diff --git a/clients/python/utils/client.py b/clients/python/utils/client.py
new file mode 100644
index 00000000..a1d1ad5c
--- /dev/null
+++ b/clients/python/utils/client.py
@@ -0,0 +1,90 @@
+import asyncio
+import logging
+import time
+from text_generation import AsyncClient
+
+from typing import List, Dict
+from torch import IntTensor
+from transformers import AutoTokenizer
+import numpy as np
+
+from hydra import (
+    compose,
+    initialize
+)
+from omegaconf import DictConfig, OmegaConf
+
+
+log = logging.getLogger(__name__)
+# initialize the Hydra config subsystem
+initialize(version_base=None, config_path="conf")
+cfg: DictConfig = compose("config.yaml")
+print(f"Experiment configuration:\n{OmegaConf.to_yaml(cfg)}")
+
+# serving system deployment settings
+max_concurrent_requests: int = cfg.deployment.max_concurrent_requests
+endpoint: str = cfg.deployment.endpoint
+if cfg.deployment.local:
+    endpoint = f"{cfg.deployment.addr}:{cfg.deployment.port}"
+client = AsyncClient(endpoint)
+
+# uncomment to use a natural language prompt instead of the dummy token prompt
+# prompt: str = "Hello" * 100
+# output_length (decoding length)
+max_new_tokens: int = cfg.GenerationConfig.max_new_tokens
+# generation strategy
+repetition_penalty: float = cfg.GenerationConfig.repetition_penalty
+do_sample: bool = cfg.GenerationConfig.do_sample
+# generation finish reason
+stop_sequences: List[str] = list(cfg.GenerationConfig.stop_sequences)
+tokenizer = AutoTokenizer.from_pretrained(cfg.macros.tokenizer_name)
+# token id 21820 decodes to "Hello" for this tokenizer
+dummy_input: IntTensor = IntTensor([[21820]])
+# inverse tokenization (sanity check)
+# token: str = tokenizer.decode(dummy_input[0], skip_special_tokens=True)
+sequence_length: int = cfg.GenerationConfig.sequence_length
+multiple_dummy_repeat = dummy_input.repeat(1, sequence_length)
+prompt: str = tokenizer.decode(multiple_dummy_repeat[0], skip_special_tokens=True)
+assert np.shape(multiple_dummy_repeat)[-1] == sequence_length
+
+generate_kwargs: Dict = {"do_sample": do_sample,
+                         "max_new_tokens": max_new_tokens,
+                         "repetition_penalty": repetition_penalty,
+                         "stop_sequences": stop_sequences,
+                         "decoder_input_details": True,
+                         }
+
+prompts: List[str] = [prompt] * max_concurrent_requests
+# create one generation coroutine per request
+coros = [client.generate(prompt, **generate_kwargs) for prompt in prompts]
+
+async def batch():
+    return await asyncio.gather(*coros)
+
+st: int = time.perf_counter_ns()
+results = asyncio.run(batch())
+et: float = (time.perf_counter_ns() - st) * 1e-9
+print(f"Serving elapsed time: {et:0.4f} seconds")
+# check the last response
+print(results[-1].details)
+
+total_input_sequence_tokens: int = 0
+total_decoded_tokens: int = 0
+
+for prompt, response in zip(prompts, results):
+    # uncomment to see the generations
+    # print(prompt + response.generated_text)
+    # assert np.shape(tokenizer(response.generated_text, return_tensors="pt").input_ids)[-1] - 1 == max_new_tokens
+    # assert response.details.generated_tokens == max_new_tokens
+    total_input_sequence_tokens += len(response.details.prefill)
+    total_decoded_tokens += response.details.generated_tokens
+assert total_input_sequence_tokens == max_concurrent_requests * sequence_length
+
+# stats
+print(f"Serving elapsed time: {et:0.4f} seconds")
+print(f"Total requests: {max_concurrent_requests}")
+print(f"Total input sequence tokens: {total_input_sequence_tokens}")
+print(f"Sequence length: {sequence_length}")
+print(f"Total decoded tokens: {total_decoded_tokens}")
+print(f"Throughput: {total_decoded_tokens / et:0.2f} tokens/sec")
+print(f"Total processed tokens: {total_decoded_tokens + total_input_sequence_tokens}")
\ No newline at end of file
diff --git a/clients/python/utils/conf/config.yaml b/clients/python/utils/conf/config.yaml
new file mode 100644
index 00000000..fdb64f76
--- /dev/null
+++ b/clients/python/utils/conf/config.yaml
@@ -0,0 +1,17 @@
+deployment:
+  addr: http://127.0.0.1
+  port: 8080
+  max_concurrent_requests: 64
+  local: True
+  endpoint: http://guanaco-65b-merged.inference.takomo.internal
+
+macros:
+  tokenizer_name: bigscience/bloom-560m
+
+GenerationConfig:
+  max_new_tokens: 100
+  repetition_penalty: 1.5
+  do_sample: True
+  stop_sequences:
+    - "length"
+  sequence_length: 100
\ No newline at end of file
diff --git a/server/text_generation_server/debug_backend.py b/server/text_generation_server/debug_backend.py
new file mode 100644
index 00000000..322cf3ee
--- /dev/null
+++ b/server/text_generation_server/debug_backend.py
@@ -0,0 +1,10 @@
+# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve distilgpt2
+import subprocess
+import os
+from pathlib import Path
+
+wd_dir: Path = Path(__file__).parent.absolute()
+cli_path: str = os.path.join(wd_dir, "cli.py")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+command: str = f"python -m torch.distributed.run --nproc_per_node=1 {cli_path} serve bigscience/bloom-560m"
+subprocess.run(command.split(), check=True)
\ No newline at end of file
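
A possible follow-up, shown only as a sketch and not part of the diff above: the client reports aggregate throughput, but per-request latency percentiles are often just as useful when tuning max_concurrent_requests. The snippet below reuses only APIs already present in the diff (text_generation.AsyncClient.generate, asyncio.gather, numpy); the helper names timed_generate and latency_report, and the example endpoint, are illustrative assumptions rather than existing code.

import asyncio
import time
from typing import Dict, List, Tuple

import numpy as np
from text_generation import AsyncClient


async def timed_generate(client: AsyncClient, prompt: str, generate_kwargs: Dict) -> Tuple[float, int]:
    # Time a single generate() call and return (latency_seconds, decoded_tokens).
    start = time.perf_counter()
    response = await client.generate(prompt, **generate_kwargs)
    return time.perf_counter() - start, response.details.generated_tokens


async def latency_report(endpoint: str, prompts: List[str], generate_kwargs: Dict) -> None:
    # Fire all requests concurrently, then summarize per-request latency.
    client = AsyncClient(endpoint)
    results = await asyncio.gather(
        *[timed_generate(client, p, generate_kwargs) for p in prompts]
    )
    latencies = np.array([latency for latency, _ in results])
    print(f"p50 latency: {np.percentile(latencies, 50):0.3f} s")
    print(f"p95 latency: {np.percentile(latencies, 95):0.3f} s")
    print(f"max latency: {latencies.max():0.3f} s")


# Example usage (assumes a local server on port 8080, as in conf/config.yaml):
# asyncio.run(latency_report("http://127.0.0.1:8080", ["Hello"] * 8,
#                            {"max_new_tokens": 100, "decoder_input_details": True}))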