mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00
(ADD): client benchmarking utility & omegaconf
This commit is contained in:
parent
20ee71dcf5
commit
1455830909
90
clients/python/utils/client.py
Normal file
@@ -0,0 +1,90 @@
import asyncio
import logging
import time
from text_generation import AsyncClient

from typing import List, Dict
from torch import IntTensor
from transformers import AutoTokenizer
import numpy as np

from hydra import (
    compose,
    initialize,
)
from omegaconf import DictConfig, OmegaConf


log = logging.getLogger(__name__)
# initialize the Hydra subsystem
initialize(version_base=None, config_path="conf")
cfg: DictConfig = compose("config.yaml")
print(f"Experiment configuration:\n{OmegaConf.to_yaml(cfg)}")

# serving system deployment
max_concurrent_requests: int = cfg.deployment.max_concurrent_requests
if cfg.deployment.local:
    endpoint: str = f"{cfg.deployment.addr}:{cfg.deployment.port}"
else:
    endpoint = cfg.deployment.endpoint
client = AsyncClient(endpoint)

# uncomment to use a natural-language prompt instead of the dummy tokens below
# prompt: str = "Hello" * 100
# output_length (decoding length)
max_new_tokens: int = cfg.GenerationConfig.max_new_tokens
# generation strategy
repetition_penalty: float = cfg.GenerationConfig.repetition_penalty
do_sample: bool = cfg.GenerationConfig.do_sample
# generation finish reason
stop_sequences: List[str] = [stop for stop in cfg.GenerationConfig.stop_sequences]
tokenizer = AutoTokenizer.from_pretrained(cfg.macros.tokenizer_name)
# "id": 21820 = "Hello"
dummy_input: IntTensor = IntTensor([[21820]])
# inverse tokenization (sanity check)
# token: str = tokenizer.decode(dummy_input[0], skip_special_tokens=True)
sequence_length: int = cfg.GenerationConfig.sequence_length
multiple_dummy_repeat = dummy_input.repeat(1, sequence_length)
prompt: str = tokenizer.decode(multiple_dummy_repeat[0], skip_special_tokens=True)
assert np.shape(multiple_dummy_repeat)[-1] == sequence_length

generate_kwargs: Dict = {
    "do_sample": do_sample,
    "max_new_tokens": max_new_tokens,
    "repetition_penalty": repetition_penalty,
    "stop_sequences": stop_sequences,
    "decoder_input_details": True,
}

prompts: List[str] = [prompt] * max_concurrent_requests
# create one coroutine per concurrent request
coros = [client.generate(prompt, **generate_kwargs) for prompt in prompts]


async def batch():
    return await asyncio.gather(*coros)


st: int = time.perf_counter_ns()
results = asyncio.run(batch())
et = (time.perf_counter_ns() - st) * 1e-9
print(f"Serving elapsed time: {et:0.4f} seconds")
# check the last response
print(results[-1].details)

total_input_sequence_tokens: int = 0
total_decoded_tokens: int = 0

for prompt, response in zip(prompts, results):
    # uncomment to see the generations
    # print(prompt + response.generated_text)
    # assert np.shape(tokenizer(response.generated_text, return_tensors="pt").input_ids)[-1] - 1 == max_new_tokens
    # assert response.details.generated_tokens == max_new_tokens
    total_input_sequence_tokens += len(response.details.prefill)
    total_decoded_tokens += response.details.generated_tokens
assert total_input_sequence_tokens == max_concurrent_requests * sequence_length

# stats
print(f"Serving elapsed time: {et:0.4f} seconds")
print(f"Total requests: {max_concurrent_requests}")
print(f"Total sequence tokens: {total_input_sequence_tokens}")
print(f"Sequence length: {sequence_length}")
print(f"Total decoded tokens: {total_decoded_tokens}")
print(f"Throughput: {total_decoded_tokens / et} tokens/sec")
print(f"Total processed tokens: {total_decoded_tokens + total_input_sequence_tokens}")
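For reference, a minimal sketch (not part of this commit) of how the same conf/config.yaml could be re-composed with Hydra overrides to sweep the request concurrency across benchmark runs; the override key mirrors the config file added below, while the loop and its values are illustrative assumptions:

# Illustrative sketch only: re-compose the config with a Hydra override to vary
# the benchmarked concurrency; the chosen values are assumptions.
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf

with initialize(version_base=None, config_path="conf"):
    for n in (8, 16, 32, 64):
        cfg: DictConfig = compose(
            "config.yaml",
            overrides=[f"deployment.max_concurrent_requests={n}"],
        )
        print(OmegaConf.to_yaml(cfg.deployment))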
17
clients/python/utils/conf/config.yaml
Normal file
@@ -0,0 +1,17 @@
deployment:
  addr: http://127.0.0.1
  port: 8080
  max_concurrent_requests: 64
  local: True
  endpoint: http://guanaco-65b-merged.inference.takomo.internal

macros:
  tokenizer_name: bigscience/bloom-560m

GenerationConfig:
  max_new_tokens: 100
  repetition_penalty: 1.5
  do_sample: True
  stop_sequences:
    - "length"
  sequence_length: 100
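As a sanity check, a minimal sketch (again not part of the commit) of how this file resolves to the attribute access used in client.py when loaded directly with OmegaConf; the relative path assumes the repository root as the working directory:

# Illustrative sketch: load the YAML with OmegaConf and read the same keys client.py uses.
from omegaconf import OmegaConf

cfg = OmegaConf.load("clients/python/utils/conf/config.yaml")  # path is an assumption
assert cfg.deployment.max_concurrent_requests == 64
assert cfg.GenerationConfig.stop_sequences[0] == "length"
print(f"{cfg.deployment.addr}:{cfg.deployment.port}")  # -> http://127.0.0.1:8080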
10
server/text_generation_server/debug_backend.py
Normal file
@@ -0,0 +1,10 @@
# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve distilgpt2
import subprocess
import os
from pathlib import Path

wd_dir: Path = Path(__file__).parent.absolute()
cli_path: str = os.path.join(wd_dir, "cli.py")
os.environ["SAFETENSORS_FAST_GPU"] = "1"
command: str = f"python -m torch.distributed.run --nproc_per_node=1 {cli_path} serve bigscience/bloom-560m"
subprocess.run(command.split())
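subprocess.run blocks until the launched shard exits; below is a minimal sketch, assuming the same command, of starting it in the background instead so other tooling can run from the same process. The sleep is a crude stand-in for a real readiness check, and the duration is an assumption:

# Illustrative sketch: start the shard in the background and clean it up afterwards.
import os
import subprocess
import time
from pathlib import Path

wd_dir: Path = Path(__file__).parent.absolute()
cli_path: str = os.path.join(wd_dir, "cli.py")
os.environ["SAFETENSORS_FAST_GPU"] = "1"
command: str = f"python -m torch.distributed.run --nproc_per_node=1 {cli_path} serve bigscience/bloom-560m"
server = subprocess.Popen(command.split())
try:
    time.sleep(60)  # crude readiness wait (assumption); replace with a proper health check
    # ... run other tooling against the shard here ...
finally:
    server.terminate()
    server.wait()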