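// k6 load-generation script for text-generation-inference's OpenAI-compatible
// /v1/chat/completions endpoint: it streams responses over SSE and records
// latency and throughput metrics.
//
// Note: `k6/x/sse` is not part of stock k6; it needs a k6 binary built with
// an SSE extension via xk6 (a sketch, assuming the phymbert/xk6-sse module):
//
//   xk6 build --with github.com/phymbert/xk6-sse
//   ./k6 run script.js
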
import { check, fail } from 'k6';
import sse from 'k6/x/sse';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

// Target server and model under test
const host = "127.0.0.1:8080";
const model_id = "Qwen/Qwen2-72B";

// Custom metrics reported in the k6 summary
const endToEndLatency = new Trend('end_to_end_latency', true);
const requestThroughput = new Counter('request_throughput');
const tokenThroughput = new Counter('tokens_throughput');

const timeToFirstToken = new Trend('time_to_first_token', true);
const interTokenLatency = new Trend('inter_token_latency', true); // recorded in microseconds

const tokensReceived = new Trend('tokens_received');

// {{ ... }} placeholders are filled in when the Jinja template is rendered
const max_new_tokens = {{ max_new_tokens }};

const shareGPT = JSON.parse(open("{{ cwd }}/{{ input_filename }}"));

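// The input file is expected to be a JSON array of objects with a "message"
// field (see generate_payload below); illustrative shape, not a real sample:
//
//   [{"message": "Explain SSE in one sentence"},
//    {"message": "Write a haiku about GPUs"}]
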
export function handleSummary(data) {
    return {
        'summary.json': JSON.stringify(data),
    };
}

function generate_payload(gpt, max_new_tokens) {
    let input = gpt["message"];
    return {
        "messages": [{"role": "user", "content": input}],
        "temperature": 0,
        "model": `${model_id}`,
        "max_tokens": max_new_tokens,
        "stream": true
    };
}

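// Example payload for a prompt "Hi there" with max_new_tokens = 200
// (illustrative values, not from the repo):
//
//   {"messages": [{"role": "user", "content": "Hi there"}],
//    "temperature": 0, "model": "Qwen/Qwen2-72B",
//    "max_tokens": 200, "stream": true}
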
// get_options() is expected to be defined by the harness that renders this
// template (k6 scenarios, thresholds, etc.); it is not defined in this file.
export const options = get_options();

export default function run() {
    const headers = {'Content-Type': 'application/json'};
    // cycle through the ShareGPT prompts, one per iteration
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const url = `http://${host}/v1/chat/completions`;
    const params = {
        method: 'POST',
        body: payload,
        headers
    };

    // Per-request streaming state
    const startTime = Date.now();
    let firstTokenTime = null;
    let lastTokenTime = null;
    let tokensCount = 0;
    let response = "";

    const res = sse.open(url, params, function (client) {
        client.on('event', function (event) {
            if (parseInt(event.id) === 4) {
                client.close();
            }
            if (event.data.includes("[DONE]") || event.data === "") {
                return;
            }
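            // Each event's data field carries one streamed chat completion chunk,
            // e.g. {"choices":[{"delta":{"content":"Hi"},"finish_reason":null}]}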
            try {
                const data = JSON.parse(event.data);
                // parenthesized: plain `!'choices' in data` would never trigger
                if (!('choices' in data)) {
                    fail('http_200');
                    return;
                }
                const content = data['choices'][0]['delta']['content'];
                if (content !== undefined) {
                    response += content;
                    tokensCount += 1;
                }

                // Measure time to first token
                if (!firstTokenTime) {
                    firstTokenTime = Date.now();
                    timeToFirstToken.add(firstTokenTime - startTime);
                }

                // Measure inter-token latency (ms delta converted to µs)
                const currentTime = Date.now();
                if (lastTokenTime) {
                    interTokenLatency.add((currentTime - lastTokenTime) * 1000.);
                }
                lastTokenTime = currentTime;

                // A non-null finish_reason marks the end of the generation
                if ('finish_reason' in data['choices'][0]) {
                    if (data['choices'][0]['finish_reason'] != null) {
                        const endTime = Date.now();
                        const deltaMs = endTime - startTime;
                        endToEndLatency.add(deltaMs);
                        requestThroughput.add(1);
                        tokenThroughput.add(tokensCount);
                        tokensReceived.add(tokensCount);
                    }
                }
            } catch (e) {
                // Catch any errors that occur during event processing
                // and increase the fail count of the 'http_200' check
                check(true, {
                    'http_200': (val) => false,
                });
                fail('http_200');
            }
        });

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error());
        });
    });

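    // sse.open returns only once the stream has closed, so tokensCount
    // now holds its final value for this iteration.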
    if (tokensCount === 0) {
        // something went wrong with generation
        fail('http_200');
    }

    // 4xx responses return early and skip the http_200 check below
    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'http_200': (res) => res.status === 200,
    });
}