mirror of
				https://github.com/huggingface/text-generation-inference.git
				synced 2025-10-21 21:05:23 +00:00 
			
		
		
		
	
		
			
	
	
		
			95 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
		
		
			
		
	
	
			95 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
|  | import { check } from 'k6'; | ||
|  | import { scenario } from 'k6/execution'; | ||
|  | import http from 'k6/http'; | ||
|  | import { Trend, Counter } from 'k6/metrics'; | ||
|  | 
 | ||
|  | const host = __ENV.HOST; | ||
|  | const model_id = __ENV.MODEL_ID; | ||
|  | const timePerToken = new Trend('time_per_token', true); | ||
|  | const tokens = new Counter('tokens'); | ||
|  | const new_tokens = new Counter('new_tokens'); | ||
|  | const input_tokens = new Counter('input_tokens'); | ||
|  | const max_new_tokens = 50; | ||
|  | 
 | ||
|  | // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 | ||
|  | const shareGPT = JSON.parse(open("small.json")) | ||
|  | 
 | ||
|  | 
 | ||
|  | export function get_options() { | ||
|  |     return { | ||
|  |         thresholds: { | ||
|  |             http_req_failed: ['rate==0'], | ||
|  |             // time_per_token: [{
 | ||
|  |             //     threshold: `p(50)<${5 * reference_latency_ms}`,
 | ||
|  |             //     abortOnFail: true,
 | ||
|  |             //     delayAbortEval: '10s'
 | ||
|  |             // }],
 | ||
|  |         }, | ||
|  |         scenarios: { | ||
|  |             // single_user: {
 | ||
|  |             //     executor: 'constant-arrival-rate',
 | ||
|  |             //     duration: '60s',
 | ||
|  |             //     preAllocatedVUs: 1,
 | ||
|  |             //     rate: 20,
 | ||
|  |             //     timeUnit: '1s',
 | ||
|  |             // },
 | ||
|  |             // load_test: {
 | ||
|  |             //     executor: 'constant-arrival-rate',
 | ||
|  |             //     duration: '60s',
 | ||
|  |             //     preAllocatedVUs: 100,
 | ||
|  |             //     rate: 1,
 | ||
|  |             //     timeUnit: '1s',
 | ||
|  |             // },
 | ||
|  |             // breakpoint: {
 | ||
|  |             //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
 | ||
|  |             //     preAllocatedVUs: 300,
 | ||
|  |             //     stages: [
 | ||
|  |             //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
 | ||
|  |             //     ],
 | ||
|  |             // },
 | ||
|  |             throughput: { | ||
|  |                 executor: 'shared-iterations', | ||
|  |                 vus: 100, | ||
|  |                 iterations: 200, | ||
|  |                 maxDuration: '40s', | ||
|  |             }, | ||
|  |         }, | ||
|  |     }; | ||
|  | } | ||
|  | 
 | ||
|  | function generate_payload(gpt, max_new_tokens) { | ||
|  |     const input = gpt["conversations"][0]["value"]; | ||
|  |     return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens } | ||
|  | } | ||
|  | 
 | ||
|  | export const options = get_options(); | ||
|  | 
 | ||
|  | export default function run() { | ||
|  |     const headers = { 'Content-Type': 'application/json' }; | ||
|  |     const query = shareGPT[scenario.iterationInTest % shareGPT.length]; | ||
|  |     const payload = JSON.stringify(generate_payload(query, max_new_tokens)); | ||
|  |     const res = http.post(`http://${host}/v1/chat/completions`, payload, { | ||
|  |         headers, | ||
|  |     }); | ||
|  |     if (res.status >= 400 && res.status < 500) { | ||
|  |         return; | ||
|  |     } | ||
|  | 
 | ||
|  | 
 | ||
|  |     check(res, { | ||
|  |         'Post status is 200': (res) => res.status === 200, | ||
|  |     }); | ||
|  |     const duration = res.timings.duration; | ||
|  | 
 | ||
|  |     if (res.status === 200) { | ||
|  |         const body = res.json(); | ||
|  |         const completion_tokens = body.usage.completion_tokens; | ||
|  |         const latency_ms_per_token = duration / completion_tokens; | ||
|  |         timePerToken.add(latency_ms_per_token); | ||
|  |         const prompt_tokens = body.usage.prompt_tokens; | ||
|  |         input_tokens.add(prompt_tokens); | ||
|  |         new_tokens.add(completion_tokens); | ||
|  |         tokens.add(completion_tokens + prompt_tokens); | ||
|  |     } | ||
|  | } |