Nicolas Patry 2023-12-06 16:41:04 +00:00
parent d2b42f6883
commit 3a79fbc63e
4 changed files with 27 additions and 10 deletions


@@ -7,7 +7,9 @@ const seed = 0;
const host = __ENV.HOST || '127.0.0.1:8000';
const timePerToken = new Trend('time_per_token', true);
const throughput = new Counter('tokens_per_s');
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
randomSeed(seed);
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
@@ -19,7 +21,7 @@ export function get_options(reference_latency_ms){
thresholds: {
http_req_failed: ['rate==0'],
time_per_token: [{
threshold: `p(50)<${3 * reference_latency_ms}`,
threshold: `p(50)<${5 * reference_latency_ms}`,
abortOnFail: true,
delayAbortEval: '10s'
}],
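Note: together with the reference_latency_ms bump in the test file further down (30 ms to 70 ms), the relaxed multiplier moves the p50 gate from 90 ms to 350 ms per token. A minimal sketch of that arithmetic in Python; on the k6 side the threshold is just the interpolated string shown above.

# p(50) budget implied by the threshold above; the multiplier comes from this
# hunk, reference_latency_ms from the test file changed later in this commit.
old_budget_ms = 3 * 30   # previous gate: p(50) < 90 ms per token
new_budget_ms = 5 * 70   # new gate:      p(50) < 350 ms per token
threshold = f"p(50)<{new_budget_ms}"
print(threshold)         # -> p(50)<350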
@@ -28,7 +30,7 @@ export function get_options(reference_latency_ms){
load_test: {
executor: 'constant-arrival-rate',
duration: '60s',
preAllocatedVUs: 100,
preAllocatedVUs: 10,
rate: 10,
timeUnit: '1s',
},
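Note: lowering preAllocatedVUs from 100 to 10 lines up with the arrival rate; for a constant-arrival-rate executor, the number of VUs needed is roughly arrival rate times average request duration (Little's law). A rough sketch, where the request duration is an assumption rather than a measured value.

# Rough VU sizing for the constant-arrival-rate scenario above (Little's law).
rate_per_s = 10          # rate / timeUnit from the scenario
est_duration_s = 1.0     # assumed average duration of one generate request
concurrent = rate_per_s * est_duration_s
print(concurrent)        # ~10 in-flight iterations, matching preAllocatedVUs: 10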
@@ -48,17 +50,22 @@ export function run(host, generate_payload, max_new_tokens) {
return;
}
check(res, {
'Post status is 200': (r) => res.status === 200,
});
const n_tokens = max_new_tokens;
const timings = res.timings.duration;
const duration = res.timings.duration;
if (res.status === 200) {
const latency_ms_per_token = timings / n_tokens;
const body = res.json();
const n_tokens = body.details.tokens.length;
const latency_ms_per_token = duration / n_tokens;
timePerToken.add(latency_ms_per_token);
const latency_in_s = latency_ms_per_token / 1000;
const individual_throughput = 1 / latency_in_s;
throughput.add(individual_throughput);
const _input_tokens = body.details.prefill.length;
tokens.add(n_tokens + _input_tokens);
input_tokens.add(_input_tokens);
new_tokens.add(n_tokens);
}
}
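Note: run() now takes the generated-token count from the response body instead of assuming max_new_tokens, and splits the counters into input tokens and newly generated tokens. A minimal sketch of the same bookkeeping, with made-up example values; the field names follow the script above.

# Per-request metric math from run() above, with illustrative numbers.
duration_ms = 2000.0      # res.timings.duration
n_tokens = 50             # body.details.tokens.length (tokens actually generated)
input_tokens = 200        # body.details.prefill.length (prompt tokens)

latency_ms_per_token = duration_ms / n_tokens                 # 40 ms/token -> time_per_token
throughput_tokens_per_s = 1 / (latency_ms_per_token / 1000)   # 25 tokens/s
total_tokens = n_tokens + input_tokens                        # tokens counter: 250
print(latency_ms_per_token, throughput_tokens_per_s, total_tokens)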


@@ -1,13 +1,13 @@
import { get_options, run } from "./common.js";
const reference_latency_ms = 30;
const reference_latency_ms = 70;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;
function generate_payload(gpt){
const input = gpt["conversations"][0]["value"];
return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "decoder_input_details": true}}
}
export const options = get_options(reference_latency_ms);
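Note: decoder_input_details asks the server to report details about the prompt (prefill) tokens, which common.js above reads from body.details.prefill. A hedged sketch of the same request outside k6 using python-requests; the /generate route, the prompt text, and the exact shape of the details object are assumptions here.

# Sketch: the payload from generate_payload() sent with python-requests.
# Assumes a text-generation-inference style /generate route and that the
# response carries a details object with prefill/tokens arrays.
import requests

host = "127.0.0.1:8000"
payload = {
    "inputs": "What is the capital of France?",  # placeholder prompt
    "parameters": {"max_new_tokens": 50, "decoder_input_details": True},
}
res = requests.post(f"http://{host}/generate", json=payload, timeout=60)
details = res.json().get("details") or {}
print(len(details.get("prefill", [])), len(details.get("tokens", [])))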


@@ -820,10 +820,20 @@ class FlashCausalLM(Model):
else:
next_token_logits = out
# import datetime
# from loguru import logger
# start = datetime.datetime.now()
next_input_ids, next_token_logprobs, logprobs, accepted_ids, speculative_ids = batch.next_token_chooser(
batch.all_input_ids_tensor[:, : batch.max_seqlen], next_token_logits, get_speculate(), batch.speculative_ids, speculative_logits
)
# took = datetime.datetime.now() - start
# logger.info(f"Next token chooser {batch.all_input_ids_tensor.shape} took {took}")
# if batch.all_input_ids_tensor.shape[1] < 2000 and took > datetime.timedelta(milliseconds=5):
# next_input_ids, next_token_logprobs, logprobs, accepted_ids, speculative_ids = batch.next_token_chooser(
# batch.all_input_ids_tensor[:, : batch.max_seqlen], next_token_logits, get_speculate(), batch.speculative_ids, speculative_logits, verbose=True
# )
# import ipdb;ipdb.set_trace()
batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs
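Note: the commented-out block above is a timing probe around next_token_chooser. A standalone sketch of the same pattern (datetime plus loguru, as in the comments); the chooser callable and its inputs are placeholders.

# Time one call and log it when it exceeds a small budget, mirroring the
# commented-out profiling above. `chooser` and `inputs` are placeholders.
import datetime
from loguru import logger

def timed_call(chooser, *inputs, budget=datetime.timedelta(milliseconds=5)):
    start = datetime.datetime.now()
    result = chooser(*inputs)
    took = datetime.datetime.now() - start
    if took > budget:
        logger.info(f"Next token chooser took {took} (budget {budget})")
    return result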


@@ -33,7 +33,7 @@ class MedusaModel(torch.nn.Module):
def forward(self, x):
logits = self.lm_head(x)
speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
return logits, speculative_logits
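Note: for context on the torch.stack over the Medusa heads shown above, a toy sketch of the shapes involved; the batch, sequence, vocabulary, and head counts below are made up for illustration.

# Toy shapes for the stacked Medusa head outputs; only the stacking pattern
# matches the model code above, all sizes are invented.
import torch

batch, seq, hidden, vocab, n_heads = 2, 4, 8, 16, 3
x = torch.randn(batch, seq, hidden)
lm_head = torch.nn.Linear(hidden, vocab)
heads = [torch.nn.Linear(hidden, vocab) for _ in range(n_heads)]

logits = lm_head(x)                                             # (2, 4, 16)
speculative_logits = torch.stack([h(x) for h in heads], dim=1)  # (2, 3, 4, 16)
print(logits.shape, speculative_logits.shape)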