mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-07 09:52:18 +00:00
Update server tests
- Default to throughput test in k6
- Use TGI_WIGGLE_ROOM to adjust wiggle room
This commit is contained in:
parent
12325564dc
commit
8d01848370
@ -33,13 +33,13 @@ export function get_options() {
|
|||||||
// rate: 20,
|
// rate: 20,
|
||||||
// timeUnit: '1s',
|
// timeUnit: '1s',
|
||||||
// },
|
// },
|
||||||
load_test: {
|
// load_test: {
|
||||||
executor: 'constant-arrival-rate',
|
// executor: 'constant-arrival-rate',
|
||||||
duration: '60s',
|
// duration: '60s',
|
||||||
preAllocatedVUs: 100,
|
// preAllocatedVUs: 100,
|
||||||
rate: 1,
|
// rate: 1,
|
||||||
timeUnit: '1s',
|
// timeUnit: '1s',
|
||||||
},
|
// },
|
||||||
// breakpoint: {
|
// breakpoint: {
|
||||||
// executor: 'ramping-arrival-rate', //Assure load increase if the system slows
|
// executor: 'ramping-arrival-rate', //Assure load increase if the system slows
|
||||||
// preAllocatedVUs: 300,
|
// preAllocatedVUs: 300,
|
||||||
@ -47,12 +47,12 @@ export function get_options() {
|
|||||||
// { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
|
// { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
|
||||||
// ],
|
// ],
|
||||||
// },
|
// },
|
||||||
// throughput: {
|
throughput: {
|
||||||
// executor: 'shared-iterations',
|
executor: 'shared-iterations',
|
||||||
// vus: 100,
|
vus: 100,
|
||||||
// iterations: 200,
|
iterations: 200,
|
||||||
// maxDuration: '40s',
|
maxDuration: '40s',
|
||||||
// },
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,8 @@ import pytest
|
|||||||
import os
|
import os
|
||||||
from text_generation_server.pb import generate_pb2
|
from text_generation_server.pb import generate_pb2
|
||||||
|
|
||||||
os.environ["USE_PREFIX_CACHING"] = "0"
|
os.environ["USE_PREFIX_CACHING"] = "1"
|
||||||
os.environ["ATTENTION"] = "flashdecoding"
|
os.environ["ATTENTION"] = "flashinfer"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -43,6 +43,7 @@ from text_generation_server.models.globals import (
|
|||||||
ATTENTION,
|
ATTENTION,
|
||||||
BLOCK_SIZE,
|
BLOCK_SIZE,
|
||||||
CUDA_GRAPHS,
|
CUDA_GRAPHS,
|
||||||
|
TGI_WIGGLE_ROOM,
|
||||||
get_adapter_to_index,
|
get_adapter_to_index,
|
||||||
)
|
)
|
||||||
from text_generation_server.layers.attention import Seqlen
|
from text_generation_server.layers.attention import Seqlen
|
||||||
@ -1283,7 +1284,7 @@ class FlashCausalLM(Model):
|
|||||||
|
|
||||||
num_blocks = (
|
num_blocks = (
|
||||||
# Leave 5% for some wiggle room
|
# Leave some wiggle room: only the TGI_WIGGLE_ROOM fraction of free memory is used (0.95 by default, i.e. 5% headroom)
|
||||||
int((free_memory * 0.95) // total_cache_size)
|
int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size)
|
||||||
# Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
|
# Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
|
||||||
+ batch_num_blocks
|
+ batch_num_blocks
|
||||||
)
|
)
|
||||||
|
@ -14,10 +14,13 @@ assert (
|
|||||||
), f"Attention is not valid {ATTENTION}, expected {_expected}"
|
), f"Attention is not valid {ATTENTION}, expected {_expected}"
|
||||||
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
log_master(logger.info, f"Using Attention = {ATTENTION}")
|
||||||
|
|
||||||
# if PREFIX_CACHING and ATTENTION != "flashinfer":
|
if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}:
|
||||||
# raise RuntimeError("Prefix caching is only supported with flashinfer")
|
raise RuntimeError("Prefix caching is only supported with flashinfer")
|
||||||
|
|
||||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||||
|
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
|
||||||
|
assert TGI_WIGGLE_ROOM > 0
|
||||||
|
assert TGI_WIGGLE_ROOM < 1
|
||||||
|
|
||||||
# This is overridden by the cli
|
# This is overridden by the cli
|
||||||
BLOCK_SIZE: int
|
BLOCK_SIZE: int
|
||||||
|
Loading…
Reference in New Issue
Block a user