Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-27 04:52:07 +00:00
Put more wiggle room. (#3189)
* Put more wiggle room.
* Fixing the Makefile by using the lockfile.
* Pre-commit
parent 375802948d
commit 39cfe232fd
server/Makefile

@@ -10,6 +10,7 @@ include Makefile-flashinfer
 unit-tests:
 	pip install -U pip uv
-	uv pip install -e ".[dev]"
+	uv sync --inexact --extra dev --active
 	pytest -s -vv -m "not private" tests
 
 gen-server:

@@ -30,14 +31,14 @@ gen-server-raw:
 	touch text_generation_server/pb/__init__.py
 
 install-server: gen-server
-	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv sync --inexact --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active
 
 
 install: install-cuda
 	echo "Installed server"
 
 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv sync --inexact --extra attention --extra bnb --extra marlin --extra moe --active
 	uv pip install nvidia-nccl-cu12==2.22.3
 	kernels download .
 
server/text_generation_server/models/globals.py

@@ -28,7 +28,8 @@ if PREFIX_CACHING and ATTENTION not in {
     raise RuntimeError("Prefix caching is only supported with flashinfer")
 
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93"))
+# Test a 70B model on 4xA100 under load for latest failure
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1
 
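For context, TGI_WIGGLE_ROOM is the safety factor applied to the free GPU memory measured during warmup when sizing allocations such as the KV cache; dropping it from 0.93 to 0.90 leaves roughly three more percentage points of headroom, which the 70B-on-4xA100 load test mentioned in the new comment needed. A minimal sketch of how such a factor is typically applied follows; the helper name and its use of torch.cuda.mem_get_info are illustrative assumptions, not the actual TGI warmup code.

import os

import torch

# Safety factor strictly between 0 and 1: the fraction of currently free GPU
# memory the server allows itself to claim; the remainder is the "wiggle room".
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert 0.0 < TGI_WIGGLE_ROOM < 1.0


def memory_budget_bytes(device: int = 0) -> int:
    # Illustrative helper (not from the TGI codebase): scale the free device
    # memory by the wiggle-room factor to get an allocation budget, e.g. for
    # deciding how many KV-cache blocks fit. Lowering the factor shrinks the
    # budget and leaves more slack for fragmentation and untracked allocations.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device)
    return int(free_bytes * TGI_WIGGLE_ROOM)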