diff --git a/server/Makefile b/server/Makefile
index 3abc917e..cf6c7370 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -10,6 +10,7 @@ include Makefile-flashinfer
 unit-tests:
 	pip install -U pip uv
 	uv pip install -e ".[dev]"
+	uv sync --inexact --extra dev --active
 	pytest -s -vv -m "not private" tests
 
 gen-server:
@@ -30,14 +31,14 @@ gen-server-raw:
 	touch text_generation_server/pb/__init__.py
 
 install-server: gen-server
-	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv sync --inexact --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active
 
 install: install-cuda
 	echo "Installed server"
 
 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv sync --inexact --extra attention --extra bnb --extra marlin --extra moe --active
 	uv pip install nvidia-nccl-cu12==2.22.3
 	kernels download .
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 19696372..61ff6a13 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -28,7 +28,8 @@ if PREFIX_CACHING and ATTENTION not in {
     raise RuntimeError("Prefix caching is only supported with flashinfer")
 
 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93"))
+# Test a 70B model on 4xA100 under load for latest failure
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1
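
Note on the second hunk: the diff only shows the default for `TGI_WIGGLE_ROOM` dropping from 0.93 to 0.90. The sketch below illustrates how a safety factor like this is typically applied when budgeting free GPU memory for the KV cache; the environment variable and default come from the diff, but `free_gpu_memory`, `CACHE_BLOCK_BYTES`, and `plan_kv_cache_blocks` are hypothetical stand-ins, not TGI's actual warmup code.

```python
import os

import torch

# Same knob as in globals.py above; everything else here is illustrative.
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert 0 < TGI_WIGGLE_ROOM < 1

CACHE_BLOCK_BYTES = 2 * 1024 * 1024  # assumed size of one KV-cache block


def free_gpu_memory(device: torch.device) -> int:
    """Bytes currently free on the device (hypothetical helper)."""
    free, _total = torch.cuda.mem_get_info(device)
    return free


def plan_kv_cache_blocks(device: torch.device) -> int:
    """Budget KV-cache blocks from free memory, scaled by the wiggle room.

    The factor leaves headroom for CUDA graph capture and allocator
    fragmentation; lowering it from 0.93 to 0.90 trades a slightly smaller
    cache for fewer out-of-memory failures under load.
    """
    budget = int(free_gpu_memory(device) * TGI_WIGGLE_ROOM)
    return budget // CACHE_BLOCK_BYTES
```

On the Makefile hunks: `uv sync --extra <name>` installs the named optional-dependency groups from the project's lockfile, `--inexact` keeps packages that are already present but not in the lockfile, and `--active` targets the currently activated virtual environment, which is roughly equivalent in effect to the `uv pip install -e ".[...]"` lines it replaces.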