Moving to uv instead of poetry. (#2919)

* Moving to `uv` instead of `poetry`.

More standard, faster, and with a seemingly better lockfile.

* Creating venv if not created.

* Create the venv.

* Fix?

* Fixing the test by activating the environment?

* Install system?

* Add the CLI entry point.

* Docker install on system.

* Monkeying this...

* `--system` is redundant.

* Trying to force-include this pb folder.

* Trying to check that pb is imported correctly.

* Editable install necessary?

* Non-editable?

* Editable it is.
Nicolas Patry committed 2025-01-17 12:32:00 +01:00 (via GitHub)
parent d61f14f271
commit de19e7e844
9 changed files with 3536 additions and 4208 deletions
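
In short: the per-hardware `requirements_*.txt` files and the `poetry export` step disappear, and everything installs straight from `pyproject.toml`. A condensed sketch of the new developer flow, using only commands that appear in the diffs below:

    pip install -U pip uv                    # bootstrap uv itself
    uv venv && source ./.venv/bin/activate   # create and enter the venv
    uv pip install -e ".[dev]"               # editable install, extras from pyproject.toml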


@@ -44,10 +44,14 @@ jobs:
         run: |
           sudo apt update
           sudo apt install python3.11-dev -y
+          pip install -U pip uv
+          uv venv
+          source ./.venv/bin/activate
           make install-cpu
       - name: Run server tests
         run: |
-          pip install pytest
+          source ./.venv/bin/activate
+          uv pip install pytest
           export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
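
Each step spawns a fresh shell, which is why the test step re-activates the venv that `uv pip` installed into. The same flow can be reproduced locally; a minimal sketch, assuming a repo checkout with Python 3.11 available:

    pip install -U pip uv
    uv venv                        # creates ./.venv
    source ./.venv/bin/activate    # uv pip now targets ./.venv
    make install-cpu
    uv pip install pytest
    pytest -s -vv server/tests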


@@ -224,17 +224,19 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
 COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/

 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir

 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_cuda.txt && \
-    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
-    pip install nvidia-nccl-cu12==2.22.3
+    python -c "from text_generation_server.pb import generate_pb2" && \
+    pip install -U pip uv && \
+    uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    uv pip install nvidia-nccl-cu12==2.22.3

 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
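
The added `ENV UV_SYSTEM_PYTHON=1` tells `uv pip` to target the system interpreter instead of requiring a virtualenv, which is why the `--system` flag tried earlier in the commit history became redundant. As a sketch, inside the image these two invocations behave the same:

    UV_SYSTEM_PYTHON=1 uv pip install einops   # what the ENV line enables for every later uv call
    uv pip install --system einops             # the now-redundant explicit flag

The `python -c "from text_generation_server.pb import generate_pb2"` line is a build-time sanity check that the generated pb package actually lands in the editable install.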


@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
     /opt/conda/bin/conda clean -ya

 # Install flash-attention, torch dependencies
-RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*

 RUN conda install mkl=2021
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -318,10 +318,11 @@ COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark


@@ -108,10 +108,11 @@ RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -211,10 +212,11 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark


@@ -853,11 +853,11 @@
       ]
     },
     "locked": {
-      "lastModified": 1736907983,
-      "narHash": "sha256-fw55wVwpJW36Md2HZBKuxX3YHGeqsGsspPLtCMVr1Y8=",
+      "lastModified": 1736994333,
+      "narHash": "sha256-v4Jrok5yXsZ6dwj2+2uo5cSyUi9fBTurHqHvNHLT1XA=",
       "owner": "oxalica",
       "repo": "rust-overlay",
-      "rev": "eaa365c911441e07e387ff6acc596619fc50b156",
+      "rev": "848db855cb9e88785996e961951659570fc58814",
       "type": "github"
     },
     "original": {


@@ -9,11 +9,14 @@ include Makefile-exllamav2
 include Makefile-flashinfer

 unit-tests:
+	pip install -U pip uv
+	uv pip install -e ".[dev]"
 	pytest -s -vv -m "not private" tests

 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	pip install -U pip uv
+	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -21,24 +24,14 @@ gen-server:
 	touch text_generation_server/pb/__init__.py

 install-server: gen-server
-	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"

 install: install-cuda
 	echo "Installed server"

 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	pip install -e ".[attention,bnb,marlin,moe]"
-	pip install nvidia-nccl-cu12==2.22.3
+	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv pip install nvidia-nccl-cu12==2.22.3

 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
-
-run-dev:
-	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
-
-export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
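
With the `export-requirements` target gone, installs go through uv directly; the expected invocations from the repo root would be:

    cd server
    make install-cuda   # gen-server, then editable install of the CUDA extras via uv
    make unit-tests     # installs ".[dev]" and runs the non-private tests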

server/poetry.lock (generated file, 4101 changed lines): deleted; diff suppressed because it is too large.


@@ -1,96 +1,97 @@
-[tool.poetry]
+[project]
 name = "text-generation-server"
 version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
-authors = ["Olivier Dehaene <olivier@huggingface.co>"]
+readme = "README.md"
+requires-python = ">=3.9"
+authors = [
+    {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
+    {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
+]
+dependencies = [
+    "einops>=0.8.0",
+    "grpc-interceptor>=0.15.4",
+    "grpcio>=1.67.0",
+    "grpcio-reflection>=1.67.0",
+    "grpcio-status>=1.67.0",
+    "hf-transfer>=0.1.8",
+    "loguru>=0.7.3",
+    "numpy>=1.26,<3",
+    "opentelemetry-api>=1.27.0",
+    "opentelemetry-exporter-otlp>=1.27.0",
+    "opentelemetry-instrumentation-grpc>=0.50b0",
+    "pillow>=11.1.0",
+    "prometheus-client>=0.21.0",
+    "protobuf>=5.28.3",
+    "py-cpuinfo>=9.0.0",
+    "rich>=13.8.1",
+    "safetensors>=0.4.5",
+    "scipy>=1.13.1",
+    "sentencepiece>=0.2.0",
+    "tokenizers>=0.20.3",
+    "typer>=0.15.1",
+]

-[tool.poetry.scripts]
-text-generation-server = 'text_generation_server.cli:app'
+[project.scripts]
+text-generation-server = "text_generation_server.cli:app"

-[tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-protobuf = ">=4.25.3,<6"
-grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
-grpc-interceptor = "^0.15.4"
-typer = "^0.12.5"
-accelerate = {version = "^1.1.0", optional = true}
-bitsandbytes = { version = "^0.45.0", optional = true }
-safetensors = "^0.4.5"
-loguru = "^0.7.2"
-opentelemetry-api = "^1.27.0"
-opentelemetry-exporter-otlp = "^1.27.0"
-opentelemetry-instrumentation-grpc = "^0.48b0"
-hf-transfer = "^0.1.2"
-sentencepiece = "^0.2.0"
-tokenizers = "^0.20.3"
-huggingface-hub = "^0.23"
-transformers = "^4.46.2"
-einops = "^0.8.0"
-texttable = { version = "^1.6.7", optional = true }
-datasets = {version = "^2.21.0", optional = true}
-peft = {version = "^0.13.2", optional = true}
-torch = {version = "^2.4.1", optional = true}
-scipy = "^1.13.1"
-pillow = "^11.0.0"
-outlines= {version = "^0.1.3", optional = true}
-prometheus-client = ">=0.20.0,<0.22"
-py-cpuinfo = "^9.0.0"
-compressed-tensors = {version = "^0.7.1", optional = true}
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26.4"
+[project.optional-dependencies]
+accelerate = [
+    "accelerate>=1.2.1,<2",
+]
+bnb = [
+    "bitsandbytes>=0.45.0",
+]
+compressed-tensors = [
+    "compressed-tensors>=0.9.0",
+]
+peft = [
+    "peft>=0.14.0",
+]
+outlines = [
+    "outlines>=0.1.13",
+]
+dev = [
+    "grpcio-tools>=1.51.1,<2.0",
+    "pytest>=7.3.0,<8"
+]
+quantize = [
+    "texttable>=1.6.7,<2",
+    "datasets>=2.21,<3",
+]
+moe = [ "moe-kernels" ]
+attention = [ "attention-kernels" ]
+marlin = [ "marlin-kernels" ]
+gen = [
+    "grpcio-tools>=1.69.0",
+    "mypy-protobuf>=3.6.0",
+]

+[tool.uv.sources]
 attention-kernels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 marlin-kernels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 moe-kernels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-rich = "^13.8.1"

-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-attention = ["attention-kernels"]
-bnb = ["bitsandbytes"]
-compressed-tensors = ["compressed-tensors"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
-
-[tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
-pytest = "^7.3.0"
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"
-
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
-build-backend = "poetry.core.masonry.api"
-
 [tool.isort]
 profile = "black"
+
+[tool.setuptools.packages.find]
+include = ["text_generation_server*"]
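
Under `[tool.uv.sources]`, uv selects the wheel whose environment marker matches the running interpreter, replacing poetry's per-URL `python = "~3.x"` syntax. A sketch of how the kernel extras resolve, assuming Python 3.11 (the wheel tags also pin cu123/torch2.4):

    uv pip install -e ".[attention, marlin, moe]"
    # attention-kernels, marlin-kernels and moe-kernels resolve to the cp311
    # wheels listed above via the python_version == '3.11' markers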

server/uv.lock (new file, 3426 lines): diff suppressed because it is too large.
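
The new lockfile replaces `server/poetry.lock`; presumably it was produced with uv's standard locking command, which resolves `pyproject.toml` into a cross-platform `uv.lock`:

    cd server && uv lock   # rewrites uv.lock from pyproject.toml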