Moving to `uv` instead of `poetry`. (#2919)

* Moving to `uv` instead of `poetry`. More standard, faster, seemingly better lockfile.
* Creating venv if not created.
* Create the venv.
* Fix?
* Fixing the test by activating the environment?
* Install system?
* Add the cli entry point.
* docker install on system
* Monkeying this...
* `--system` is redundant.
* Trying to force-include this pb folder.
* Trying to check that pb is imported correctly.
* Editable install necessary?
* Non editable?
* Editable it is.
parent d61f14f271
commit de19e7e844
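The trail of fixup commits above documents one real constraint of the migration: `make gen-server` writes the generated protobuf stubs into `text_generation_server/pb/` inside the source tree, and a regular wheel build did not pick that folder up ("Trying to force-include this pb folder"), so the import check only passed once the package was installed in editable mode. A minimal sketch of that check, assuming it is run from the `server/` directory:

    make gen-server                  # generates text_generation_server/pb/*.py in the source tree
    uv pip install -e "."            # editable: the installed package points back at the source tree
    python -c "from text_generation_server.pb import generate_pb2"   # the import check used in the Dockerfile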
.github/workflows/tests.yaml (6 changes)
@@ -44,10 +44,14 @@ jobs:
       run: |
         sudo apt update
         sudo apt install python3.11-dev -y
+        pip install -U pip uv
+        uv venv
+        source ./.venv/bin/activate
         make install-cpu
     - name: Run server tests
       run: |
-        pip install pytest
+        source ./.venv/bin/activate
+        uv pip install pytest
         export HF_TOKEN=${{ secrets.HF_TOKEN }}
         pytest -s -vv server/tests
     - name: Pre-commit checks
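The workflow now provisions its own virtual environment with uv rather than installing into the runner's Python. Since every `run:` step starts a fresh shell, the environment must be re-activated in each step that needs it, which is why `source ./.venv/bin/activate` appears twice. Roughly the same flow, reproduced locally (a sketch, not the exact CI environment):

    pip install -U pip uv            # bootstrap uv via the system pip
    uv venv                          # create ./.venv, uv's default location
    source ./.venv/bin/activate      # later python/pytest calls resolve inside .venv
    make install-cpu
    uv pip install pytest
    pytest -s -vv server/tests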
Dockerfile (10 changes)
@@ -224,17 +224,19 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
 COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir
 
 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_cuda.txt && \
-    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
-    pip install nvidia-nccl-cu12==2.22.3
+    python -c "from text_generation_server.pb import generate_pb2" && \
+    pip install -U pip uv && \
+    uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    uv pip install nvidia-nccl-cu12==2.22.3
 
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
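The Docker images install into the system (conda) interpreter rather than a venv, so `ENV UV_SYSTEM_PYTHON=1` is set to make `uv pip` target it directly; this is also why the commit notes that passing `--system` explicitly became redundant. The two forms are equivalent:

    # with the Dockerfile's environment variable set
    export UV_SYSTEM_PYTHON=1
    uv pip install nvidia-nccl-cu12==2.22.3

    # without it, the same behavior requires the explicit flag
    uv pip install --system nvidia-nccl-cu12==2.22.3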
Dockerfile_amd

@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
     /opt/conda/bin/conda clean -ya
 
 # Install flash-attention, torch dependencies
-RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
 
 RUN conda install mkl=2021
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -318,10 +318,11 @@ COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
Dockerfile_intel

@@ -108,10 +108,11 @@ RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -211,10 +212,11 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
flake.lock

@@ -853,11 +853,11 @@
       ]
     },
     "locked": {
-      "lastModified": 1736907983,
-      "narHash": "sha256-fw55wVwpJW36Md2HZBKuxX3YHGeqsGsspPLtCMVr1Y8=",
+      "lastModified": 1736994333,
+      "narHash": "sha256-v4Jrok5yXsZ6dwj2+2uo5cSyUi9fBTurHqHvNHLT1XA=",
       "owner": "oxalica",
       "repo": "rust-overlay",
-      "rev": "eaa365c911441e07e387ff6acc596619fc50b156",
+      "rev": "848db855cb9e88785996e961951659570fc58814",
       "type": "github"
     },
     "original": {
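The flake.lock change is an ordinary pin bump of the `rust-overlay` input (new `rev`, `narHash`, and `lastModified`), unrelated to the uv migration itself. A bump like this is typically produced with something along the lines of:

    nix flake lock --update-input rust-overlay   # refresh one input's pin in flake.lock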
server/Makefile

@@ -9,11 +9,14 @@ include Makefile-exllamav2
 include Makefile-flashinfer
 
 unit-tests:
+	pip install -U pip uv
+	uv pip install -e ".[dev]"
 	pytest -s -vv -m "not private" tests
 
 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	pip install -U pip uv
+	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -21,24 +24,14 @@ gen-server:
 	touch text_generation_server/pb/__init__.py
 
 install-server: gen-server
-	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
 
 
 install: install-cuda
 	echo "Installed server"
 
 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	pip install -e ".[attention,bnb,marlin,moe]"
-	pip install nvidia-nccl-cu12==2.22.3
+	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv pip install nvidia-nccl-cu12==2.22.3
 
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
 
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
 
-export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
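With the `export-requirements` target and the generated requirements_*.txt files gone, the extras in pyproject.toml are the single source of truth, and every target bootstraps uv before installing. A typical local setup after this change, assuming a CUDA machine and a repository checkout (a sketch):

    cd server
    make install-cuda                # gen-server, then editable install of the CUDA extras
    text-generation-server --help    # CLI entry point declared in [project.scripts]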
server/poetry.lock (generated; removed in this commit, 4101 lines)

File diff suppressed because it is too large.
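The poetry lockfile is deleted outright; its replacement, server/uv.lock, is added below. For reference, uv maintains its lockfile with standard commands that are not part of this diff:

    uv lock    # resolve the pyproject.toml dependencies into uv.lock
    uv sync    # install the environment exactly as pinned in uv.lock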
server/pyproject.toml

@@ -1,96 +1,97 @@
-[tool.poetry]
+[project]
 name = "text-generation-server"
 version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
-authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 readme = "README.md"
+requires-python = ">=3.9"
+authors = [
+    {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
+    {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
+]
+dependencies = [
+    "einops>=0.8.0",
+    "grpc-interceptor>=0.15.4",
+    "grpcio>=1.67.0",
+    "grpcio-reflection>=1.67.0",
+    "grpcio-status>=1.67.0",
+    "hf-transfer>=0.1.8",
+    "loguru>=0.7.3",
+    "numpy>=1.26,<3",
+    "opentelemetry-api>=1.27.0",
+    "opentelemetry-exporter-otlp>=1.27.0",
+    "opentelemetry-instrumentation-grpc>=0.50b0",
+    "pillow>=11.1.0",
+    "prometheus-client>=0.21.0",
+    "protobuf>=5.28.3",
+    "py-cpuinfo>=9.0.0",
+    "rich>=13.8.1",
+    "safetensors>=0.4.5",
+    "scipy>=1.13.1",
+    "sentencepiece>=0.2.0",
+    "tokenizers>=0.20.3",
+    "typer>=0.15.1",
+]
 
-[tool.poetry.scripts]
-text-generation-server = 'text_generation_server.cli:app'
+[project.scripts]
+text-generation-server = "text_generation_server.cli:app"
 
-[tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-protobuf = ">=4.25.3,<6"
-grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
-grpc-interceptor = "^0.15.4"
-typer = "^0.12.5"
-accelerate = {version = "^1.1.0", optional = true}
-bitsandbytes = { version = "^0.45.0", optional = true }
-safetensors = "^0.4.5"
-loguru = "^0.7.2"
-opentelemetry-api = "^1.27.0"
-opentelemetry-exporter-otlp = "^1.27.0"
-opentelemetry-instrumentation-grpc = "^0.48b0"
-hf-transfer = "^0.1.2"
-sentencepiece = "^0.2.0"
-tokenizers = "^0.20.3"
-huggingface-hub = "^0.23"
-transformers = "^4.46.2"
-einops = "^0.8.0"
-texttable = { version = "^1.6.7", optional = true }
-datasets = {version = "^2.21.0", optional = true}
-peft = {version = "^0.13.2", optional = true}
-torch = {version = "^2.4.1", optional = true}
-scipy = "^1.13.1"
-pillow = "^11.0.0"
-outlines= {version = "^0.1.3", optional = true}
-prometheus-client = ">=0.20.0,<0.22"
-py-cpuinfo = "^9.0.0"
-compressed-tensors = {version = "^0.7.1", optional = true}
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26.4"
+[project.optional-dependencies]
+accelerate = [
+    "accelerate>=1.2.1,<2",
+]
+bnb = [
+    "bitsandbytes>=0.45.0",
+]
+compressed-tensors = [
+    "compressed-tensors>=0.9.0",
+]
+peft = [
+    "peft>=0.14.0",
+]
+outlines = [
+    "outlines>=0.1.13",
+]
+dev = [
+    "grpcio-tools>=1.51.1,<2.0",
+    "pytest>=7.3.0,<8"
+]
+quantize = [
+    "texttable>=1.6.7,<2",
+    "datasets>=2.21,<3",
+]
+moe = [ "moe-kernels" ]
+attention = [ "attention-kernels" ]
+marlin = [ "marlin-kernels" ]
+gen = [
+    "grpcio-tools>=1.69.0",
+    "mypy-protobuf>=3.6.0",
+]
 
+[tool.uv.sources]
 attention-kernels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 marlin-kernels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 moe-kernels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-rich = "^13.8.1"
 
-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-attention = ["attention-kernels"]
-bnb = ["bitsandbytes"]
-compressed-tensors = ["compressed-tensors"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
-
-[tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
-pytest = "^7.3.0"
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"
 
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
 
-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
-build-backend = "poetry.core.masonry.api"
-
 [tool.isort]
 profile = "black"
+
+[tool.setuptools.packages.find]
+include = ["text_generation_server*"]
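The poetry-specific tables give way to standard PEP 621 metadata ([project], [project.scripts], [project.optional-dependencies]), and the prebuilt kernel wheels move from poetry's `python = "~3.x"` constraints to [tool.uv.sources] entries gated by PEP 508 `python_version` markers. When an extra is requested, uv resolves the wheel URL whose marker matches the running interpreter; for example (a sketch, assuming Python 3.11):

    uv pip install -e ".[marlin]"
    # resolves marlin-kernels to the cp311-cp311-linux_x86_64 wheel URL
    # pinned under [tool.uv.sources] for python_version == '3.11'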
server/uv.lock (new file, 3426 lines)

File diff suppressed because it is too large.