Moving to uv instead of poetry.

More standards-compliant, faster, and with a seemingly better lockfile.
Nicolas Patry 2025-01-16 13:43:16 +01:00
parent 5f78ec32a5
commit 9177fbfda6
7 changed files with 3521 additions and 4205 deletions
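
The pattern is the same across the Dockerfiles and the Makefile below: instead of exporting poetry.lock to per-backend requirements_*.txt files and installing those with pip, each build now bootstraps uv with pip and installs the project extras straight from pyproject.toml. A minimal before/after sketch (extras abbreviated; the real invocations are in the diffs that follow):

# Before: install from an exported requirements file, then the project
pip install -r requirements_cuda.txt
pip install ".[accelerate, peft, outlines]" --no-cache-dir

# After: bootstrap uv once, then let it resolve from pyproject.toml
pip install -U pip uv
uv pip install ".[accelerate, peft, outlines]" --no-cache-dir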

Dockerfile

@@ -224,7 +224,7 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
 COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir
 # Install server
 COPY proto proto
@@ -232,9 +232,9 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_cuda.txt && \
-    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
-    pip install nvidia-nccl-cu12==2.22.3
+    pip install -U pip uv && \
+    uv pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    uv pip install nvidia-nccl-cu12==2.22.3
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries

Dockerfile_amd

@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
 /opt/conda/bin/conda clean -ya
 # Install flash-attention, torch dependencies
-RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --upgrade pip uv && uv pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
 RUN conda install mkl=2021
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -320,8 +320,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

Dockerfile_intel

@@ -110,8 +110,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    uv pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -213,8 +213,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

server/Makefile

@@ -9,11 +9,14 @@ include Makefile-exllamav2
 include Makefile-flashinfer

 unit-tests:
+	pip install -U pip uv
+	uv pip install -e ".[dev]"
 	pytest -s -vv -m "not private" tests

 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	pip install -U pip uv
+	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -21,24 +24,14 @@ gen-server:
 	touch text_generation_server/pb/__init__.py

 install-server: gen-server
-	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"

 install: install-cuda
 	echo "Installed server"

 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	pip install -e ".[attention,bnb,marlin,moe]"
-	pip install nvidia-nccl-cu12==2.22.3
+	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv pip install nvidia-nccl-cu12==2.22.3

 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
-
-run-dev:
-	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
-
-export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
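
The deleted export-requirements target was the step that kept the requirements_*.txt files in sync with poetry.lock; with uv the lockfile (server/uv.lock below) is consumed directly, so no export step remains. Assuming a stock uv workflow (these commands are not part of this commit), the rough equivalents would be:

# Regenerate server/uv.lock after changing pyproject.toml
uv lock

# If some consumer still needs a requirements file, uv can export one,
# much like the removed `poetry export ... --without-hashes` lines
uv export --format requirements-txt --no-hashes -o requirements.txt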

server/poetry.lock (generated, 4101 lines deleted)

File diff suppressed because it is too large.

server/pyproject.toml

@@ -1,96 +1,94 @@
-[tool.poetry]
+[project]
 name = "text-generation-server"
 version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
-authors = ["Olivier Dehaene <olivier@huggingface.co>"]
-
-[tool.poetry.scripts]
-text-generation-server = 'text_generation_server.cli:app'
-
-[tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-protobuf = ">=4.25.3,<6"
-grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
-grpc-interceptor = "^0.15.4"
-typer = "^0.12.5"
-accelerate = {version = "^1.1.0", optional = true}
-bitsandbytes = { version = "^0.45.0", optional = true }
-safetensors = "^0.4.5"
-loguru = "^0.7.2"
-opentelemetry-api = "^1.27.0"
-opentelemetry-exporter-otlp = "^1.27.0"
-opentelemetry-instrumentation-grpc = "^0.48b0"
-hf-transfer = "^0.1.2"
-sentencepiece = "^0.2.0"
-tokenizers = "^0.20.3"
-huggingface-hub = "^0.23"
-transformers = "^4.46.2"
-einops = "^0.8.0"
-texttable = { version = "^1.6.7", optional = true }
-datasets = {version = "^2.21.0", optional = true}
-peft = {version = "^0.13.2", optional = true}
-torch = {version = "^2.4.1", optional = true}
-scipy = "^1.13.1"
-pillow = "^11.0.0"
-outlines= {version = "^0.1.3", optional = true}
-prometheus-client = ">=0.20.0,<0.22"
-py-cpuinfo = "^9.0.0"
-compressed-tensors = {version = "^0.7.1", optional = true}
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26.4"
+readme = "README.md"
+requires-python = ">=3.9"
+authors = [
+    {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
+    {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
+]
+dependencies = [
+    "einops>=0.8.0",
+    "grpc-interceptor>=0.15.4",
+    "grpcio>=1.69.0",
+    "grpcio-reflection>=1.69.0",
+    "grpcio-status>=1.69.0",
+    "hf-transfer>=0.1.9",
+    "loguru>=0.7.3",
+    "numpy>=2.0.2",
+    "opentelemetry-api>=1.29.0",
+    "opentelemetry-exporter-otlp>=1.29.0",
+    "opentelemetry-instrumentation-grpc>=0.50b0",
+    "pillow>=11.1.0",
+    "prometheus-client>=0.21.1",
+    "protobuf>=5.29.3",
+    "py-cpuinfo>=9.0.0",
+    "rich>=13.9.4",
+    "safetensors>=0.5.2",
+    "scipy>=1.13.1",
+    "sentencepiece>=0.2.0",
+    "tokenizers>=0.21.0",
+    "typer>=0.15.1",
+]
+
+[project.optional-dependencies]
+accelerate = [
+    "accelerate>=1.2.1,<2",
+]
+bnb = [
+    "bitsandbytes>=0.45.0",
+]
+compressed-tensors = [
+    "compressed-tensors>=0.9.0",
+]
+peft = [
+    "peft>=0.14.0",
+]
+outlines = [
+    "outlines>=0.1.13",
+]
+dev = [
+    "grpcio-tools>=1.51.1,<2.0",
+    "pytest>=7.3.0,<8"
+]
+quantize = [
+    "texttable>=1.6.7,<2",
+    "datasets>=2.21,<3",
+]
+moe = [ "moe-kernels" ]
+attention = [ "attention-kernels" ]
+marlin = [ "marlin-kernels" ]
+gen = [
+    "grpcio-tools>=1.69.0",
+    "mypy-protobuf>=3.6.0",
+]
+
+[tool.uv.sources]
 attention-kernels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 marlin-kernels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 moe-kernels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-rich = "^13.8.1"
-
-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-attention = ["attention-kernels"]
-bnb = ["bitsandbytes"]
-compressed-tensors = ["compressed-tensors"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
-
-[tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
-pytest = "^7.3.0"
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"

 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
-build-backend = "poetry.core.masonry.api"

 [tool.isort]
 profile = "black"
+
+[tool.setuptools]
+py-modules = ["text-generation-server"]
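
The [tool.uv.sources] table takes over what Poetry expressed with per-entry python/optional fields: each kernel package pins one prebuilt wheel URL per CPython version, and the PEP 508 marker tells uv which wheel matches the installing interpreter. A usage sketch, assuming a Python 3.11 environment:

# Resolves moe-kernels to the cp311 wheel URL pinned in [tool.uv.sources];
# the python_version markers rule out the cp39/cp310/cp312 entries
uv pip install ".[moe]"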

server/uv.lock (new file, 3426 lines added)

File diff suppressed because it is too large.