Moving to uv instead of poetry.

More standards-compliant, faster, and with a seemingly better lockfile.
Nicolas Patry 2025-01-16 13:43:16 +01:00
parent 5f78ec32a5
commit 9177fbfda6
7 changed files with 3521 additions and 4205 deletions
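
The pattern is the same across the Dockerfiles and the Makefile below: instead of exporting poetry.lock to per-backend requirements_*.txt files and installing those with pip, each build now bootstraps uv with pip and installs the project extras straight from pyproject.toml. A minimal before/after sketch (extras abbreviated; the real invocations are in the diffs that follow):

# Before: install from an exported requirements file, then the project
pip install -r requirements_cuda.txt
pip install ".[accelerate, peft, outlines]" --no-cache-dir

# After: bootstrap uv once, then let it resolve from pyproject.toml
pip install -U pip uv
uv pip install ".[accelerate, peft, outlines]" --no-cache-dir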

Dockerfile

@@ -224,7 +224,7 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
 COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir
 # Install server
 COPY proto proto
@@ -232,9 +232,9 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_cuda.txt && \
-    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
-    pip install nvidia-nccl-cu12==2.22.3
+    pip install -U pip uv && \
+    uv pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    uv pip install nvidia-nccl-cu12==2.22.3
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries

Dockerfile_amd

@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
 /opt/conda/bin/conda clean -ya
 # Install flash-attention, torch dependencies
-RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --upgrade pip uv && uv pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
 RUN conda install mkl=2021
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -320,8 +320,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

Dockerfile_intel

@@ -110,8 +110,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    uv pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -213,8 +213,8 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

server/Makefile

@@ -9,11 +9,14 @@ include Makefile-exllamav2
 include Makefile-flashinfer

 unit-tests:
+	pip install -U pip uv
+	uv pip install -e ".[dev]"
 	pytest -s -vv -m "not private" tests

 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	pip install -U pip uv
+	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -21,24 +24,14 @@ gen-server:
 	touch text_generation_server/pb/__init__.py

 install-server: gen-server
-	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"

 install: install-cuda
 	echo "Installed server"

 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	pip install -e ".[attention,bnb,marlin,moe]"
-	pip install nvidia-nccl-cu12==2.22.3
+	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv pip install nvidia-nccl-cu12==2.22.3

 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
-
-run-dev:
-	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
-
-export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
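
The deleted export-requirements target was the step that kept the requirements_*.txt files in sync with poetry.lock; with uv the lockfile (server/uv.lock below) is consumed directly, so no export step remains. Assuming a stock uv workflow (these commands are not part of this commit), the rough equivalents would be:

# Regenerate server/uv.lock after changing pyproject.toml
uv lock

# If some consumer still needs a requirements file, uv can export one,
# much like the removed `poetry export ... --without-hashes` lines
uv export --format requirements-txt --no-hashes -o requirements.txt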

server/poetry.lock (generated, 4101 lines deleted)

File diff suppressed because it is too large.

server/pyproject.toml

@@ -1,96 +1,94 @@
-[tool.poetry]
+[project]
 name = "text-generation-server"
 version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
-authors = ["Olivier Dehaene <olivier@huggingface.co>"]
-
-[tool.poetry.scripts]
-text-generation-server = 'text_generation_server.cli:app'
-
-[tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-protobuf = ">=4.25.3,<6"
-grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
-grpc-interceptor = "^0.15.4"
-typer = "^0.12.5"
-accelerate = {version = "^1.1.0", optional = true}
-bitsandbytes = { version = "^0.45.0", optional = true }
-safetensors = "^0.4.5"
-loguru = "^0.7.2"
-opentelemetry-api = "^1.27.0"
-opentelemetry-exporter-otlp = "^1.27.0"
-opentelemetry-instrumentation-grpc = "^0.48b0"
-hf-transfer = "^0.1.2"
-sentencepiece = "^0.2.0"
-tokenizers = "^0.20.3"
-huggingface-hub = "^0.23"
-transformers = "^4.46.2"
-einops = "^0.8.0"
-texttable = { version = "^1.6.7", optional = true }
-datasets = {version = "^2.21.0", optional = true}
-peft = {version = "^0.13.2", optional = true}
-torch = {version = "^2.4.1", optional = true}
-scipy = "^1.13.1"
-pillow = "^11.0.0"
-outlines= {version = "^0.1.3", optional = true}
-prometheus-client = ">=0.20.0,<0.22"
-py-cpuinfo = "^9.0.0"
-compressed-tensors = {version = "^0.7.1", optional = true}
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26.4"
+readme = "README.md"
+requires-python = ">=3.9"
+authors = [
+    {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
+    {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
+]
+dependencies = [
+    "einops>=0.8.0",
+    "grpc-interceptor>=0.15.4",
+    "grpcio>=1.69.0",
+    "grpcio-reflection>=1.69.0",
+    "grpcio-status>=1.69.0",
+    "hf-transfer>=0.1.9",
+    "loguru>=0.7.3",
+    "numpy>=2.0.2",
+    "opentelemetry-api>=1.29.0",
+    "opentelemetry-exporter-otlp>=1.29.0",
+    "opentelemetry-instrumentation-grpc>=0.50b0",
+    "pillow>=11.1.0",
+    "prometheus-client>=0.21.1",
+    "protobuf>=5.29.3",
+    "py-cpuinfo>=9.0.0",
+    "rich>=13.9.4",
+    "safetensors>=0.5.2",
+    "scipy>=1.13.1",
+    "sentencepiece>=0.2.0",
+    "tokenizers>=0.21.0",
+    "typer>=0.15.1",
+]
+
+[project.optional-dependencies]
+accelerate = [
+    "accelerate>=1.2.1,<2",
+]
+bnb = [
+    "bitsandbytes>=0.45.0",
+]
+compressed-tensors = [
+    "compressed-tensors>=0.9.0",
+]
+peft = [
+    "peft>=0.14.0",
+]
+outlines = [
+    "outlines>=0.1.13",
+]
+dev = [
+    "grpcio-tools>=1.51.1,<2.0",
+    "pytest>=7.3.0,<8"
+]
+quantize = [
+    "texttable>=1.6.7,<2",
+    "datasets>=2.21,<3",
+]
+moe = [ "moe-kernels" ]
+attention = [ "attention-kernels" ]
+marlin = [ "marlin-kernels" ]
+gen = [
+    "grpcio-tools>=1.69.0",
+    "mypy-protobuf>=3.6.0",
+]
+
+[tool.uv.sources]
 attention-kernels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 marlin-kernels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 moe-kernels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-rich = "^13.8.1"
-
-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-attention = ["attention-kernels"]
-bnb = ["bitsandbytes"]
-compressed-tensors = ["compressed-tensors"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
-
-[tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
-pytest = "^7.3.0"
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"

 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
-build-backend = "poetry.core.masonry.api"

 [tool.isort]
 profile = "black"
+
+[tool.setuptools]
+py-modules = ["text-generation-server"]
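
The [tool.uv.sources] table takes over what Poetry expressed with per-entry python/optional fields: each kernel package pins one prebuilt wheel URL per CPython version, and the PEP 508 marker tells uv which wheel matches the installing interpreter. A usage sketch, assuming a Python 3.11 environment:

# Resolves moe-kernels to the cp311 wheel URL pinned in [tool.uv.sources];
# the python_version markers rule out the cp39/cp310/cp312 entries
uv pip install ".[moe]"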

server/uv.lock (new file, 3426 lines added)

File diff suppressed because it is too large.