Moving to `uv` instead of `poetry`. (#2919)

* Moving to `uv` instead of `poetry`. More standard, faster, seemingly better lockfile.
* Creating venv if not created.
* Create the venv.
* Fix?
* Fixing the test by activating the environment?
* Install system?
* Add the cli entry point.
* docker install on system
* Monkeying this...
* `--system` is redundant.
* Trying to force-include this pb folder.
* Trying to check that pb is imported correctly.
* Editable install necessary?
* Non editable?
* Editable it is.
parent d61f14f271
commit de19e7e844
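The trail of fixup commits above documents one real constraint of the migration: `make gen-server` writes the generated protobuf stubs into `text_generation_server/pb/` inside the source tree, and a regular wheel build did not pick that folder up ("Trying to force-include this pb folder"), so the import check only passed once the package was installed in editable mode. A minimal sketch of that check, assuming it is run from the `server/` directory:

    make gen-server                  # generates text_generation_server/pb/*.py in the source tree
    uv pip install -e "."            # editable: the installed package points back at the source tree
    python -c "from text_generation_server.pb import generate_pb2"   # the import check used in the Dockerfile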
.github/workflows/tests.yaml (6 changes)
@@ -44,10 +44,14 @@ jobs:
       run: |
         sudo apt update
         sudo apt install python3.11-dev -y
+        pip install -U pip uv
+        uv venv
+        source ./.venv/bin/activate
         make install-cpu
     - name: Run server tests
       run: |
-        pip install pytest
+        source ./.venv/bin/activate
+        uv pip install pytest
         export HF_TOKEN=${{ secrets.HF_TOKEN }}
         pytest -s -vv server/tests
     - name: Pre-commit checks
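The workflow now provisions its own virtual environment with uv rather than installing into the runner's Python. Since every `run:` step starts a fresh shell, the environment must be re-activated in each step that needs it, which is why `source ./.venv/bin/activate` appears twice. Roughly the same flow, reproduced locally (a sketch, not the exact CI environment):

    pip install -U pip uv            # bootstrap uv via the system pip
    uv venv                          # create ./.venv, uv's default location
    source ./.venv/bin/activate      # later python/pytest calls resolve inside .venv
    make install-cpu
    uv pip install pytest
    pytest -s -vv server/tests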
Dockerfile (10 changes)
@@ -224,17 +224,19 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
 COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir
 
 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_cuda.txt && \
-    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
-    pip install nvidia-nccl-cu12==2.22.3
+    python -c "from text_generation_server.pb import generate_pb2" && \
+    pip install -U pip uv && \
+    uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    uv pip install nvidia-nccl-cu12==2.22.3
 
 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
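The Docker images install into the system (conda) interpreter rather than a venv, so `ENV UV_SYSTEM_PYTHON=1` is set to make `uv pip` target it directly; this is also why the commit notes that passing `--system` explicitly became redundant. The two forms are equivalent:

    # with the Dockerfile's environment variable set
    export UV_SYSTEM_PYTHON=1
    uv pip install nvidia-nccl-cu12==2.22.3

    # without it, the same behavior requires the explicit flag
    uv pip install --system nvidia-nccl-cu12==2.22.3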
Dockerfile_amd

@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
     /opt/conda/bin/conda clean -ya
 
 # Install flash-attention, torch dependencies
-RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
 
 RUN conda install mkl=2021
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -318,10 +318,11 @@ COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
Dockerfile_intel

@@ -108,10 +108,11 @@ RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
 ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -211,10 +212,11 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
flake.lock

@@ -853,11 +853,11 @@
       ]
     },
     "locked": {
-      "lastModified": 1736907983,
-      "narHash": "sha256-fw55wVwpJW36Md2HZBKuxX3YHGeqsGsspPLtCMVr1Y8=",
+      "lastModified": 1736994333,
+      "narHash": "sha256-v4Jrok5yXsZ6dwj2+2uo5cSyUi9fBTurHqHvNHLT1XA=",
       "owner": "oxalica",
       "repo": "rust-overlay",
-      "rev": "eaa365c911441e07e387ff6acc596619fc50b156",
+      "rev": "848db855cb9e88785996e961951659570fc58814",
       "type": "github"
     },
     "original": {
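The flake.lock change is an ordinary pin bump of the `rust-overlay` input (new `rev`, `narHash`, and `lastModified`), unrelated to the uv migration itself. A bump like this is typically produced with something along the lines of:

    nix flake lock --update-input rust-overlay   # refresh one input's pin in flake.lock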
server/Makefile

@@ -9,11 +9,14 @@ include Makefile-exllamav2
 include Makefile-flashinfer
 
 unit-tests:
+	pip install -U pip uv
+	uv pip install -e ".[dev]"
 	pytest -s -vv -m "not private" tests
 
 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	pip install -U pip uv
+	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -21,24 +24,14 @@ gen-server:
 	touch text_generation_server/pb/__init__.py
 
 install-server: gen-server
-	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
 
 
 install: install-cuda
 	echo "Installed server"
 
 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	pip install -e ".[attention,bnb,marlin,moe]"
-	pip install nvidia-nccl-cu12==2.22.3
+	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv pip install nvidia-nccl-cu12==2.22.3
 
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
 
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
 
-export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
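With the `export-requirements` target and the generated requirements_*.txt files gone, the extras in pyproject.toml are the single source of truth, and every target bootstraps uv before installing. A typical local setup after this change, assuming a CUDA machine and a repository checkout (a sketch):

    cd server
    make install-cuda                # gen-server, then editable install of the CUDA extras
    text-generation-server --help    # CLI entry point declared in [project.scripts]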
server/poetry.lock (generated; removed in this commit, 4101 lines)

File diff suppressed because it is too large.
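The poetry lockfile is deleted outright; its replacement, server/uv.lock, is added below. For reference, uv maintains its lockfile with standard commands that are not part of this diff:

    uv lock    # resolve the pyproject.toml dependencies into uv.lock
    uv sync    # install the environment exactly as pinned in uv.lock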
server/pyproject.toml

@@ -1,96 +1,97 @@
-[tool.poetry]
+[project]
 name = "text-generation-server"
 version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
-authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 readme = "README.md"
+requires-python = ">=3.9"
+authors = [
+    {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
+    {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
+]
+dependencies = [
+    "einops>=0.8.0",
+    "grpc-interceptor>=0.15.4",
+    "grpcio>=1.67.0",
+    "grpcio-reflection>=1.67.0",
+    "grpcio-status>=1.67.0",
+    "hf-transfer>=0.1.8",
+    "loguru>=0.7.3",
+    "numpy>=1.26,<3",
+    "opentelemetry-api>=1.27.0",
+    "opentelemetry-exporter-otlp>=1.27.0",
+    "opentelemetry-instrumentation-grpc>=0.50b0",
+    "pillow>=11.1.0",
+    "prometheus-client>=0.21.0",
+    "protobuf>=5.28.3",
+    "py-cpuinfo>=9.0.0",
+    "rich>=13.8.1",
+    "safetensors>=0.4.5",
+    "scipy>=1.13.1",
+    "sentencepiece>=0.2.0",
+    "tokenizers>=0.20.3",
+    "typer>=0.15.1",
+]
 
-[tool.poetry.scripts]
-text-generation-server = 'text_generation_server.cli:app'
+[project.scripts]
+text-generation-server = "text_generation_server.cli:app"
 
-[tool.poetry.dependencies]
-python = ">=3.9,<3.13"
-protobuf = ">=4.25.3,<6"
-grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
-grpc-interceptor = "^0.15.4"
-typer = "^0.12.5"
-accelerate = {version = "^1.1.0", optional = true}
-bitsandbytes = { version = "^0.45.0", optional = true }
-safetensors = "^0.4.5"
-loguru = "^0.7.2"
-opentelemetry-api = "^1.27.0"
-opentelemetry-exporter-otlp = "^1.27.0"
-opentelemetry-instrumentation-grpc = "^0.48b0"
-hf-transfer = "^0.1.2"
-sentencepiece = "^0.2.0"
-tokenizers = "^0.20.3"
-huggingface-hub = "^0.23"
-transformers = "^4.46.2"
-einops = "^0.8.0"
-texttable = { version = "^1.6.7", optional = true }
-datasets = {version = "^2.21.0", optional = true}
-peft = {version = "^0.13.2", optional = true}
-torch = {version = "^2.4.1", optional = true}
-scipy = "^1.13.1"
-pillow = "^11.0.0"
-outlines= {version = "^0.1.3", optional = true}
-prometheus-client = ">=0.20.0,<0.22"
-py-cpuinfo = "^9.0.0"
-compressed-tensors = {version = "^0.7.1", optional = true}
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26.4"
+[project.optional-dependencies]
+accelerate = [
+    "accelerate>=1.2.1,<2",
+]
+bnb = [
+    "bitsandbytes>=0.45.0",
+]
+compressed-tensors = [
+    "compressed-tensors>=0.9.0",
+]
+peft = [
+    "peft>=0.14.0",
+]
+outlines = [
+    "outlines>=0.1.13",
+]
+dev = [
+    "grpcio-tools>=1.51.1,<2.0",
+    "pytest>=7.3.0,<8"
+]
+quantize = [
+    "texttable>=1.6.7,<2",
+    "datasets>=2.21,<3",
+]
+moe = [ "moe-kernels" ]
+attention = [ "attention-kernels" ]
+marlin = [ "marlin-kernels" ]
+gen = [
+    "grpcio-tools>=1.69.0",
+    "mypy-protobuf>=3.6.0",
+]
 
+[tool.uv.sources]
 attention-kernels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 marlin-kernels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
 moe-kernels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
 ]
-rich = "^13.8.1"
 
-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-attention = ["attention-kernels"]
-bnb = ["bitsandbytes"]
-compressed-tensors = ["compressed-tensors"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
-
-[tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
-pytest = "^7.3.0"
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"
 
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
 
-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
-build-backend = "poetry.core.masonry.api"
-
 [tool.isort]
 profile = "black"
+
+[tool.setuptools.packages.find]
+include = ["text_generation_server*"]
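The poetry-specific tables give way to standard PEP 621 metadata ([project], [project.scripts], [project.optional-dependencies]), and the prebuilt kernel wheels move from poetry's `python = "~3.x"` constraints to [tool.uv.sources] entries gated by PEP 508 `python_version` markers. When an extra is requested, uv resolves the wheel URL whose marker matches the running interpreter; for example (a sketch, assuming Python 3.11):

    uv pip install -e ".[marlin]"
    # resolves marlin-kernels to the cp311-cp311-linux_x86_64 wheel URL
    # pinned under [tool.uv.sources] for python_version == '3.11'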
server/uv.lock (new file, 3426 lines)

File diff suppressed because it is too large.