This commit is contained in:
Nicolas Patry 2025-02-06 00:57:22 +01:00
parent 3514d2dc8c
commit c8b0eddf79
No known key found for this signature in database
GPG Key ID: D2920555C90F704C
4 changed files with 218 additions and 36 deletions

View File

@ -195,52 +195,54 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
git \
&& rm -rf /var/lib/apt/lists/*
# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda
# Each kernel builder stage below contributes its build output to conda's
# Python 3.11 site-packages directory (/opt/conda/lib/python3.11/site-packages).
# Copy build artifacts from flash attention builder
# (the main build output plus the separate layer_norm and rotary extension builds)
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from flash attention v2 builder
# (only the compiled flash_attn_2_cuda shared object is taken from that stage)
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
# (both the mamba and the causal-conv1d build outputs come from this stage)
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
# Copy the flashinfer package directory from the flashinfer builder
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
# Install flash-attention dependencies (currently disabled)
# RUN pip install einops --no-cache-dir
# Copy conda with PyTorch installed
# NOTE(review): this COPY is identical to the first COPY of this section and
# would overwrite /opt/conda after the kernel artifacts were copied into it.
# It may be an artifact of the diff rendering — confirm only one is intended.
COPY --from=pytorch-install /opt/conda /opt/conda
# Install server
# Bring the proto definitions and the server sources into the image; the
# server Makefile's gen-server target compiles ../proto/v3/generate.proto.
COPY proto proto
COPY server server
# NOTE(review): server/Makefile lives inside the server directory copied on the
# previous line — confirm this extra COPY is still needed.
COPY server/Makefile server/Makefile
# uv configuration variable; presumably makes `uv pip` target the system
# interpreter instead of requiring a virtualenv — confirm against the uv docs.
ENV UV_SYSTEM_PYTHON=1
# Build/installation chain, executed in order inside ./server:
#   1. make gen-server
#   2. upgrade pip and uv
#   3. install requirements_cuda.txt and the server itself in editable mode
#   4. uv sync --frozen with the listed extras
#   5. activate ./.venv and run make gen-server again
# NOTE(review): `make gen-server` appears both first and last, and the
# dependencies are installed via both `uv pip install` and `uv sync`. This looks
# like old and new variants of the command merged together — confirm which
# steps are meant to remain.
RUN cd server && \
make gen-server && \
pip install -U pip uv && \
uv pip install -r requirements_cuda.txt && \
uv pip install -e . && \
uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
. ./.venv/bin/activate && \
make gen-server
# Repeats the same `uv sync`, activates the venv for this RUN's shell only,
# prints the working directory and invokes the server CLI with --help.
# Presumably a build-time smoke test that the CLI is installed in the venv
# (and `pwd` a debugging aid) — confirm.
RUN cd server && \
uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
. ./.venv/bin/activate && \
pwd && \
text-generation-server --help
# Disabled alternatives, kept for reference:
# uv sync --frozen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
# mv ./.venv/lib/python3.11/site-packages/* /opt/conda/lib/python3.11/site-packages/
# uv pip install nvidia-nccl-cu12==2.22.3
# The kernel build outputs below are copied into the site-packages directory
# of /usr/src/server/.venv. Presumably this is the virtualenv created by
# `uv sync` in the server directory, which assumes WORKDIR is /usr/src — confirm.
# Copy build artifacts from flash attention builder
# (the main build output plus the separate layer_norm and rotary extension builds)
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from flash attention v2 builder
# (only the compiled flash_attn_2_cuda shared object is taken from that stage)
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
# (both the mamba and the causal-conv1d build outputs come from this stage)
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
# Copy the flashinfer package directory from the flashinfer builder
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
# Disabled: preload libnccl from the conda site-packages.
# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
# Required to find libpython within the rust binaries

View File

@ -15,8 +15,6 @@ unit-tests:
gen-server:
# Compile protos
pip install -U pip uv
uv pip install ".[gen]"
mkdir text_generation_server/pb || true
python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@ -37,6 +35,7 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten
install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
# Regenerate the pinned requirements files from pyproject.toml using
# `uv pip compile`, one output file per target, all resolved for Python 3.11:
#   requirements_gen.txt   - the `gen` extra only
#   requirements_cuda.txt  - attention, bnb, accelerate, compressed-tensors,
#                            marlin, moe, quantize, peft, outlines
#   requirements_intel.txt - accelerate, compressed-tensors, quantize, peft, outlines
#   requirements_rocm.txt  - same extras as requirements_intel.txt
export-requirements:
uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11

180
server/requirements_gen.txt Normal file
View File

@ -0,0 +1,180 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
certifi==2025.1.31
# via requests
charset-normalizer==3.4.1
# via requests
click==8.1.8
# via typer
deprecated==1.2.18
# via
# opentelemetry-api
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# opentelemetry-semantic-conventions
einops==0.8.0
# via text-generation-server (pyproject.toml)
filelock==3.17.0
# via
# huggingface-hub
# transformers
fsspec==2025.2.0
# via huggingface-hub
googleapis-common-protos==1.66.0
# via
# grpcio-status
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
# via text-generation-server (pyproject.toml)
grpcio==1.70.0
# via
# text-generation-server (pyproject.toml)
# grpc-interceptor
# grpcio-reflection
# grpcio-status
# grpcio-tools
# opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.70.0
# via text-generation-server (pyproject.toml)
grpcio-status==1.70.0
# via text-generation-server (pyproject.toml)
grpcio-tools==1.70.0
# via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
# via text-generation-server (pyproject.toml)
huggingface-hub==0.28.1
# via
# tokenizers
# transformers
idna==3.10
# via requests
importlib-metadata==8.5.0
# via opentelemetry-api
loguru==0.7.3
# via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
mypy-protobuf==3.6.0
# via text-generation-server (pyproject.toml)
numpy==2.2.2
# via
# text-generation-server (pyproject.toml)
# scipy
# transformers
opentelemetry-api==1.30.0
# via
# text-generation-server (pyproject.toml)
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# opentelemetry-instrumentation
# opentelemetry-instrumentation-grpc
# opentelemetry-sdk
# opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.30.0
# via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.30.0
# via
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.30.0
# via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.30.0
# via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.51b0
# via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.51b0
# via text-generation-server (pyproject.toml)
opentelemetry-proto==1.30.0
# via
# opentelemetry-exporter-otlp-proto-common
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.30.0
# via
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.51b0
# via
# opentelemetry-instrumentation
# opentelemetry-instrumentation-grpc
# opentelemetry-sdk
packaging==24.2
# via
# huggingface-hub
# opentelemetry-instrumentation
# transformers
pillow==11.1.0
# via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
# via text-generation-server (pyproject.toml)
protobuf==5.29.3
# via
# text-generation-server (pyproject.toml)
# googleapis-common-protos
# grpcio-reflection
# grpcio-status
# grpcio-tools
# mypy-protobuf
# opentelemetry-proto
py-cpuinfo==9.0.0
# via text-generation-server (pyproject.toml)
pygments==2.19.1
# via rich
pyyaml==6.0.2
# via
# huggingface-hub
# transformers
regex==2024.11.6
# via transformers
requests==2.32.3
# via
# huggingface-hub
# opentelemetry-exporter-otlp-proto-http
# transformers
rich==13.9.4
# via
# text-generation-server (pyproject.toml)
# typer
safetensors==0.5.2
# via
# text-generation-server (pyproject.toml)
# transformers
scipy==1.15.1
# via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
# via text-generation-server (pyproject.toml)
setuptools==75.8.0
# via grpcio-tools
shellingham==1.5.4
# via typer
tokenizers==0.21.0
# via
# text-generation-server (pyproject.toml)
# transformers
tqdm==4.67.1
# via
# huggingface-hub
# transformers
transformers==4.48.2
# via text-generation-server (pyproject.toml)
typer==0.15.1
# via text-generation-server (pyproject.toml)
types-protobuf==5.29.1.20241207
# via mypy-protobuf
typing-extensions==4.12.2
# via
# huggingface-hub
# opentelemetry-sdk
# typer
urllib3==2.3.0
# via requests
wrapt==1.17.2
# via
# deprecated
# opentelemetry-instrumentation
# opentelemetry-instrumentation-grpc
zipp==3.21.0
# via importlib-metadata

View File

@ -2,4 +2,5 @@
# Refresh the dynamic linker cache. A failure is tolerated: it is only
# reported on stdout and the script continues.
ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
# Activate the server virtualenv. The path is relative, so this relies on the
# container's working directory containing ./server.
source ./server/.venv/bin/activate
# Replace this shell with the launcher so it receives signals directly.
# "$@" is quoted so each argument is forwarded exactly as given; unquoted,
# arguments containing spaces or glob characters would be re-split/expanded.
exec text-generation-launcher "$@"