mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
Attempt #42
This commit is contained in:
parent
3514d2dc8c
commit
c8b0eddf79
70
Dockerfile
70
Dockerfile
@ -195,52 +195,54 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
|
||||
git \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy conda with PyTorch installed
|
||||
COPY --from=pytorch-install /opt/conda /opt/conda
|
||||
|
||||
# Copy build artifacts from flash attention builder
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
|
||||
# Copy build artifacts from flash attention v2 builder
|
||||
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
|
||||
|
||||
# Copy build artifacts from custom kernels builder
|
||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from exllama kernels builder
|
||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from exllamav2 kernels builder
|
||||
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from awq kernels builder
|
||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from eetq kernels builder
|
||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from lorax punica kernels builder
|
||||
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||
# Copy build artifacts from mamba builder
|
||||
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
|
||||
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
|
||||
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
|
||||
|
||||
# Install flash-attention dependencies
|
||||
# RUN pip install einops --no-cache-dir
|
||||
|
||||
# Copy conda with PyTorch installed
|
||||
COPY --from=pytorch-install /opt/conda /opt/conda
|
||||
|
||||
# Install server
|
||||
COPY proto proto
|
||||
COPY server server
|
||||
COPY server/Makefile server/Makefile
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
RUN cd server && \
|
||||
make gen-server && \
|
||||
pip install -U pip uv && \
|
||||
uv pip install -r requirements_cuda.txt && \
|
||||
uv pip install -e . && \
|
||||
uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
|
||||
. ./.venv/bin/activate && \
|
||||
make gen-server
|
||||
|
||||
RUN cd server && \
|
||||
uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
|
||||
. ./.venv/bin/activate && \
|
||||
pwd && \
|
||||
text-generation-server --help
|
||||
|
||||
# uv sync --frozen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
|
||||
# mv ./.venv/lib/python3.11/site-packages/* /opt/conda/lib/python3.11/site-packages/
|
||||
# uv pip install nvidia-nccl-cu12==2.22.3
|
||||
# Copy build artifacts from flash attention builder
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
|
||||
# Copy build artifacts from flash attention v2 builder
|
||||
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
|
||||
# Copy build artifacts from custom kernels builder
|
||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from exllama kernels builder
|
||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from exllamav2 kernels builder
|
||||
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from awq kernels builder
|
||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from eetq kernels builder
|
||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from lorax punica kernels builder
|
||||
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
# Copy build artifacts from mamba builder
|
||||
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
|
||||
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
|
||||
|
||||
|
||||
# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
|
||||
# Required to find libpython within the rust binaries
|
||||
|
@ -15,8 +15,6 @@ unit-tests:
|
||||
|
||||
gen-server:
|
||||
# Compile protos
|
||||
pip install -U pip uv
|
||||
uv pip install ".[gen]"
|
||||
mkdir text_generation_server/pb || true
|
||||
python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
|
||||
--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
|
||||
@ -37,6 +35,7 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten
|
||||
install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
|
||||
|
||||
export-requirements:
|
||||
uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
|
||||
uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
|
||||
uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
|
||||
uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11
|
||||
|
180
server/requirements_gen.txt
Normal file
180
server/requirements_gen.txt
Normal file
@ -0,0 +1,180 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
|
||||
certifi==2025.1.31
|
||||
# via requests
|
||||
charset-normalizer==3.4.1
|
||||
# via requests
|
||||
click==8.1.8
|
||||
# via typer
|
||||
deprecated==1.2.18
|
||||
# via
|
||||
# opentelemetry-api
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
# opentelemetry-semantic-conventions
|
||||
einops==0.8.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
filelock==3.17.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
fsspec==2025.2.0
|
||||
# via huggingface-hub
|
||||
googleapis-common-protos==1.66.0
|
||||
# via
|
||||
# grpcio-status
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
grpc-interceptor==0.15.4
|
||||
# via text-generation-server (pyproject.toml)
|
||||
grpcio==1.70.0
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# grpc-interceptor
|
||||
# grpcio-reflection
|
||||
# grpcio-status
|
||||
# grpcio-tools
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
grpcio-reflection==1.70.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
grpcio-status==1.70.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
grpcio-tools==1.70.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
hf-transfer==0.1.9
|
||||
# via text-generation-server (pyproject.toml)
|
||||
huggingface-hub==0.28.1
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
idna==3.10
|
||||
# via requests
|
||||
importlib-metadata==8.5.0
|
||||
# via opentelemetry-api
|
||||
loguru==0.7.3
|
||||
# via text-generation-server (pyproject.toml)
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mypy-protobuf==3.6.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
numpy==2.2.2
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# scipy
|
||||
# transformers
|
||||
opentelemetry-api==1.30.0
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
# opentelemetry-instrumentation
|
||||
# opentelemetry-instrumentation-grpc
|
||||
# opentelemetry-sdk
|
||||
# opentelemetry-semantic-conventions
|
||||
opentelemetry-exporter-otlp==1.30.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
opentelemetry-exporter-otlp-proto-common==1.30.0
|
||||
# via
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.30.0
|
||||
# via opentelemetry-exporter-otlp
|
||||
opentelemetry-exporter-otlp-proto-http==1.30.0
|
||||
# via opentelemetry-exporter-otlp
|
||||
opentelemetry-instrumentation==0.51b0
|
||||
# via opentelemetry-instrumentation-grpc
|
||||
opentelemetry-instrumentation-grpc==0.51b0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
opentelemetry-proto==1.30.0
|
||||
# via
|
||||
# opentelemetry-exporter-otlp-proto-common
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
opentelemetry-sdk==1.30.0
|
||||
# via
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
opentelemetry-semantic-conventions==0.51b0
|
||||
# via
|
||||
# opentelemetry-instrumentation
|
||||
# opentelemetry-instrumentation-grpc
|
||||
# opentelemetry-sdk
|
||||
packaging==24.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# opentelemetry-instrumentation
|
||||
# transformers
|
||||
pillow==11.1.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
prometheus-client==0.21.1
|
||||
# via text-generation-server (pyproject.toml)
|
||||
protobuf==5.29.3
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# googleapis-common-protos
|
||||
# grpcio-reflection
|
||||
# grpcio-status
|
||||
# grpcio-tools
|
||||
# mypy-protobuf
|
||||
# opentelemetry-proto
|
||||
py-cpuinfo==9.0.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
pygments==2.19.1
|
||||
# via rich
|
||||
pyyaml==6.0.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
regex==2024.11.6
|
||||
# via transformers
|
||||
requests==2.32.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# opentelemetry-exporter-otlp-proto-http
|
||||
# transformers
|
||||
rich==13.9.4
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# typer
|
||||
safetensors==0.5.2
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# transformers
|
||||
scipy==1.15.1
|
||||
# via text-generation-server (pyproject.toml)
|
||||
sentencepiece==0.2.0
|
||||
# via text-generation-server (pyproject.toml)
|
||||
setuptools==75.8.0
|
||||
# via grpcio-tools
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
tokenizers==0.21.0
|
||||
# via
|
||||
# text-generation-server (pyproject.toml)
|
||||
# transformers
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
transformers==4.48.2
|
||||
# via text-generation-server (pyproject.toml)
|
||||
typer==0.15.1
|
||||
# via text-generation-server (pyproject.toml)
|
||||
types-protobuf==5.29.1.20241207
|
||||
# via mypy-protobuf
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# opentelemetry-sdk
|
||||
# typer
|
||||
urllib3==2.3.0
|
||||
# via requests
|
||||
wrapt==1.17.2
|
||||
# via
|
||||
# deprecated
|
||||
# opentelemetry-instrumentation
|
||||
# opentelemetry-instrumentation-grpc
|
||||
zipp==3.21.0
|
||||
# via importlib-metadata
|
@ -2,4 +2,5 @@
|
||||
|
||||
ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
|
||||
|
||||
source ./server/.venv/bin/activate
|
||||
exec text-generation-launcher $@
|
||||
|
Loading…
Reference in New Issue
Block a user