diff --git a/Dockerfile b/Dockerfile
index 6ccc2e3a..b6618f77 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -195,52 +195,53 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         git \
         && rm -rf /var/lib/apt/lists/*
 
-# Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
-
-# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
-
 # Install flash-attention dependencies
 # RUN pip install einops --no-cache-dir
 
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
-    make gen-server && \
     pip install -U pip uv && \
-    uv pip install -r requirements_cuda.txt && \
-    uv pip install -e . && \
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
+    . ./.venv/bin/activate && \
+    make gen-server
+
+# Smoke test in a fresh shell (the venv must be re-activated): fail the build if the server CLI cannot start.
+RUN cd server && \
+    . ./.venv/bin/activate && \
     text-generation-server --help
-    # uv sync --frozen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
-    # mv ./.venv/lib/python3.11/site-packages/* /opt/conda/lib/python3.11/site-packages/
-    # uv pip install nvidia-nccl-cu12==2.22.3
 
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from lorax punica kernels builder
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
+
 # ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 
 # Required to find libpython within the rust binaries
diff --git a/server/Makefile b/server/Makefile
index f80daed4..a97ef390 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -15,8 +15,6 @@ unit-tests:
 
 gen-server:
 	# Compile protos
-	pip install -U pip uv
-	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -37,6 +35,7 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
 
 export-requirements:
+	uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11
diff --git a/server/requirements_gen.txt b/server/requirements_gen.txt
new file mode 100644
index 00000000..d9836ad7
--- /dev/null
+++ b/server/requirements_gen.txt
@@ -0,0 +1,180 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
+certifi==2025.1.31
+    # via requests
+charset-normalizer==3.4.1
+    # via requests
+click==8.1.8
+    # via typer
+deprecated==1.2.18
+    # via
+    #   opentelemetry-api
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-semantic-conventions
+einops==0.8.0
+    # via text-generation-server (pyproject.toml)
+filelock==3.17.0
+    # via
+    #   huggingface-hub
+    #   transformers
+fsspec==2025.2.0
+    # via huggingface-hub
+googleapis-common-protos==1.66.0
+    # via
+    #   grpcio-status
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+grpc-interceptor==0.15.4
+    # via text-generation-server (pyproject.toml)
+grpcio==1.70.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   grpc-interceptor
+    #   grpcio-reflection
+    #   grpcio-status
+    #   grpcio-tools
+    #   opentelemetry-exporter-otlp-proto-grpc
+grpcio-reflection==1.70.0
+    # via text-generation-server (pyproject.toml)
+grpcio-status==1.70.0
+    # via text-generation-server (pyproject.toml)
+grpcio-tools==1.70.0
+    # via text-generation-server (pyproject.toml)
+hf-transfer==0.1.9
+    # via text-generation-server (pyproject.toml)
+huggingface-hub==0.28.1
+    # via
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via requests
+importlib-metadata==8.5.0
+    # via opentelemetry-api
+loguru==0.7.3
+    # via text-generation-server (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+mypy-protobuf==3.6.0
+    # via text-generation-server (pyproject.toml)
+numpy==2.2.2
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   scipy
+    #   transformers
+opentelemetry-api==1.30.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+opentelemetry-exporter-otlp==1.30.0
+    # via text-generation-server (pyproject.toml)
+opentelemetry-exporter-otlp-proto-common==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-exporter-otlp-proto-grpc==1.30.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-exporter-otlp-proto-http==1.30.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-instrumentation==0.51b0
+    # via opentelemetry-instrumentation-grpc
+opentelemetry-instrumentation-grpc==0.51b0
+    # via text-generation-server (pyproject.toml)
+opentelemetry-proto==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-common
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-sdk==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-semantic-conventions==0.51b0
+    # via
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+    #   opentelemetry-sdk
+packaging==24.2
+    # via
+    #   huggingface-hub
+    #   opentelemetry-instrumentation
+    #   transformers
+pillow==11.1.0
+    # via text-generation-server (pyproject.toml)
+prometheus-client==0.21.1
+    # via text-generation-server (pyproject.toml)
+protobuf==5.29.3
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   googleapis-common-protos
+    #   grpcio-reflection
+    #   grpcio-status
+    #   grpcio-tools
+    #   mypy-protobuf
+    #   opentelemetry-proto
+py-cpuinfo==9.0.0
+    # via text-generation-server (pyproject.toml)
+pygments==2.19.1
+    # via rich
+pyyaml==6.0.2
+    # via
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   huggingface-hub
+    #   opentelemetry-exporter-otlp-proto-http
+    #   transformers
+rich==13.9.4
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   typer
+safetensors==0.5.2
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   transformers
+scipy==1.15.1
+    # via text-generation-server (pyproject.toml)
+sentencepiece==0.2.0
+    # via text-generation-server (pyproject.toml)
+setuptools==75.8.0
+    # via grpcio-tools
+shellingham==1.5.4
+    # via typer
+tokenizers==0.21.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   transformers
+tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   transformers
+transformers==4.48.2
+    # via text-generation-server (pyproject.toml)
+typer==0.15.1
+    # via text-generation-server (pyproject.toml)
+types-protobuf==5.29.1.20241207
+    # via mypy-protobuf
+typing-extensions==4.12.2
+    # via
+    #   huggingface-hub
+    #   opentelemetry-sdk
+    #   typer
+urllib3==2.3.0
+    # via requests
+wrapt==1.17.2
+    # via
+    #   deprecated
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+zipp==3.21.0
+    # via importlib-metadata
diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh
index 278c7d96..94ea9436 100755
--- a/tgi-entrypoint.sh
+++ b/tgi-entrypoint.sh
@@ -2,4 +2,7 @@
 
 ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
 
-exec text-generation-launcher $@
+# Activate the uv-managed virtualenv; the path is relative to the working directory.
+source ./server/.venv/bin/activate
+# Quote "$@" so arguments containing spaces or glob characters are forwarded unchanged.
+exec text-generation-launcher "$@"