From c8b0eddf79b578d9173765feb8f5433905a800d2 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Thu, 6 Feb 2025 00:57:22 +0100
Subject: [PATCH] Install server with uv sync into a venv in the Docker image

---
 Dockerfile                  |  70 +++++++-------
 server/Makefile             |   3 +-
 server/requirements_gen.txt | 180 ++++++++++++++++++++++++++++++++++++
 tgi-entrypoint.sh           |   1 +
 4 files changed, 218 insertions(+), 36 deletions(-)
 create mode 100644 server/requirements_gen.txt

diff --git a/Dockerfile b/Dockerfile
index 6ccc2e3a..b6618f77 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -195,52 +195,54 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         git \
         && rm -rf /var/lib/apt/lists/*
 
-# Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
-
-# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
-
 # Install flash-attention dependencies
 # RUN pip install einops --no-cache-dir
 
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
-    make gen-server && \
     pip install -U pip uv && \
-    uv pip install -r requirements_cuda.txt && \
-    uv pip install -e . && \
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
+    . ./.venv/bin/activate && \
+    make gen-server
+
+# Smoke test: the CLI must resolve from ./.venv. The previous RUN already
+# synced the same extras into that venv, so no second `uv sync` is needed.
+RUN cd server && \
+    . ./.venv/bin/activate && \
     text-generation-server --help
 
-    # uv sync --frozen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
-    # mv ./.venv/lib/python3.11/site-packages/* /opt/conda/lib/python3.11/site-packages/
-    # uv pip install nvidia-nccl-cu12==2.22.3
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from lorax punica kernels builder
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
+
 
 # ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
diff --git a/server/Makefile b/server/Makefile
index f80daed4..a97ef390 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -15,8 +15,6 @@ unit-tests:
 
 gen-server:
 	# Compile protos
-	pip install -U pip uv
-	uv pip install ".[gen]"
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -37,6 +35,7 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten
 install-rocm: install-server install-flash-attention-v2-rocm  install-vllm-rocm
 
 export-requirements:
+	uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
 	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11
diff --git a/server/requirements_gen.txt b/server/requirements_gen.txt
new file mode 100644
index 00000000..d9836ad7
--- /dev/null
+++ b/server/requirements_gen.txt
@@ -0,0 +1,180 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
+certifi==2025.1.31
+    # via requests
+charset-normalizer==3.4.1
+    # via requests
+click==8.1.8
+    # via typer
+deprecated==1.2.18
+    # via
+    #   opentelemetry-api
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-semantic-conventions
+einops==0.8.0
+    # via text-generation-server (pyproject.toml)
+filelock==3.17.0
+    # via
+    #   huggingface-hub
+    #   transformers
+fsspec==2025.2.0
+    # via huggingface-hub
+googleapis-common-protos==1.66.0
+    # via
+    #   grpcio-status
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+grpc-interceptor==0.15.4
+    # via text-generation-server (pyproject.toml)
+grpcio==1.70.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   grpc-interceptor
+    #   grpcio-reflection
+    #   grpcio-status
+    #   grpcio-tools
+    #   opentelemetry-exporter-otlp-proto-grpc
+grpcio-reflection==1.70.0
+    # via text-generation-server (pyproject.toml)
+grpcio-status==1.70.0
+    # via text-generation-server (pyproject.toml)
+grpcio-tools==1.70.0
+    # via text-generation-server (pyproject.toml)
+hf-transfer==0.1.9
+    # via text-generation-server (pyproject.toml)
+huggingface-hub==0.28.1
+    # via
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via requests
+importlib-metadata==8.5.0
+    # via opentelemetry-api
+loguru==0.7.3
+    # via text-generation-server (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+mypy-protobuf==3.6.0
+    # via text-generation-server (pyproject.toml)
+numpy==2.2.2
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   scipy
+    #   transformers
+opentelemetry-api==1.30.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+opentelemetry-exporter-otlp==1.30.0
+    # via text-generation-server (pyproject.toml)
+opentelemetry-exporter-otlp-proto-common==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-exporter-otlp-proto-grpc==1.30.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-exporter-otlp-proto-http==1.30.0
+    # via opentelemetry-exporter-otlp
+opentelemetry-instrumentation==0.51b0
+    # via opentelemetry-instrumentation-grpc
+opentelemetry-instrumentation-grpc==0.51b0
+    # via text-generation-server (pyproject.toml)
+opentelemetry-proto==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-common
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-sdk==1.30.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-exporter-otlp-proto-http
+opentelemetry-semantic-conventions==0.51b0
+    # via
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+    #   opentelemetry-sdk
+packaging==24.2
+    # via
+    #   huggingface-hub
+    #   opentelemetry-instrumentation
+    #   transformers
+pillow==11.1.0
+    # via text-generation-server (pyproject.toml)
+prometheus-client==0.21.1
+    # via text-generation-server (pyproject.toml)
+protobuf==5.29.3
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   googleapis-common-protos
+    #   grpcio-reflection
+    #   grpcio-status
+    #   grpcio-tools
+    #   mypy-protobuf
+    #   opentelemetry-proto
+py-cpuinfo==9.0.0
+    # via text-generation-server (pyproject.toml)
+pygments==2.19.1
+    # via rich
+pyyaml==6.0.2
+    # via
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   huggingface-hub
+    #   opentelemetry-exporter-otlp-proto-http
+    #   transformers
+rich==13.9.4
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   typer
+safetensors==0.5.2
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   transformers
+scipy==1.15.1
+    # via text-generation-server (pyproject.toml)
+sentencepiece==0.2.0
+    # via text-generation-server (pyproject.toml)
+setuptools==75.8.0
+    # via grpcio-tools
+shellingham==1.5.4
+    # via typer
+tokenizers==0.21.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   transformers
+tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   transformers
+transformers==4.48.2
+    # via text-generation-server (pyproject.toml)
+typer==0.15.1
+    # via text-generation-server (pyproject.toml)
+types-protobuf==5.29.1.20241207
+    # via mypy-protobuf
+typing-extensions==4.12.2
+    # via
+    #   huggingface-hub
+    #   opentelemetry-sdk
+    #   typer
+urllib3==2.3.0
+    # via requests
+wrapt==1.17.2
+    # via
+    #   deprecated
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+zipp==3.21.0
+    # via importlib-metadata
diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh
index 278c7d96..94ea9436 100755
--- a/tgi-entrypoint.sh
+++ b/tgi-entrypoint.sh
@@ -2,4 +2,5 @@
 
 ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
 
-exec text-generation-launcher $@
+. ./server/.venv/bin/activate  # activate the uv-managed venv before launching
+exec text-generation-launcher "$@"