Cache the kernels in the Docker image

2025-09-11 04:14:52 +00:00 · 2025-02-03 11:57:36 +00:00 · 2025-02-03 11:57:36 +00:00 · c9191f3f2b
commit c9191f3f2b
parent b267caa537
3 changed files with 20 additions and 147 deletions
--- a/4
+++ b/4
@ -206,11 +206,13 @@ COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
+ENV HF_KERNELS_CACHE=/kernels
 RUN cd server && \
    pip install -U pip uv && \
 	uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \
    . ./.venv/bin/activate && \
-    make gen-server-raw
+    make gen-server-raw && \
+    hf-kernels download .

 RUN cd server && \
    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -14,6 +14,8 @@ dependencies = [
    "grpcio>=1.67.0",
    "grpcio-reflection>=1.67.0",
    "grpcio-status>=1.67.0",
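+    # "hf-kernels>=0.1.3",  # released pin, disabled; presumably to be restored once the hub-cache-env branch is released (TODO confirm)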
+    "hf-kernels@git+https://github.com/huggingface/hf-kernels.git@hub-cache-env",
    "hf-transfer>=0.1.8",
    "loguru>=0.7.3",
    "numpy>=1.26,<3",
@ -34,7 +36,7 @@ dependencies = [
 ]

 [build-system]
-requires = ["hf-kernels", "setuptools"]
+requires = ["hf-kernels>=0.1.2", "setuptools"]
 build-backend = "setuptools.build_meta"

 [tool.kernels.dependencies]
@ -69,7 +71,6 @@ quantize = [
    "texttable>=1.6.7,<2",
    "datasets>=2.21,<3",
 ]
-attention = [ "attention-kernels" ]
 gen = [
    "grpcio-tools>=1.69.0",
    "mypy-protobuf>=3.6.0",
--- a/server/uv.lock
+++ b/server/uv.lock
@ -168,20 +168,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 },
 ]

-[[package]]
-name = "attention-kernels"
-version = "0.2.0.post2"
-source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }
-dependencies = [
-    { name = "torch" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:863e02dda4b30e9d04ef6cf4d17d16c154f54bdcb8a8b87b8b46075eabf62d25" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
 [[package]]
 name = "attrs"
 version = "24.3.0"
@ -676,6 +662,17 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/64/51/f6b198152399d17247d962340947728fb1b06da6bc0c0a542446b2ffee49/grpcio_tools-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d47abf7e0662dd5dbb9cc252c3616e5fbc5f71d34e3f6332cd24bcdf2940abd", size = 1114931 },
 ]

+[[package]]
+name = "hf-kernels"
+version = "0.1.3"
+source = { git = "https://github.com/huggingface/hf-kernels.git?rev=hub-cache-env#a145c75733809d4426e4b716ff201867ac6452bc" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "packaging" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+    { name = "torch" },
+]
+
 [[package]]
 name = "hf-transfer"
 version = "0.1.9"
@ -906,86 +903,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506 },
 ]

-[[package]]
-name = "marlin-kernels"
-version = "0.3.7"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.13'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/b2/82/886d1eece474ef23668c4780f5053ea654999704a0195aadc651631b740d/marlin-kernels-0.3.7.tar.gz", hash = "sha256:8be8a65fd9ae21b2406afba9e460e3922582479b85a1372096e87e3a15684a77", size = 15662 }
-
-[[package]]
-name = "marlin-kernels"
-version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.10.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.10.*'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:dd91a4e2c3b5e954833c5c34b0322e4c02cd92a967eb94654b6bbcece131340b" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "marlin-kernels"
-version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.11.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.11.*'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:b24d92135fbd156c55ce43158ab4a90fa880ba0df965528895cf1870b03a64bf" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "marlin-kernels"
-version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.12.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.12.*'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:8a407f1435a571a8d4ca3b9f533da83fde323043a9836b739cf8018c77782d49" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "marlin-kernels"
-version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version < '3.10'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:bf7003753c364c504b3998fffdfcf619a42ab04f908903dbad8d54347b6b142b" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
 [[package]]
 name = "mdurl"
 version = "0.1.2"
@ -995,26 +912,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
 ]

-[[package]]
-name = "moe-kernels"
-version = "0.8.2"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }
-dependencies = [
-    { name = "nvidia-ml-py" },
-    { name = "torch" },
-    { name = "triton" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:1ed5b26f52339d25ea2513e99e8b6239cf1921af3eac54e03a46bb8f8efb380b" },
-]
-
-[package.metadata]
-requires-dist = [
-    { name = "nvidia-ml-py" },
-    { name = "torch" },
-    { name = "triton" },
-]
-
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@ -1389,15 +1286,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
 ]

-[[package]]
-name = "nvidia-ml-py"
-version = "12.560.30"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/53/10/5f34de4a71db8b2b7ec4269f4a33287f24c23e2857ea3187c977b7bc3604/nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97", size = 39194 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b7/f3/a69ce0b1a1e12fbf6b2ad9f4c14c9999fdbdf15f2478d210f0fd501ddc98/nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73", size = 40526 },
-]
-
 [[package]]
 name = "nvidia-nccl-cu12"
 version = "2.21.5"
@ -2653,6 +2541,7 @@ dependencies = [
    { name = "grpcio" },
    { name = "grpcio-reflection" },
    { name = "grpcio-status" },
+    { name = "hf-kernels" },
    { name = "hf-transfer" },
    { name = "loguru" },
    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@ -2678,9 +2567,6 @@ dependencies = [
 accelerate = [
    { name = "accelerate" },
 ]
-attention = [
-    { name = "attention-kernels" },
-]
 bnb = [
    { name = "bitsandbytes" },
 ]
@ -2695,16 +2581,6 @@ gen = [
    { name = "grpcio-tools" },
    { name = "mypy-protobuf" },
 ]
-marlin = [
-    { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
-]
-moe = [
-    { name = "moe-kernels" },
-]
 outlines = [
    { name = "outlines" },
 ]
@ -2719,7 +2595,6 @@ quantize = [
 [package.metadata]
 requires-dist = [
    { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" },
-    { name = "attention-kernels", marker = "extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" },
    { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" },
    { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" },
    { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" },
@ -2730,14 +2605,9 @@ requires-dist = [
    { name = "grpcio-status", specifier = ">=1.67.0" },
    { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" },
    { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" },
+    { name = "hf-kernels", git = "https://github.com/huggingface/hf-kernels.git?rev=hub-cache-env" },
    { name = "hf-transfer", specifier = ">=0.1.8" },
    { name = "loguru", specifier = ">=0.7.3" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" },
-    { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version >= '3.13' and extra == 'marlin')" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" },
-    { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" },
    { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
    { name = "numpy", specifier = ">=1.26,<3" },
    { name = "opentelemetry-api", specifier = ">=1.27.0" },
@ -2919,7 +2789,7 @@ name = "triton"
 version = "3.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "filelock" },
+    { name = "filelock", marker = "python_full_version < '3.13'" },
 ]
 wheels = [
    { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 },