diff --git a/Dockerfile b/Dockerfile index a963db2f..6bdefe9c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -206,11 +206,13 @@ COPY proto proto COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 +ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ pip install -U pip uv && \ uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \ . ./.venv/bin/activate && \ - make gen-server-raw + make gen-server-raw && \ + hf-kernels download . RUN cd server && \ uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \ diff --git a/server/pyproject.toml b/server/pyproject.toml index f7c9d175..18fe9753 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -14,6 +14,8 @@ dependencies = [ "grpcio>=1.67.0", "grpcio-reflection>=1.67.0", "grpcio-status>=1.67.0", + #"hf-kernels>=0.1.3", + "hf-kernels@git+https://github.com/huggingface/hf-kernels.git@hub-cache-env", "hf-transfer>=0.1.8", "loguru>=0.7.3", "numpy>=1.26,<3", @@ -34,7 +36,7 @@ dependencies = [ ] [build-system] -requires = ["hf-kernels", "setuptools"] +requires = ["hf-kernels>=0.1.2", "setuptools"] build-backend = "setuptools.build_meta" [tool.kernels.dependencies] @@ -69,7 +71,6 @@ quantize = [ "texttable>=1.6.7,<2", "datasets>=2.21,<3", ] -attention = [ "attention-kernels" ] gen = [ "grpcio-tools>=1.69.0", "mypy-protobuf>=3.6.0", diff --git a/server/uv.lock b/server/uv.lock index 5410a58c..cc78afd0 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -168,20 +168,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 
}, ] -[[package]] -name = "attention-kernels" -version = "0.2.0.post2" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } -dependencies = [ - { name = "torch" }, -] -wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:863e02dda4b30e9d04ef6cf4d17d16c154f54bdcb8a8b87b8b46075eabf62d25" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - [[package]] name = "attrs" version = "24.3.0" @@ -676,6 +662,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/51/f6b198152399d17247d962340947728fb1b06da6bc0c0a542446b2ffee49/grpcio_tools-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d47abf7e0662dd5dbb9cc252c3616e5fbc5f71d34e3f6332cd24bcdf2940abd", size = 1114931 }, ] +[[package]] +name = "hf-kernels" +version = "0.1.3" +source = { git = "https://github.com/huggingface/hf-kernels.git?rev=hub-cache-env#a145c75733809d4426e4b716ff201867ac6452bc" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "torch" }, +] + [[package]] name = "hf-transfer" version = "0.1.9" @@ -906,86 +903,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506 }, ] -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", -] -dependencies = [ - { name = "torch", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/b2/82/886d1eece474ef23668c4780f5053ea654999704a0195aadc651631b740d/marlin-kernels-0.3.7.tar.gz", hash = "sha256:8be8a65fd9ae21b2406afba9e460e3922582479b85a1372096e87e3a15684a77", size = 15662 } - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.10.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:dd91a4e2c3b5e954833c5c34b0322e4c02cd92a967eb94654b6bbcece131340b" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.11.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:b24d92135fbd156c55ce43158ab4a90fa880ba0df965528895cf1870b03a64bf" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.12.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.12.*'" }, -] -wheels = [ - { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:8a407f1435a571a8d4ca3b9f533da83fde323043a9836b739cf8018c77782d49" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "torch", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:bf7003753c364c504b3998fffdfcf619a42ab04f908903dbad8d54347b6b142b" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - [[package]] name = "mdurl" version = "0.1.2" @@ -995,26 +912,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, ] -[[package]] -name = "moe-kernels" -version = "0.8.2" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } -dependencies = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] -wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:1ed5b26f52339d25ea2513e99e8b6239cf1921af3eac54e03a46bb8f8efb380b" }, -] - -[package.metadata] -requires-dist = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -1389,15 +1286,6 @@ wheels = [ { 
url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] -[[package]] -name = "nvidia-ml-py" -version = "12.560.30" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/53/10/5f34de4a71db8b2b7ec4269f4a33287f24c23e2857ea3187c977b7bc3604/nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97", size = 39194 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/f3/a69ce0b1a1e12fbf6b2ad9f4c14c9999fdbdf15f2478d210f0fd501ddc98/nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73", size = 40526 }, -] - [[package]] name = "nvidia-nccl-cu12" version = "2.21.5" @@ -2653,6 +2541,7 @@ dependencies = [ { name = "grpcio" }, { name = "grpcio-reflection" }, { name = "grpcio-status" }, + { name = "hf-kernels" }, { name = "hf-transfer" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2678,9 +2567,6 @@ dependencies = [ accelerate = [ { name = "accelerate" }, ] -attention = [ - { name = "attention-kernels" }, -] bnb = [ { name = "bitsandbytes" }, ] @@ -2695,16 +2581,6 @@ gen = [ { name = "grpcio-tools" }, { name = "mypy-protobuf" }, ] -marlin = [ - { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "marlin-kernels", 
version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, -] -moe = [ - { name = "moe-kernels" }, -] outlines = [ { name = "outlines" }, ] @@ -2719,7 +2595,6 @@ quantize = [ [package.metadata] requires-dist = [ { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" }, - { name = "attention-kernels", marker = "extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" }, { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" }, { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" }, @@ -2730,14 +2605,9 @@ requires-dist = [ { name = "grpcio-status", specifier = ">=1.67.0" }, { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, + { name = "hf-kernels", git = "https://github.com/huggingface/hf-kernels.git?rev=hub-cache-env" }, { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "loguru", specifier = ">=0.7.3" }, - { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version >= '3.13' and extra == 'marlin')" }, - { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, { name = "opentelemetry-api", specifier = ">=1.27.0" }, @@ -2919,7 +2789,7 @@ name = "triton" version = "3.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, + { name = "filelock", marker = "python_full_version < '3.13'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 },