From 96a4d4d083e0ef71cfbaf84e45d5d4af35ed9ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 5 Feb 2025 15:39:04 +0000 Subject: [PATCH] attention -> paged-attention --- flake.lock | 6 +- nix/server.nix | 4 +- server/hf-kernels.lock | 222 +++++++++--------- server/pyproject.toml | 2 +- .../layers/attention/cuda.py | 10 +- .../layers/attention/kv_cache.py | 8 +- 6 files changed, 126 insertions(+), 126 deletions(-) diff --git a/flake.lock b/flake.lock index 78a35a6d..3a9d9c7c 100644 --- a/flake.lock +++ b/flake.lock @@ -978,11 +978,11 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738752518, - "narHash": "sha256-+Cm517pJIgUJ2jMwQyR7qZ96u410eHMk3rTarHXkbug=", + "lastModified": 1738769628, + "narHash": "sha256-hgHf1mscFbH9XtT3dYtFQcxRfict9N+Vi6QSW1c+FjU=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "c9b5c8e48b96961125ada3075e21074844740fe1", + "rev": "9a5a58219dead9704d83d9d32f105b6b90bd31f2", "type": "github" }, "original": { diff --git a/nix/server.nix b/nix/server.nix index a7c8c799..b638449b 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -3,7 +3,6 @@ buildPythonPackage, poetry-core, mypy-protobuf, - attention, awq-inference-engine, causal-conv1d, compressed-tensors, @@ -29,6 +28,7 @@ opentelemetry-instrumentation-grpc, opentelemetry-semantic-conventions, outlines, + paged-attention, peft, pillow, prometheus-client, @@ -79,7 +79,6 @@ buildPythonPackage { pythonRemoveDeps = [ "scipy" ]; dependencies = [ - attention awq-inference-engine eetq causal-conv1d @@ -104,6 +103,7 @@ buildPythonPackage { opentelemetry-instrumentation-grpc opentelemetry-semantic-conventions outlines + paged-attention peft pillow prometheus-client diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock index 40062d4e..ec4d0604 100644 --- a/server/hf-kernels.lock +++ b/server/hf-kernels.lock @@ -1,246 +1,246 @@ [ { - "repo_id": "kernels-community/attention", - "sha": "20100e6a97f0fa1465560aa21eecbf4b04d3d93a", + "repo_id": "kernels-community/paged-attention", + "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", "files": [ { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/attention/_attention_6yvgebnqctora.abi3.so", - "blob_id": "29733cfb726d11a1d278fb0f3679c010cf5210e2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/attention/_ops.py", - "blob_id": "1379d7cc10c5fafa877e3ea73be33d3eed57b449" + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "609570440c63122010e6254ac2f92d4e4e52ec02" }, { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_fao6f4gjjrpl6.abi3.so", + "blob_id": "a4e60f2c567eb63c84430e9b80acaa0aa6974b1e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/attention/_attention_4jg2igd54wzge.abi3.so", - "blob_id": "a58d380aa758b8e6842e89013229bee3711286ef" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/attention/_ops.py", - "blob_id": "9dee16955e9d988953733fae4e743d92886c92b1" + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_ops.py", + "blob_id": "9e52382b912b4e2d07f84982f762345debdbbfc8" }, { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_paged_attention_eo7ts45r6k64y.abi3.so", + "blob_id": "c20f9501a41daa820dfda27434674d032931b51e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/attention/_attention_syg6kbhkhc4xk.abi3.so", - "blob_id": "369150e0964eaca52c0c7906addf9f18d8ec7270" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/attention/_ops.py", - "blob_id": "0bac0403831e313bcf9cbab1a35c2cbe4d5ef08f" + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "5f01e3f8c4ae3a031f109f78e010014d34347647" }, { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_5odgyxqhwqtv2.abi3.so", + "blob_id": "74f9714690337f49661c641a4f60f6e1e1f56cfa" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/attention/_attention_hhzgzhvc7zviy.abi3.so", - "blob_id": "05529e8bcee239db92984acb3e19926697c64a3f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/attention/_ops.py", - "blob_id": "270fd3d0005a3e44dc6625c3ab4948a7fa7892bb" + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "a3016a6b1cd7ae051012084bbd39d6f2e0913ace" }, { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_uy2moinaww2jc.abi3.so", + "blob_id": "445652acd4719542710cda86a2d08c70a56c8094" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/attention/_attention_gbi5gm244waic.abi3.so", - "blob_id": "cb6cccabe445cbf7bfd797b4645300e5a2a4ec38" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/attention/_ops.py", - "blob_id": "a517876400c08f9800107c61d6ca3f57e0bdc2e6" + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_ops.py", + "blob_id": "e2cd992a80d4b938f243f0e6060e863278aca7f6" }, { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_paged_attention_35dt23tewn2p2.abi3.so", + "blob_id": "1f6414c382a753edb7512927ac5f3e31b196531d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/attention/__init__.py", + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/attention/_attention_ill75rmpj7yds.abi3.so", - "blob_id": "bf93abf5555357ad397844421fcfc66ae0743166" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/attention/_ops.py", - "blob_id": "f49b90de8bda122b2049bf57f5012b60e05364fe" + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "150412d67365be8ae5668f83d1939148bb576050" }, { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/attention/platforms.py", + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_fhq57q56w3m5o.abi3.so", + "blob_id": "ee97eee26a4de8d14d7ccdadaf406eed8405de39" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/attention/_attention_6qe5ft3kiteru.abi3.so", - "blob_id": "0bbd1dc682174c9d7fba2ee7426e1183e668ab79" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/attention/_ops.py", - "blob_id": "f9b2a39308433746718b31f0d9830b27f72f5242" + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "2bfef111c96308e595eb628bc88ab660a443089c" }, { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_xvepb4loq5mm2.abi3.so", + "blob_id": "1ea51bd49f8ec76bbe306a261021da52fe6a980f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/attention/_attention_ftq3cjdxqfw4m.abi3.so", - "blob_id": "d7fa42c3682924a46e9c5b4a7e847a6b4415c5c8" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/attention/_ops.py", - "blob_id": "27b44593d2252bfe5399c8dcd883aa497223f158" + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "8928daeec47128544cef187bf18f214fc2238019" }, { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_uyfdujhnc2xoe.abi3.so", + "blob_id": "cf8ebe40f27db0fa87c46d7b4066494e65843820" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/attention/_attention_lkibbjh726iwm.abi3.so", - "blob_id": "4a4cccfd49090ac213bbf562a9c4bb2ff2920eb0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/attention/_ops.py", - "blob_id": "ac89377661ed1c5f2eca40cf199a15209af0c05c" + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_ops.py", + "blob_id": "dff8537df63e1ef37769a6b7ba6b8c58192d7faa" }, { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_paged_attention_pervvqmod6pi4.abi3.so", + "blob_id": "77eb42e3471e9aa84d1f5d9854995c9737ed6bf3" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/attention/_attention_vbhagz24hyij6.abi3.so", - "blob_id": "4d87629674e87a746aaec4ccadb26bb2a72f2d43" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/attention/_ops.py", - "blob_id": "2f05f1ffd05c49971dfc9b45971efb5a055c7e52" + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "543c64d1589cb1747d7dc1ac29bd8f2cbeb61ab7" }, { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_24rowhxd5ebcc.abi3.so", + "blob_id": "43ec3529d8eac816c31cc1eaad4cc2baa3cbd3d6" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/attention/_attention_sfjvhlixssyce.abi3.so", - "blob_id": "ee6153972f28bd997e1fc4a7eaaf425fd5adc918" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/attention/_ops.py", - "blob_id": "530d483cdf8243f6c863ca49c0e87018634e69d0" + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "1d62b9bb1cfb040d7f68cd108ac9067100b4cf2d" }, { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_5yleoqr3zje4w.abi3.so", + "blob_id": "ffed60cc0a3948bdea6aa7fb4d486d9b943215ec" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" }, { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/attention/__init__.py", + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/__init__.py", "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" }, { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/attention/_attention_g7oqtcveiuapk.abi3.so", - "blob_id": "fe58b4ce4158bf5ee55371329396ac8e573cfc85" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/attention/_custom_ops.py", + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_custom_ops.py", "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" }, { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/attention/_ops.py", - "blob_id": "1e504e67dd25c4aa79bcc509316f3f23e6e3e6ef" + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_ops.py", + "blob_id": "ee817d13be64b46e3cb44ad192af4a5f3817bbf7" }, { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/attention/platforms.py", + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_paged_attention_3rbp7xipfucgo.abi3.so", + "blob_id": "5d5b3ffda2fd6a830d12341bab26dc5ec03f4a86" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/platforms.py", "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" } ] @@ -6737,4 +6737,4 @@ } ] } -] +] \ No newline at end of file diff --git a/server/pyproject.toml b/server/pyproject.toml index 3365940d..3f02b4ec 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -39,7 +39,7 @@ requires = ["hf-kernels>=0.1.2", "setuptools"] build-backend = "setuptools.build_meta" [tool.kernels.dependencies] -"kernels-community/attention" = ">=0.0.1" +"kernels-community/paged-attention" = ">=0.0.2" "kernels-community/moe" = ">=0.1.1" "kernels-community/quantization" = ">=0.0.3" diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index a3afd422..4f25cc19 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -16,15 +16,15 @@ _PARTITION_SIZE = 512 if SYSTEM == "cuda": try: - attention_kernels = load_kernel( - module="attention", repo_id="kernels-community/attention" + paged_attention_kernels = load_kernel( + module="paged_attention", repo_id="kernels-community/paged-attention" ) except Exception as e: raise ImportError( f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}" ) else: - attention_kernels = None + paged_attention_kernels = None def paged_attention( @@ -129,7 +129,7 @@ def paged_attention( max_num_partitions == 1 or num_seqs * num_heads > 512 ) if use_v1: - attention_kernels.paged_attention_v1( + paged_attention_kernels.paged_attention_v1( out, query, kv_cache.key, @@ -160,7 +160,7 @@ def paged_attention( ) max_logits = torch.empty_like(exp_sums) - attention_kernels.paged_attention_v2( + paged_attention_kernels.paged_attention_v2( out, exp_sums, max_logits, diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 6e7cb713..aaf4d2b2 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -13,15 +13,15 @@ from text_generation_server.utils.weights import Weights if SYSTEM == "cuda": try: - attention_kernels = load_kernel( - module="attention", repo_id="kernels-community/attention" + paged_attention = load_kernel( + module="paged_attention", repo_id="kernels-community/paged-attention" ) except Exception as e: raise ImportError( f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}" ) else: - attention_kernels = None + paged_attention = None @dataclass @@ -237,7 +237,7 @@ def paged_reshape_and_cache( if key_cache.dtype == torch.float8_e4m3fn: kv_cache_dtype = "fp8" - attention_kernels.reshape_and_cache( + paged_attention.reshape_and_cache( key, value, key_cache,