Update to attention-kernels 0.2.0 (#2950)

This version removes our patches and custom API, which makes it simpler to pick up changes from upstream. One result is that we can now enable the FP8 KV cache for paged attention on CUDA as well.
2025-09-08 19:04:52 +00:00 · 2025-01-27 11:42:36 +01:00 · 2025-01-27 11:42:36 +01:00 · db922eb77e
commit db922eb77e
parent 40b00275b2
6 changed files with 87 additions and 146 deletions
--- a/flake.lock
+++ b/flake.lock
@ -853,11 +853,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1737512878,
-        "narHash": "sha256-dgF6htdmfNnZzVInifks6npnCAyVsIHWSpWNs10RSW0=",
+        "lastModified": 1737685583,
+        "narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "06b8ed0eee289fe94c66f1202ced9a6a2c59a14c",
+        "rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a",
        "type": "github"
      },
      "original": {
@ -978,15 +978,16 @@
        "nixpkgs": "nixpkgs_6"
      },
      "locked": {
-        "lastModified": 1737540114,
-        "narHash": "sha256-ubowOFdG8pAodwuxzWHLIoQJtGXJTlb4RtISVVY3Tx0=",
+        "lastModified": 1737715219,
+        "narHash": "sha256-oIxoNreSeSILjWxcZHXW3cdcoNQHnXO5deXoIiC1tng=",
        "owner": "huggingface",
        "repo": "text-generation-inference-nix",
-        "rev": "d18053189cc5ce4111250da841df101c8cdf13ff",
+        "rev": "b91a56628f446c6cb79d224f17c1c66fe1a260f6",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
+        "ref": "attention-kernels-0.2.0",
        "repo": "text-generation-inference-nix",
        "type": "github"
      }
--- a/flake.nix
+++ b/flake.nix
@ -5,7 +5,7 @@
      inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
    };
    nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/attention-kernels-0.2.0";
    nixpkgs.follows = "tgi-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -68,12 +68,7 @@ gen = [
 ]

 [tool.uv.sources]
-attention-kernels = [
-  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
-  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
-  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
-  { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
-]
+attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
 marlin-kernels = [
  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@ -111,6 +111,8 @@ def paged_attention(

        out = torch.empty_like(query)

+        kv_cache_dtype = "fp8" if kv_cache.dtype == torch.float8_e4m3fn else "auto"
+
        use_v1 = max_s <= 8192 and (
            max_num_partitions == 1 or num_seqs * num_heads > 512
        )
@ -120,15 +122,16 @@ def paged_attention(
                query,
                kv_cache.key,
                kv_cache.value,
-                kv_head_mapping,
+                kv_cache.key.shape[1],
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
-                "auto",
-                1.0,
+                kv_cache_dtype,
+                kv_scales.key_scale_cpu,
+                kv_scales.value_scale_cpu,
            )
        else:
            # Run PagedAttention V2.
@ -153,15 +156,16 @@ def paged_attention(
                query,
                kv_cache.key,
                kv_cache.value,
-                kv_head_mapping,
+                kv_cache.key.shape[1],
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
-                "auto",
-                1.0,
+                kv_cache_dtype,
+                kv_scales.key_scale_cpu,
+                kv_scales.value_scale_cpu,
            )
    return out

--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@ -55,10 +55,10 @@ class KVCache:
        if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
            if not (
                (ATTENTION == "flashinfer" and SYSTEM == "cuda")
-                or (ATTENTION == "paged" and SYSTEM == "rocm")
+                or (ATTENTION == "paged" and SYSTEM in ("cuda", "rocm"))
            ):
                raise ValueError(
-                    "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on ROCm. "
+                    "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on CUDA and ROCm. "
                )
            if SYSTEM == "rocm" and dtype == torch.float8_e5m2:
                raise ValueError(
@ -226,8 +226,13 @@ def paged_reshape_and_cache(
            raise ImportError(
                f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
            )
+
+        kv_cache_dtype = "auto"
+        if key_cache.dtype == torch.float8_e4m3fn:
+            kv_cache_dtype = "fp8"
+
        attention_kernels.reshape_and_cache(
-            key, value, key_cache, value_cache, slots, "auto", 1.0
+            key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale
        )
    elif SYSTEM == "rocm":
        try:
--- a/server/uv.lock
+++ b/server/uv.lock
@ -170,79 +170,13 @@ wheels = [

 [[package]]
 name = "attention-kernels"
-version = "0.1.1"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.13'",
-]
+version = "0.2.0.post2"
+source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }
 dependencies = [
-    { name = "torch", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/0a/49/1847d04522158767065d24382b6fe2e0540a63f222113e273745f77ee2c5/attention-kernels-0.1.1.tar.gz", hash = "sha256:aff84a6e61e4720c14a2c2a62242b21c271d5d36bb6a0f0f017b762611b2d477", size = 49652 }
-
-[[package]]
-name = "attention-kernels"
-version = "0.1.1"
-source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.10.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.10.*'" },
+    { name = "torch" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "attention-kernels"
-version = "0.1.1"
-source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.11.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.11.*'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "attention-kernels"
-version = "0.1.1"
-source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version == '3.12.*'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version == '3.12.*'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "torch" }]
-
-[[package]]
-name = "attention-kernels"
-version = "0.1.1"
-source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }
-resolution-markers = [
-    "python_full_version < '3.10'",
-]
-dependencies = [
-    { name = "torch", marker = "python_full_version < '3.10'" },
-]
-wheels = [
-    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf" },
+    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:863e02dda4b30e9d04ef6cf4d17d16c154f54bdcb8a8b87b8b46075eabf62d25" },
 ]

 [package.metadata]
@ -987,7 +921,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/b2/82/886d1eece474ef236
 [[package]]
 name = "marlin-kernels"
 version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.10.*'",
 ]
@ -995,7 +929,7 @@ dependencies = [
    { name = "torch", marker = "python_full_version == '3.10.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:bb416d14623dc0ad0eeb2835446c37a41f994555f1baec8701de6d4c1fc17ec8" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:dd91a4e2c3b5e954833c5c34b0322e4c02cd92a967eb94654b6bbcece131340b" },
 ]

 [package.metadata]
@ -1004,7 +938,7 @@ requires-dist = [{ name = "torch" }]
 [[package]]
 name = "marlin-kernels"
 version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.11.*'",
 ]
@ -1012,7 +946,7 @@ dependencies = [
    { name = "torch", marker = "python_full_version == '3.11.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a89bb61d718002d4432158641bce95c6fd68f9ee1a7d5402dd283903397f3185" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:b24d92135fbd156c55ce43158ab4a90fa880ba0df965528895cf1870b03a64bf" },
 ]

 [package.metadata]
@ -1021,7 +955,7 @@ requires-dist = [{ name = "torch" }]
 [[package]]
 name = "marlin-kernels"
 version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.12.*'",
 ]
@ -1029,7 +963,7 @@ dependencies = [
    { name = "torch", marker = "python_full_version == '3.12.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:ed938d196fc5e9cce9fc44cd2b889d5adc5ca7475c8a23858f1474d29e38bdbf" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:8a407f1435a571a8d4ca3b9f533da83fde323043a9836b739cf8018c77782d49" },
 ]

 [package.metadata]
@ -1038,7 +972,7 @@ requires-dist = [{ name = "torch" }]
 [[package]]
 name = "marlin-kernels"
 version = "0.3.7"
-source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version < '3.10'",
 ]
@ -1046,7 +980,7 @@ dependencies = [
    { name = "torch", marker = "python_full_version < '3.10'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:113c54f68565ad476ca12366b4de92131fa3e9ddb16cbe8ad63272972a15ac28" },
+    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:bf7003753c364c504b3998fffdfcf619a42ab04f908903dbad8d54347b6b142b" },
 ]

 [package.metadata]
@ -1078,7 +1012,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/9c/a1/76f32a7ce5b18e5b8
 [[package]]
 name = "moe-kernels"
 version = "0.7.0"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.10.*'",
 ]
@ -1088,7 +1022,7 @@ dependencies = [
    { name = "triton", marker = "python_full_version == '3.10.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f8c126395f11522881c6bf1f6120e3670822006a84e2ff74af561c22445746b3" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:242d5de087902aa84dff54b6ba140b4066904fe5e7757f934645343b052ab076" },
 ]

 [package.metadata]
@ -1101,7 +1035,7 @@ requires-dist = [
 [[package]]
 name = "moe-kernels"
 version = "0.7.0"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.11.*'",
 ]
@ -1111,7 +1045,7 @@ dependencies = [
    { name = "triton", marker = "python_full_version == '3.11.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:2afff8346251f01d5d90bab738e3dfaa6b14a414a9c88205d396ab2bae87983a" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:8198a3388a03a3248d5f5698097e8ce0a73b6a01f9854fc2338aacc57e554e8a" },
 ]

 [package.metadata]
@ -1124,7 +1058,7 @@ requires-dist = [
 [[package]]
 name = "moe-kernels"
 version = "0.7.0"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version == '3.12.*'",
 ]
@ -1134,7 +1068,7 @@ dependencies = [
    { name = "triton", marker = "python_full_version == '3.12.*'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:b1a29e33d3b7d85e2b4f8bd47db28211096d1f645e0868d5a1f3666ebb9bd9e3" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:b35fb02ae560b560f4af107791a3308dc97d5ca57d39bab20acec3a0f082ccf2" },
 ]

 [package.metadata]
@ -1147,7 +1081,7 @@ requires-dist = [
 [[package]]
 name = "moe-kernels"
 version = "0.7.0"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }
+source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }
 resolution-markers = [
    "python_full_version < '3.10'",
 ]
@ -1157,7 +1091,7 @@ dependencies = [
    { name = "triton", marker = "python_full_version < '3.10'" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:9573611174cda9f6fafa1816521e38582fd2903b321bbaf78f83cf6e3189ac7d" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:cf8d276deb7a4d40fed3eb02e1b6f8d08ccec0f4256260922af13927ca044f56" },
 ]

 [package.metadata]
@ -1460,6 +1394,7 @@ name = "nvidia-cublas-cu12"
 version = "12.4.5.8"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 },
    { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 },
 ]

@ -1468,6 +1403,7 @@ name = "nvidia-cuda-cupti-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 },
    { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 },
 ]

@ -1476,6 +1412,7 @@ name = "nvidia-cuda-nvrtc-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 },
    { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 },
 ]

@ -1484,6 +1421,7 @@ name = "nvidia-cuda-runtime-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 },
    { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 },
 ]

@ -1506,6 +1444,7 @@ dependencies = [
    { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 },
    { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
 ]

@ -1514,6 +1453,7 @@ name = "nvidia-curand-cu12"
 version = "10.3.5.147"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 },
    { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 },
 ]

@ -1527,6 +1467,7 @@ dependencies = [
    { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 },
    { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
 ]

@ -1538,6 +1479,7 @@ dependencies = [
    { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 },
    { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
 ]

@ -1563,6 +1505,7 @@ name = "nvidia-nvjitlink-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 },
    { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 },
 ]

@ -1571,6 +1514,7 @@ name = "nvidia-nvtx-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 },
    { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 },
 ]

@ -2830,11 +2774,7 @@ accelerate = [
    { name = "accelerate" },
 ]
 attention = [
-    { name = "attention-kernels", version = "0.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
-    { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
-    { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
-    { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
+    { name = "attention-kernels" },
 ]
 bnb = [
    { name = "bitsandbytes" },
@ -2852,17 +2792,17 @@ gen = [
 ]
 marlin = [
    { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
-    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
+    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
+    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
+    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
+    { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
 ]
 moe = [
    { name = "moe-kernels", version = "0.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
-    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
-    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
-    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
+    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" },
+    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" },
+    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" },
+    { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" },
 ]
 outlines = [
    { name = "outlines" },
@ -2878,51 +2818,47 @@ quantize = [
 [package.metadata]
 requires-dist = [
    { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" },
-    { name = "attention-kernels", marker = "python_full_version == '3.9.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" },
-    { name = "attention-kernels", marker = "(python_full_version < '3.9' and extra == 'attention') or (python_full_version >= '3.13' and extra == 'attention')" },
-    { name = "attention-kernels", marker = "python_full_version == '3.10.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" },
-    { name = "attention-kernels", marker = "python_full_version == '3.11.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" },
-    { name = "attention-kernels", marker = "python_full_version == '3.12.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" },
+    { name = "attention-kernels", marker = "extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" },
    { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" },
    { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" },
    { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" },
    { name = "einops", specifier = ">=0.8.0" },
    { name = "grpc-interceptor", specifier = ">=0.15.4" },
-    { name = "grpcio", specifier = ">=1.69.0" },
-    { name = "grpcio-reflection", specifier = ">=1.69.0" },
-    { name = "grpcio-status", specifier = ">=1.69.0" },
+    { name = "grpcio", specifier = ">=1.67.0" },
+    { name = "grpcio-reflection", specifier = ">=1.67.0" },
+    { name = "grpcio-status", specifier = ">=1.67.0" },
    { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" },
    { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" },
-    { name = "hf-transfer", specifier = ">=0.1.9" },
+    { name = "hf-transfer", specifier = ">=0.1.8" },
    { name = "loguru", specifier = ">=0.7.3" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" },
+    { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" },
    { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version >= '3.13' and extra == 'marlin')" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" },
-    { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" },
-    { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" },
+    { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" },
+    { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" },
+    { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" },
+    { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" },
    { name = "moe-kernels", marker = "(python_full_version < '3.9' and extra == 'moe') or (python_full_version >= '3.13' and extra == 'moe')" },
-    { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" },
-    { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" },
-    { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" },
+    { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" },
+    { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" },
+    { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" },
    { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
-    { name = "numpy", specifier = ">=2.0.2" },
-    { name = "opentelemetry-api", specifier = ">=1.29.0" },
-    { name = "opentelemetry-exporter-otlp", specifier = ">=1.29.0" },
+    { name = "numpy", specifier = ">=1.26,<3" },
+    { name = "opentelemetry-api", specifier = ">=1.27.0" },
+    { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" },
    { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" },
    { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" },
    { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" },
    { name = "pillow", specifier = ">=11.1.0" },
-    { name = "prometheus-client", specifier = ">=0.21.1" },
-    { name = "protobuf", specifier = ">=5.29.3" },
+    { name = "prometheus-client", specifier = ">=0.21.0" },
+    { name = "protobuf", specifier = ">=5.28.3" },
    { name = "py-cpuinfo", specifier = ">=9.0.0" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.3.0,<8" },
-    { name = "rich", specifier = ">=13.9.4" },
-    { name = "safetensors", specifier = ">=0.5.2" },
+    { name = "rich", specifier = ">=13.8.1" },
+    { name = "safetensors", specifier = ">=0.4.5" },
    { name = "scipy", specifier = ">=1.13.1" },
    { name = "sentencepiece", specifier = ">=0.2.0" },
    { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" },
-    { name = "tokenizers", specifier = ">=0.21.0" },
+    { name = "tokenizers", specifier = ">=0.20.3" },
    { name = "typer", specifier = ">=0.15.1" },
 ]