From de079d607a3af4ae893c12c9ddc339e32b3fb95e Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 2 May 2024 15:09:46 +0200
Subject: [PATCH 01/12] Remove misleading warning (not that important nowadays anyway). (#1848)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 .../models/custom_modeling/neox_modeling.py   | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py
index 1b060060..f060ec0e 100644
--- a/server/text_generation_server/models/custom_modeling/neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -60,9 +60,6 @@ if (
     except ImportError:
         pass

-if not CUSTOM_KERNELS_ENABLED:
-    logger.warning("We're not using custom kernels.")
-

 def make_causal_mask(
     input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int

From 65539b743ee2d4dd9b85a9961c33f11b4f687566 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 2 May 2024 10:56:24 -0400
Subject: [PATCH 02/12] feat: prefer huggingface_hub in docs and show image api (#1844)

This PR prefers the `huggingface_hub` library, refactors the grammar docs and adds the new image_url api to the vlm docs.
---
 docs/source/basic_tutorials/using_guidance.md | 332 +++++++-----------
 .../basic_tutorials/visual_language_models.md |  62 +++-
 2 files changed, 197 insertions(+), 197 deletions(-)

diff --git a/docs/source/basic_tutorials/using_guidance.md b/docs/source/basic_tutorials/using_guidance.md
index 606f2453..d0008fdb 100644
--- a/docs/source/basic_tutorials/using_guidance.md
+++ b/docs/source/basic_tutorials/using_guidance.md
@@ -2,7 +2,7 @@

 Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.

-These feature are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
+These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!

_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._ @@ -74,6 +74,45 @@ curl localhost:3000/generate \ ``` +### Hugging Face Hub Python Library + +The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter. + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://localhost:3000") + +schema = { + "properties": { + "location": {"title": "Location", "type": "string"}, + "activity": {"title": "Activity", "type": "string"}, + "animals_seen": { + "maximum": 5, + "minimum": 1, + "title": "Animals Seen", + "type": "integer", + }, + "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"}, + }, + "required": ["location", "activity", "animals_seen", "animals"], + "title": "Animals", + "type": "object", +} + +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": schema}, +) + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + +``` + A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar. > Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is a computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster. @@ -83,134 +122,55 @@ A grammar can be defined using Pydantic models, JSON schemas, or regular express Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way. ```python -import requests +from huggingface_hub import InferenceClient from pydantic import BaseModel, conint from typing import List + class Animals(BaseModel): location: str activity: str animals_seen: conint(ge=1, le=5) # Constrained integer type animals: List[str] -prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park" -data = { - "inputs": prompt, - "parameters": { - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": Animals.schema() - } - } -} +client = InferenceClient("http://localhost:3000") -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": Animals.schema()}, ) -print(response.json()) -# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'} + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + ``` -### JSON Schema Integration - -If Pydantic's not your style, go raw with direct JSON Schema integration. This is similar to the first example but with programmatic control. 
+defining a grammar as regular expressions ```python -import requests +from huggingface_hub import InferenceClient -json_schema = { - "properties": { - "location": { - "type": "string" - }, - "activity": { - "type": "string" - }, - "animals_seen": { - "type": "integer", - "minimum": 1, - "maximum": 5 - }, - "animals": { - "type": "array", - "items": { - "type": "string" - } - } +client = InferenceClient("http://localhost:3000") + +regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)" + +resp = client.text_generation( + f"Whats Googles DNS? Please use the following regex: {regexp}", + seed=42, + grammar={ + "type": "regex", + "value": regexp, }, - "required": ["location", "activity", "animals_seen", "animals"] -} - -data = { - "inputs": "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park", - "parameters": { - "max_new_tokens": 200, - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": json_schema - } - } -} - -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data ) -print(response.json()) -# {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n , "animals_seen": 3,\n "location":"park"}'} -``` -### Using the client - -TGI provides a client library to that make it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter. - -```python -from text_generation import AsyncClient -from text_generation.types import GrammarType - -# NOTE: tools defined above and removed for brevity - -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.generate( - "Whats Googles DNS", - max_new_tokens=10, - decoder_input_details=True, - seed=1, - grammar={ - "type": GrammarType.Regex, - "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)", - }, - ) - - # Once the response is received, you can process it - print(response.generated_text) - -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# 118.8.0.84 +print(resp) +# 7.1.1.1 ``` @@ -265,107 +225,87 @@ curl localhost:3000/v1/chat/completions \ // {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}} ``` -### Text Generation Inference Client +### Chat Completion with Tools -TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions. +Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter. 
```python -from text_generation import AsyncClient +from huggingface_hub import InferenceClient -# NOTE: tools defined above and removed for brevity +client = InferenceClient("http://localhost:3000") -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.chat( - max_tokens=100, - seed=1, - tools=tools, - presence_penalty=-1.1, - messages=[ - { - "role": "system", - "content": "You're a helpful assistant! Answer the users question best you can.", +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + }, + "required": ["location", "format"], }, - { - "role": "user", - "content": "What is the weather like in Brooklyn, New York?", + }, + }, + { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast", + }, + }, + "required": ["location", "format", "num_days"], }, - ], - ) + }, + }, +] - # Once the response is received, you can process it - print(response.choices[0].message.tool_calls) +chat = client.chat_completion( + messages=[ + { + "role": "system", + "content": "You're a helpful assistant! Answer the users question best you can.", + }, + { + "role": "user", + "content": "What is the weather like in Brooklyn, New York?", + }, + ], + tools=tools, + seed=42, + max_tokens=100, +) -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}} +print(chat.choices[0].message.tool_calls) +# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')] ``` -
- Tools used in example above - -```python - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - }, - "required": ["location", "format"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_n_day_weather_forecast", - "description": "Get an N-day weather forecast", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - "num_days": { - "type": "integer", - "description": "The number of days to forecast", - }, - }, - "required": ["location", "format", "num_days"], - }, - }, - } - ] -``` - -
- ### OpenAI integration TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions. diff --git a/docs/source/basic_tutorials/visual_language_models.md b/docs/source/basic_tutorials/visual_language_models.md index e804ef09..3770db0b 100644 --- a/docs/source/basic_tutorials/visual_language_models.md +++ b/docs/source/basic_tutorials/visual_language_models.md @@ -53,7 +53,67 @@ for token in client.text_generation(prompt, max_new_tokens=10, stream=True): # This is a picture of an anthropomorphic rabbit in a space suit. ``` -If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text. +or via the `chat_completion` endpoint: + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://127.0.0.1:3000") + +chat = client.chat_completion( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + seed=42, + max_tokens=100, +) + +print(chat) +# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043)) + +``` + +or with OpenAi's library: + +```python +from openai import OpenAI + +# init the client but point it to TGI +client = OpenAI(base_url="http://localhost:3000/v1", api_key="-") + +chat_completion = client.chat.completions.create( + model="tgi", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + stream=False, +) + +print(chat_completion) +# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. 
', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027)) +``` ### Inference Through Sending `cURL` Requests From a25737139d302390edc40ee2d9d92109c7720c04 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 2 May 2024 19:07:10 +0200 Subject: [PATCH 03/12] Updating Phi3 (long context). (#1849) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- Cargo.lock | 8 +- router/src/config.rs | 1 + server/text_generation_server/utils/layers.py | 92 ++++++++++++++++++- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de54c536..3c6515ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3393,7 +3393,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "2.0.1" +version = "2.0.2" dependencies = [ "average", "clap", @@ -3414,7 +3414,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "2.0.1" +version = "2.0.2" dependencies = [ "futures", "grpc-metadata", @@ -3430,7 +3430,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "2.0.1" +version = "2.0.2" dependencies = [ "clap", "ctrlc", @@ -3448,7 +3448,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "2.0.1" +version = "2.0.2" dependencies = [ "async-stream", "axum", diff --git a/router/src/config.rs b/router/src/config.rs index 88cde69a..8640ede9 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -136,6 +136,7 @@ pub enum Config { Phi, #[serde(rename = "phi-msft")] PhiMsft, + Phi3, Llama, Baichuan, Gemma, diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 6e4a13cd..7d339fe5 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -1029,10 +1029,10 @@ try: scaling_factor = None rope_scaling = _get_rope_config(config) if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "linear": pass elif rope_scaling["type"] == "dynamic": + scaling_factor = rope_scaling["factor"] return DynamicPositionRotaryEmbedding( dim=dim, max_position_embeddings=config.max_position_embeddings, @@ -1041,6 +1041,7 @@ try: scaling_factor=scaling_factor, ) elif rope_scaling["type"] == "yarn": + scaling_factor = rope_scaling["factor"] return YarnPositionRotaryEmbedding( dim=2 * inv_freq.shape[0], 
max_position_embeddings=rope_scaling[ @@ -1054,6 +1055,52 @@ try: beta_fast=32, beta_slow=1, ) + elif rope_scaling["type"] == "su": + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = ( + max_position_embeddings / original_max_position_embeddings + ) + scaling_factor = math.sqrt( + 1 + + math.log(scale) + / math.log(original_max_position_embeddings) + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) else: raise NotImplementedError( f"rope scaling type {rope_scaling['type']} is not implemented or invalid" @@ -1141,6 +1188,49 @@ try: # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. return cos.unsqueeze(1), sin.unsqueeze(1) + class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + if seqlen > self.original_max_position_embeddings: + inv_freq = self.long_inv_freq + else: + inv_freq = self.short_inv_freq + t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): inv_freq = _create_inv_freq(dim, base, device) From bb2b2959a222594dd8a41b8bb18d7b6fe280730a Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 3 May 2024 16:39:04 +0200 Subject: [PATCH 04/12] Add router name to /info endpoint (#1854) Add `router` key in `/info` endpoint and set it to `env!("CARGO_PKG_NAME")` => so always set to `"text-generation-router"` in TGI. Happy to change the naming if you think of a better one (framework? package_name?) 
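For illustration only (a hedged sketch, not output captured from a running deployment), querying the endpoint would then surface the new key next to the existing version field; other fields are omitted:

```bash
curl -s localhost:3000/info
# hypothetical excerpt of the response once this change lands:
# { ..., "router": "text-generation-router", "version": "2.0.2", ... }
```
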
The goal is to use this information in `InferenceClient` to know the model is served with TGI. At the moment we can use https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2/info to infer it is TGI-served because it returns information but having a proper key would be better. For context, a transformers-served model is only outputting `{"ok": "ok"}` (see [here](https://api-inference.huggingface.co/models/microsoft/DialoGPT-large/info)). --- router/src/lib.rs | 2 ++ router/src/server.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/router/src/lib.rs b/router/src/lib.rs index fac4c14e..96a9fdf6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -159,6 +159,8 @@ pub struct Info { #[schema(example = "32")] pub max_client_batch_size: usize, /// Router Info + #[schema(example = "text-generation-router")] + pub router: &'static str, #[schema(example = "0.5.0")] pub version: &'static str, #[schema(nullable = true, example = "null")] diff --git a/router/src/server.rs b/router/src/server.rs index 8657b779..5dbde21e 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1564,6 +1564,7 @@ pub async fn run( max_batch_size, validation_workers, max_client_batch_size, + router: env!("CARGO_PKG_NAME"), version: env!("CARGO_PKG_VERSION"), sha: option_env!("VERGEN_GIT_SHA"), docker_label: option_env!("DOCKER_LABEL"), From ac7076b64db3c3896ca9f84e9484fb16788c7f2c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 6 May 2024 13:48:11 +0200 Subject: [PATCH 05/12] Upgrading to rust 1.78. (#1851) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
--- .github/workflows/tests.yaml | 7 +- Cargo.lock | 171 +++++++++++++++++--------------- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- benchmark/src/event.rs | 6 +- launcher/Cargo.toml | 1 + launcher/src/main.rs | 19 ++-- router/grpc-metadata/src/lib.rs | 23 +---- router/src/infer.rs | 4 +- router/src/server.rs | 13 +-- rust-toolchain.toml | 7 +- 12 files changed, 124 insertions(+), 133 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 29ff6d45..37dc8305 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -33,10 +33,9 @@ jobs: - name: Install Rust uses: actions-rs/toolchain@v1 with: - # Released on: 28 December, 2023 - # Branched from master on: 10 November, 2023 - # https://releases.rs/docs/1.75.0/ - toolchain: 1.75.0 + # Released on: 02 May, 2024 + # https://releases.rs/docs/1.78.0/ + toolchain: 1.78.0 override: true components: rustfmt, clippy - name: Install Protoc diff --git a/Cargo.lock b/Cargo.lock index 3c6515ad..138b6676 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,47 +48,48 @@ checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", "windows-sys 0.52.0", @@ -321,9 +322,9 @@ checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "base64" -version = "0.22.0" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bit-set" @@ -387,9 +388,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytecount" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" @@ -403,6 +404,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.6.0" @@ -449,12 +456,13 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" [[package]] name = "cc" -version = "1.0.94" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" +checksum = "065a29261d53ba54260972629f9ca6bffa69bac13cd1fed61420f7fa68b9f8bd" dependencies = [ "jobserver", "libc", + "once_cell", ] [[package]] @@ -527,9 +535,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "console" @@ -872,9 +880,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "fdeflate" @@ -893,9 +901,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.28" +version = "1.0.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" dependencies = [ "crc32fast", "miniz_oxide", @@ -1155,9 +1163,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" @@ -1339,11 +1347,11 @@ dependencies = [ [[package]] name = "image-webp" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a84a25dcae3ac487bc24ef280f9e20c79c9b1a3e5e32cbed3041d1c514aa87c" +checksum = "d730b085583c4d789dfd07fdcf185be59501666a90c97c40162b37e4fdad272d" dependencies = [ - "byteorder", + "byteorder-lite", "thiserror", ] @@ -1370,7 +1378,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.14.5", "serde", ] @@ -1432,6 +1440,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "iso8601" version = "0.6.1" @@ -1476,9 +1490,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "685a7d121ee3f65ae4fddd72b25a04bb36b6af81bc0828f7d5434c0fe60fa3a2" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" dependencies = [ "libc", ] @@ -1542,9 +1556,9 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.154" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" [[package]] name = "libfuzzer-sys" @@ -1581,9 +1595,9 @@ checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -2267,9 +2281,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" dependencies = [ "lock_api", "parking_lot_core", @@ -2277,15 +2291,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -2696,11 +2710,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.5.0", ] [[package]] @@ -2889,9 +2903,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ "bitflags 2.5.0", "errno", @@ -2914,9 +2928,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.8", @@ -2937,15 +2951,15 @@ dependencies = [ [[package]] name = 
"rustls-pki-types" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" +checksum = "beb461507cee2c2ff151784c52762cf4d9ff6a61f3e80968600ed24fa837fa54" [[package]] name = "rustls-webpki" -version = "0.102.2" +version = "0.102.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +checksum = "f3bce581c0dd41bce533ce695a1437fa16a7ab5ac3ccfa99fe1a620a7885eabf" dependencies = [ "ring 0.17.8", "rustls-pki-types", @@ -3032,18 +3046,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.198" +version = "1.0.200" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.198" +version = "1.0.200" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" +checksum = "856f046b9400cee3c8c94ed572ecdb752444c24528c035cd35882aad6f492bcb" dependencies = [ "proc-macro2", "quote", @@ -3144,9 +3158,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -3198,9 +3212,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -3303,9 +3317,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" [[package]] name = "sysinfo" -version = "0.30.10" +version = "0.30.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d7c217777061d5a2d652aea771fb9ba98b6dade657204b08c4b9604d11555b" +checksum = "87341a165d73787554941cd5ef55ad728011566fe714e987d1b976c15dbc3a83" dependencies = [ "cfg-if", "core-foundation-sys", @@ -3441,6 +3455,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "thiserror", "tracing", "tracing-subscriber", "vergen", @@ -3453,7 +3468,7 @@ dependencies = [ "async-stream", "axum", "axum-tracing-opentelemetry", - "base64 0.22.0", + "base64 0.22.1", "clap", "futures", "futures-util", @@ -3490,18 +3505,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = 
"d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" dependencies = [ "proc-macro2", "quote", @@ -3720,9 +3735,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.9" +version = "0.22.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e40bb779c5187258fd7aad0eb68cb8706a0a81fa712fbea808ab43c4b8374c4" +checksum = "d3328d4f68a705b2a4498da1d580585d39a6510f98318a2cec3018a7ec61ddef" dependencies = [ "indexmap 2.2.6", "serde", @@ -4023,9 +4038,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" [[package]] name = "unicode_categories" @@ -4047,16 +4062,16 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.6" +version = "2.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" +checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "flate2", "log", "native-tls", "once_cell", - "rustls 0.22.3", + "rustls 0.22.4", "rustls-pki-types", "rustls-webpki", "serde", @@ -4330,11 +4345,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "winapi", + "windows-sys 0.52.0", ] [[package]] @@ -4569,9 +4584,9 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winnow" -version = "0.6.6" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c976aaaa0e1f90dbb21e9587cdaf1d9679a1cde8875c0d6bd83ab96a208352" +checksum = "14b9415ee827af173ebb3f15f9083df5a122eb93572ec28741fb153356ea2578" dependencies = [ "memchr", ] diff --git a/Dockerfile b/Dockerfile index 333f7c91..0471450f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index fb820116..57a7c637 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index d0791cac..8c7478c1 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,4 +1,4 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/benchmark/src/event.rs b/benchmark/src/event.rs index 91ce8400..07482aed 100644 --- a/benchmark/src/event.rs +++ b/benchmark/src/event.rs @@ -11,7 +11,7 @@ pub(crate) enum Event { /// Key press. 
Key(event::KeyEvent), /// Terminal resize. - Resize(u16, u16), + Resize, } pub(crate) async fn terminal_event_task( @@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender) { if event::poll(Duration::from_secs(0)).expect("no events available") { match event::read().expect("unable to read event") { event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()), - event::Event::Resize(w, h) => { - event_sender.send(Event::Resize(w, h)).await.unwrap_or(()) + event::Event::Resize(_w, _h) => { + event_sender.send(Event::Resize).await.unwrap_or(()) } _ => (), } diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index 6b6fd58e..eb219423 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] } once_cell = "1.19.0" serde = { version = "1.0.188", features = ["derive"] } serde_json = "1.0.107" +thiserror = "1.0.59" tracing = "0.1.37" tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] } diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 23944f40..f2f5a99b 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -16,6 +16,7 @@ use std::thread; use std::thread::sleep; use std::time::{Duration, Instant}; use std::{fs, io}; +use thiserror::Error; use tracing_subscriber::EnvFilter; mod env_runtime; @@ -819,26 +820,26 @@ fn find_num_shards( Ok(num_shard) } -#[derive(Debug)] +#[derive(Debug, Error)] enum LauncherError { + #[error("Invalid argument: {0}")] ArgumentValidation(String), + #[error("not enough cuda devices: {0}")] NotEnoughCUDADevices(String), + #[error("Download error")] DownloadError, + #[error("Shard cannot start")] ShardCannotStart, + #[error("Shard disconnected")] ShardDisconnected, + #[error("Shard failed")] ShardFailed, + #[error("Webserver failed")] WebserverFailed, + #[error("Webserver cannot start")] WebserverCannotStart, } -impl core::fmt::Display for LauncherError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "{self:?}") - } -} - -impl std::error::Error for LauncherError {} - fn download_convert_model(args: &Args, running: Arc) -> Result<(), LauncherError> { // Enter download tracing span let _span = tracing::span!(tracing::Level::INFO, "download").entered(); diff --git a/router/grpc-metadata/src/lib.rs b/router/grpc-metadata/src/lib.rs index 7ba353fa..3068a61c 100644 --- a/router/grpc-metadata/src/lib.rs +++ b/router/grpc-metadata/src/lib.rs @@ -2,30 +2,9 @@ //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples use opentelemetry::global; -use opentelemetry::propagation::{Extractor, Injector}; +use opentelemetry::propagation::Injector; use tracing_opentelemetry::OpenTelemetrySpanExt; -/// Extract context metadata from a gRPC request's metadata -struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap); - -impl<'a> Extractor for MetadataExtractor<'a> { - /// Get a value for a key from the MetadataMap. If the value can't be converted to &str, returns None - fn get(&self, key: &str) -> Option<&str> { - self.0.get(key).and_then(|metadata| metadata.to_str().ok()) - } - - /// Collect all the keys from the MetadataMap. - fn keys(&self) -> Vec<&str> { - self.0 - .keys() - .map(|key| match key { - tonic::metadata::KeyRef::Ascii(v) => v.as_str(), - tonic::metadata::KeyRef::Binary(v) => v.as_str(), - }) - .collect::>() - } -} - /// Inject context in the metadata of a gRPC request. 
struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap); diff --git a/router/src/infer.rs b/router/src/infer.rs index da0db072..a4945512 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -1249,7 +1249,7 @@ mod tests { }, ]; - let example_chat_with_system = vec![Message { + let example_chat_with_system = [Message { role: "system".to_string(), content: Some( "You are a friendly chatbot who always responds in the style of a pirate" @@ -1373,7 +1373,7 @@ mod tests { { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); - let tmpl = env.template_from_str(&chat_template); + let tmpl = env.template_from_str(chat_template); let result = tmpl.unwrap().render(input).unwrap(); assert_eq!(result, target); } diff --git a/router/src/server.rs b/router/src/server.rs index 5dbde21e..cb55d897 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -696,7 +696,7 @@ async fn completions( model: model_id.clone(), system_fingerprint: system_fingerprint.clone(), }) - .map_or_else(|_e| Event::default(), |data| data) + .unwrap_or_else(|_e| Event::default()) }; let (header_tx, header_rx) = oneshot::channel(); @@ -1122,13 +1122,10 @@ async fn chat_completions( logprobs, stream_token.details.map(|d| d.finish_reason.to_string()), )) - .map_or_else( - |e| { - println!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - }, - |data| data, - ) + .unwrap_or_else(|e| { + println!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + }) }; let (headers, response_stream) = generate_stream_internal( diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 67982433..507ee859 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,6 +1,5 @@ [toolchain] -# Released on: 28 December, 2023 -# Branched from master on: 10 November, 2023 -# https://releases.rs/docs/1.75.0/ -channel = "1.75.0" +# Released on: 02 May, 2024 +# https://releases.rs/docs/1.78.0/ +channel = "1.78.0" components = ["rustfmt", "clippy"] From 59b3ffea1442e40b37ba92e84687ea9784212556 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 6 May 2024 22:05:43 +0800 Subject: [PATCH 06/12] update xpu docker image and use public ipex whel (#1860) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
Signed-off-by: Wang, Yi A --- Dockerfile_intel | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 8c7478c1..5bc39d64 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -36,7 +36,7 @@ RUN cargo build --release # Text Generation Inference base image for Intel -FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base +FROM intel/intel-extension-for-pytorch:2.1.30-xpu as base USER root # libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it @@ -47,7 +47,7 @@ RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1. RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list -RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build +RUN apt-get update && apt install -y intel-basekit xpu-smi # Text Generation Inference base env ENV HUGGINGFACE_HUB_CACHE=/data \ @@ -56,9 +56,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ WORKDIR /usr/src -# Build pytorch and ipex -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch +RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +RUN pip install intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl # Install server COPY proto proto @@ -72,25 +71,11 @@ RUN cd server && \ ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric -ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh -ENV CCL_CONFIGURATION=cpu_gpu_dpcpp -ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man -ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest -ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib -ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so -ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64: -ENV MKLROOT=/opt/intel/oneapi/mkl/latest -ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N ENV 
PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include ENV CCL_ZE_IPC_EXCHANGE=sockets - -RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install -RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install - # Install benchmarker COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router From fd89d9dfaef408e4482ba5e230473de5bb203a99 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 13 May 2024 12:44:30 +0200 Subject: [PATCH 07/12] Refactor layers. (#1866) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
--- server/tests/utils/test_layers.py | 2 +- .../text_generation_server/layers/__init__.py | 14 + .../{utils => layers}/awq/conversion_utils.py | 0 .../{utils => layers}/awq/quantize/qmodule.py | 0 server/text_generation_server/layers/bnb.py | 106 ++ server/text_generation_server/layers/conv.py | 41 + server/text_generation_server/layers/eetq.py | 25 + server/text_generation_server/layers/fp8.py | 43 + .../layers/gptq/__init__.py | 39 + .../{utils => layers}/gptq/custom_autotune.py | 0 .../{utils => layers}/gptq/exllama.py | 0 .../{utils => layers}/gptq/exllamav2.py | 2 + .../layers/gptq/quant_linear.py | 356 +++++ .../{utils => layers}/gptq/quantize.py | 0 .../layers/layernorm.py | 185 +++ .../text_generation_server/layers/linear.py | 153 ++ .../text_generation_server/layers/medusa.py | 186 +++ .../text_generation_server/layers/rotary.py | 419 +++++ .../layers/speculative.py | 35 + .../layers/tensor_parallel.py | 188 +++ .../models/cache_manager.py | 4 +- .../models/custom_modeling/bloom_modeling.py | 2 +- .../models/custom_modeling/clip.py | 2 +- .../custom_modeling/flash_cohere_modeling.py | 18 +- .../custom_modeling/flash_dbrx_modeling.py | 19 +- .../custom_modeling/flash_gemma_modeling.py | 6 +- .../custom_modeling/flash_llama_modeling.py | 6 +- .../custom_modeling/flash_mistral_modeling.py | 6 +- .../custom_modeling/flash_mixtral_modeling.py | 14 +- .../custom_modeling/flash_neox_modeling.py | 10 +- .../custom_modeling/flash_phi_modeling.py | 8 +- .../custom_modeling/flash_qwen2_modeling.py | 6 +- .../custom_modeling/flash_rw_modeling.py | 10 +- .../flash_santacoder_modeling.py | 10 +- .../flash_starcoder2_modeling.py | 10 +- .../models/custom_modeling/idefics2.py | 2 +- .../custom_modeling/idefics_modeling.py | 16 +- .../custom_modeling/idefics_perceiver.py | 2 +- .../models/custom_modeling/idefics_vision.py | 2 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/custom_modeling/mamba_modeling.py | 4 +- .../models/custom_modeling/mpt_modeling.py | 2 +- .../models/custom_modeling/neox_modeling.py | 2 +- .../models/custom_modeling/opt_modeling.py | 2 +- .../models/custom_modeling/phi_modeling.py | 2 +- .../models/custom_modeling/t5_modeling.py | 2 +- .../models/flash_causal_lm.py | 36 +- .../models/flash_llama.py | 4 +- .../models/flash_mistral.py | 4 +- .../models/flash_neox.py | 4 +- .../text_generation_server/models/flash_rw.py | 4 +- .../models/flash_santacoder.py | 4 +- server/text_generation_server/server.py | 2 +- .../utils/flash_attn.py | 197 ++- .../utils/gptq/quant_linear.py | 359 ----- .../utils/import_utils.py | 41 +- server/text_generation_server/utils/layers.py | 1374 ----------------- .../utils/paged_attention.py | 24 +- .../text_generation_server/utils/weights.py | 10 +- 59 files changed, 2085 insertions(+), 1941 deletions(-) create mode 100644 server/text_generation_server/layers/__init__.py rename server/text_generation_server/{utils => layers}/awq/conversion_utils.py (100%) rename server/text_generation_server/{utils => layers}/awq/quantize/qmodule.py (100%) create mode 100644 server/text_generation_server/layers/bnb.py create mode 100644 server/text_generation_server/layers/conv.py create mode 100644 server/text_generation_server/layers/eetq.py create mode 100644 server/text_generation_server/layers/fp8.py create mode 100644 server/text_generation_server/layers/gptq/__init__.py rename server/text_generation_server/{utils => layers}/gptq/custom_autotune.py (100%) rename server/text_generation_server/{utils => layers}/gptq/exllama.py (100%) rename 
server/text_generation_server/{utils => layers}/gptq/exllamav2.py (96%) create mode 100644 server/text_generation_server/layers/gptq/quant_linear.py rename server/text_generation_server/{utils => layers}/gptq/quantize.py (100%) create mode 100644 server/text_generation_server/layers/layernorm.py create mode 100644 server/text_generation_server/layers/linear.py create mode 100644 server/text_generation_server/layers/medusa.py create mode 100644 server/text_generation_server/layers/rotary.py create mode 100644 server/text_generation_server/layers/speculative.py create mode 100644 server/text_generation_server/layers/tensor_parallel.py delete mode 100644 server/text_generation_server/utils/gptq/quant_linear.py delete mode 100644 server/text_generation_server/utils/layers.py diff --git a/server/tests/utils/test_layers.py b/server/tests/utils/test_layers.py index 93a0e982..9a8da0d6 100644 --- a/server/tests/utils/test_layers.py +++ b/server/tests/utils/test_layers.py @@ -1,5 +1,5 @@ import torch -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, ) diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py new file mode 100644 index 00000000..c3a6c921 --- /dev/null +++ b/server/text_generation_server/layers/__init__.py @@ -0,0 +1,14 @@ +from text_generation_server.layers.tensor_parallel import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, +) +from text_generation_server.layers.speculative import SpeculativeHead +from text_generation_server.layers.linear import ( + get_linear, + FastLinear, +) + +# Just to add the `load` methods. +from text_generation_server.layers.layernorm import load_layer_norm +from text_generation_server.layers.conv import load_conv2d diff --git a/server/text_generation_server/utils/awq/conversion_utils.py b/server/text_generation_server/layers/awq/conversion_utils.py similarity index 100% rename from server/text_generation_server/utils/awq/conversion_utils.py rename to server/text_generation_server/layers/awq/conversion_utils.py diff --git a/server/text_generation_server/utils/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/qmodule.py similarity index 100% rename from server/text_generation_server/utils/awq/quantize/qmodule.py rename to server/text_generation_server/layers/awq/quantize/qmodule.py diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py new file mode 100644 index 00000000..d27a33a1 --- /dev/null +++ b/server/text_generation_server/layers/bnb.py @@ -0,0 +1,106 @@ +import torch +from loguru import logger +from functools import lru_cache +import bitsandbytes as bnb +from bitsandbytes.nn import Int8Params, Params4bit + + +@lru_cache(1) +def warn_deprecate_bnb(): + logger.warning( + "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce" + ) + + +class Linear8bitLt(torch.nn.Module): + def __init__( + self, + weight, + bias, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__() + assert ( + not memory_efficient_backward + ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" + self.state = bnb.MatmulLtState() + self.index = index + + # Necessary for stacked layers + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + 
self.state.memory_efficient_backward = memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + weight.data, + has_fp16_weights=has_fp16_weights, + requires_grad=has_fp16_weights, + ) + self.weight.cuda(weight.device) + self.bias = bias + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x: torch.Tensor): + self.state.is_training = self.training + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) + + if not self.state.has_fp16_weights: + if self.state.CB is not None and self.state.CxB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + return out + + +class Linear4bit(nn.Module): + def __init__(self, weight, bias, quant_type): + super().__init__() + self.weight = Params4bit( + weight.data, + requires_grad=False, + compress_statistics=True, + quant_type=quant_type, + ) + self.compute_dtype = None + self.weight.cuda(weight.device) + self.bias = bias + + def forward(self, x: torch.Tensor): + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + if getattr(self.weight, "quant_state", None) is None: + print( + "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." 
+ ) + inp_dtype = x.dtype + if self.compute_dtype is not None: + x = x.to(self.compute_dtype) + + bias = None if self.bias is None else self.bias.to(self.compute_dtype) + out = bnb.matmul_4bit( + x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state + ) + + out = out.to(inp_dtype) + + return out diff --git a/server/text_generation_server/layers/conv.py b/server/text_generation_server/layers/conv.py new file mode 100644 index 00000000..7fb18ab3 --- /dev/null +++ b/server/text_generation_server/layers/conv.py @@ -0,0 +1,41 @@ +from accelerate import init_empty_weights +import torch + + +@classmethod +def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = torch.nn.Parameter(bias) + return conv2d + + +@classmethod +def load_conv2d_no_bias( + cls, prefix, weights, in_channels, out_channels, kernel_size, stride +): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = None + return conv2d + + +torch.nn.Conv2d.load = load_conv2d +torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py new file mode 100644 index 00000000..fd22b5c6 --- /dev/null +++ b/server/text_generation_server/layers/eetq.py @@ -0,0 +1,25 @@ +import torch +from EETQ import quant_weights, w8_a16_gemm + + +class EETQLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + device = weight.device + if weight.dtype != torch.float16: + weight = weight.to(dtype=torch.float16) + weight = torch.t(weight).contiguous().cpu() + weight, scale = quant_weights(weight, torch.int8, False) + + self.weight = weight.cuda(device) + self.scale = scale.cuda(device) + self.bias = bias.cuda(device) if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output = w8_a16_gemm(input, self.weight, self.scale) + output = output + self.bias if self.bias is not None else output + return output diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py new file mode 100644 index 00000000..dd61d081 --- /dev/null +++ b/server/text_generation_server/layers/fp8.py @@ -0,0 +1,43 @@ +import torch + + +def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): + device = weight.device + # weight, scale = quant_weights(weight, torch.int8, False) + finfo = torch.finfo(qdtype) + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + qweight = qweight.to(qdtype) + scale = scale.float().reciprocal() + return qweight, scale + + +class Fp8Linear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + 
self.dtype = weight.dtype + self.qweight, self.scale = fp8_quantize(weight) + + self.bias = bias if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + qinput, scale = fp8_quantize(input) + output, _ = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + return output diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py new file mode 100644 index 00000000..1c46f493 --- /dev/null +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -0,0 +1,39 @@ +import os +import torch +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + +try: + major, _minor = torch.cuda.get_device_capability() +except Exception: + major = 1 + +HAS_EXLLAMA = False +CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm" +V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" +if os.getenv("DISABLE_EXLLAMA") == "True": + HAS_EXLLAMA = False +elif CAN_EXLLAMA: + try: + if V2: + from text_generation_server.layers.gptq.exllamav2 import ( + QuantLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "2" + else: + from text_generation_server.layers.gptq.exllama import ( + Ex4bitLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "1" + + except ImportError: + pass + +from text_generation_server.layers.gptq.quant_linear import QuantLinear diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/layers/gptq/custom_autotune.py similarity index 100% rename from server/text_generation_server/utils/gptq/custom_autotune.py rename to server/text_generation_server/layers/gptq/custom_autotune.py diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/layers/gptq/exllama.py similarity index 100% rename from server/text_generation_server/utils/gptq/exllama.py rename to server/text_generation_server/layers/gptq/exllama.py diff --git a/server/text_generation_server/utils/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py similarity index 96% rename from server/text_generation_server/utils/gptq/exllamav2.py rename to server/text_generation_server/layers/gptq/exllamav2.py index 80836a95..321ced97 100644 --- a/server/text_generation_server/utils/gptq/exllamav2.py +++ b/server/text_generation_server/layers/gptq/exllamav2.py @@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): none_tensor, temp_dq, ) + else: + RuntimeError("Cannot create handle") DEVICE = None diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/quant_linear.py new file mode 100644 index 00000000..f60758b6 --- /dev/null +++ b/server/text_generation_server/layers/gptq/quant_linear.py @@ -0,0 +1,356 @@ +import math +import numpy as np +import torch +import torch.nn as nn +from torch.cuda.amp import custom_fwd + +import triton +import triton.language as tl +from . 
import custom_autotune + + +# code based https://github.com/fpgaminer/GPTQ-triton +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=4, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. 
+ A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) & maxq # eventually avoid overflow + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) + grid = lambda META: ( + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, 
qweight, scales, qzeros, g_idx, bits, maxq): + output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = QuantLinearFunction.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/utils/gptq/quantize.py 
b/server/text_generation_server/layers/gptq/quantize.py similarity index 100% rename from server/text_generation_server/utils/gptq/quantize.py rename to server/text_generation_server/layers/gptq/quantize.py diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py new file mode 100644 index 00000000..15d24e80 --- /dev/null +++ b/server/text_generation_server/layers/layernorm.py @@ -0,0 +1,185 @@ +import torch +from torch import nn +from accelerate import init_empty_weights +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + + +# Monkey patching +@classmethod +def load_layer_norm(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = torch.nn.Parameter(bias) + return ln + + +@classmethod +def load_layer_norm_no_bias(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = None + return ln + + +torch.nn.LayerNorm.load = load_layer_norm +torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias + +if SYSTEM == "cuda": + import dropout_layer_norm + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super(FastLayerNorm, self).forward(hidden_states), residual + else: + ( + normed_hidden_states, + residual, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + self.bias, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + if residual is None: + residual = hidden_states + + return normed_hidden_states, residual + +elif SYSTEM == "rocm": + from vllm import layernorm_ops + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super().forward(hidden_states), residual + +elif SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + res_out = hidden_states + out = ipex.llm.functional.add_layer_norm( + residual, hidden_states, self.weight, self.bias, self.eps, True + ) + if residual is not None: + res_out = residual + return out, res_out + + +class FastRMSNorm(nn.Module): + def __init__(self, weight: torch.Tensor, eps: float): + super().__init__() + + self.weight = nn.Parameter(weight) + self.variance_epsilon = eps + + @classmethod + def load(cls, prefix, weights, eps=1e-6): + weight = weights.get_tensor(f"{prefix}.weight") + return cls(weight, eps) + + def forward(self, hidden_states, residual=None): + if SYSTEM == "xpu": + residual_out = hidden_states + out = ipex.llm.functional.add_rms_norm( + residual, + hidden_states, + self.weight, + None, + self.variance_epsilon, + True, + ) + if residual is not None: + residual_out = residual + return out, residual_out + elif hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt( + 
variance + self.variance_epsilon + ) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states, residual + elif SYSTEM == "cuda": + # faster post attention rms norm + ( + normed_hidden_states, + res, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + None, + None, + None, + None, + None, + 0.0, + self.variance_epsilon, + 1.0, + 0, + None, + False, + True, # Activate RMSNorm + ) + if res is None: + res = hidden_states + + return normed_hidden_states, res + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + hidden_states += residual + residual = hidden_states + + out = torch.empty_like(hidden_states) + layernorm_ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + return out, residual + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py new file mode 100644 index 00000000..d137a500 --- /dev/null +++ b/server/text_generation_server/layers/linear.py @@ -0,0 +1,153 @@ +import torch +from torch.nn import functional as F +from text_generation_server.utils.import_utils import SYSTEM + + +class FastLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight) + if bias is not None: + self.bias = torch.nn.Parameter(bias) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight, self.bias) + + +def get_linear(weight, bias, quantize): + if quantize is None: + linear = FastLinear(weight, bias) + elif quantize == "eetq": + try: + from text_generation_server.layers.eetq import EETQLinear + + linear = EETQLinear(weight, bias) + except ImportError: + raise ImportError( + "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" + ) + elif quantize == "fp8": + from text_generation_server.layers.fp8 import Fp8Linear + + linear = Fp8Linear(weight, bias) + elif quantize == "bitsandbytes": + try: + from text_generation_server.layers.bnb import ( + warn_deprecate_bnb, + Linear8bitLt, + ) + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." + ) + warn_deprecate_bnb() + linear = Linear8bitLt( + weight, + bias, + has_fp16_weights=False, + threshold=6.0, + ) + if bias is not None: + linear.bias = nn.Parameter(bias) + elif quantize == "bitsandbytes-fp4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." 
+ ) + linear = Linear4bit( + weight, + bias, + quant_type="fp4", + ) + elif quantize == "bitsandbytes-nf4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." + ) + linear = Linear4bit( + weight, + bias, + quant_type="nf4", + ) + elif quantize == "gptq": + try: + qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `gptq` compatible, loader needs to be updated." + ) + + if use_exllama: + try: + from text_generation_server.layers.gptq import ( + ExllamaQuantLinear, + ) + except ImportError: + raise NotImplementedError( + f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" + ) + + linear = ExllamaQuantLinear( + qweight, qzeros, scales, g_idx, bias, bits, groupsize + ) + else: + from text_generation_server.layers.gptq.quant_linear import QuantLinear + + linear = QuantLinear( + qweight, + qzeros, + scales, + g_idx, + bias, + bits, + groupsize, + ) + elif quantize == "awq": + try: + qweight, qzeros, scales, _, bits, groupsize, _ = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `awq` compatible, loader needs to be updated." + ) + if SYSTEM == "rocm": + raise NotImplementedError( + "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " + "to use Exllama/GPTQ kernels for AWQ inference." + ) + try: + from text_generation_server.layers.awq.quantize.qmodule import WQLinear + + linear = WQLinear( + w_bit=bits, + group_size=groupsize, + qweight=qweight, + qzeros=qzeros, + scales=scales, + bias=bias is not None, + ) + except ImportError: + raise NotImplementedError( + "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" + ) + else: + raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") + return linear diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py new file mode 100644 index 00000000..4ac86978 --- /dev/null +++ b/server/text_generation_server/layers/medusa.py @@ -0,0 +1,186 @@ +import torch +from torch import nn +from typing import Tuple, Optional +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.layers.linear import FastLinear +from text_generation_server.layers.tensor_parallel import ( + TensorParallelHead, + TensorParallelColumnLinear, +) + + +class ResBlock(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.linear = FastLinear.load( + config, prefix=f"{prefix}.linear", weights=weights, bias=True + ) + self.act = torch.nn.SiLU() + + def forward(self, x): + return x + self.act(self.linear(x)) + + +class MedusaModel(torch.nn.Module): + def __init__(self, config, medusa_config, weights): + super().__init__() + self.heads = torch.nn.ModuleList( + [ + MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) + for i in range(get_speculate()) + ] + ) + + def forward(self, x): + speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) + return speculative_logits + + +class MedusaHead(torch.nn.Module): + def __init__(self, config, medusa_config, prefix, weights): + super().__init__() 
+ self.blocks = torch.nn.ModuleList( + [ + ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) + for i in range(medusa_config["medusa_num_layers"]) + ] + ) + n = len(self.blocks) + self.out = FastLinear.load( + config, prefix=f"{prefix}.{n}", weights=weights, bias=False + ) + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.out(x) + return x + + +class MedusaHeadV1(nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.lm_head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + import json + + use_medusa = config.use_medusa + + medusa_config = str(Path(use_medusa) / "config.json") + filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + medusa = MedusaModel(config, medusa_config, weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MedusaHeadV1(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + speculative_logits = self.medusa(input) + return logits, speculative_logits + + +class MedusaHeadV2(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + from pathlib import Path + from safetensors import safe_open + import json + + use_medusa = config.use_medusa + + medusa_config = str(Path(use_medusa) / "config.json") + filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + self.n_medusa_heads = get_speculate() + + assert medusa_config["medusa_num_layers"] == 1 + self.linear = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], + dim=0, + weights=weights, + bias=True, + ) + self.process_group = weights.process_group + self.world_size = self.process_group.size() + self.rank = self.process_group.rank() + + self.act = torch.nn.SiLU() + + self.lm_head = TensorParallelHead.load(config, prefix, weights) + + def forward(self, x): + # If we have too many tokens, we skip speculative logits + if x.shape[0] > 128: + logits = self.lm_head(x) + return logits, None + + size = x.shape[-1] + block_size = (size + self.world_size - 1) // self.world_size + start = self.rank * block_size + stop = (self.rank + 1) * block_size + + x_block = x[:, start:stop] + + # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 + medusa_res = self.act(self.linear(x)).reshape( + *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] + ) + + # Apply all residual medusa heads + output = x[:, start:stop].unsqueeze(-2) + medusa_res + + # Gather medusa heads + world_output = [ + torch.empty_like(output) for _ in 
range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + + # Stack x and medusa residual x + stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) + + # Compute lm head on x + medusa residual x + logits = self.lm_head(stacked_x) + + # Finally, split logits from speculative logits + logits, speculative_logits = torch.split( + logits, [1, self.n_medusa_heads], dim=-2 + ) + # Squeeze added dimension + logits = logits.squeeze(-2) + + return logits, speculative_logits diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py new file mode 100644 index 00000000..503dd554 --- /dev/null +++ b/server/text_generation_server/layers/rotary.py @@ -0,0 +1,419 @@ +import os +import torch +from torch import nn + +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "cuda": + from flash_attn.layers.rotary import RotaryEmbedding + import rotary_emb +elif SYSTEM == "rocm": + from vllm import pos_encoding_ops + + +def _create_inv_freq(dim, base, device): + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) + ) + return inv_freq + + +def _get_rope_config(config): + if os.getenv("ROPE_SCALING", None) is not None: + rope_scaling = { + "type": os.environ["ROPE_SCALING"], + "factor": float(os.environ["ROPE_FACTOR"]), + } + return rope_scaling + return getattr(config, "rope_scaling", None) + + +class PositionRotaryEmbedding(nn.Module): + def __init__(self, inv_freq, scaling_factor): + super().__init__() + self.inv_freq = inv_freq + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.scaling_factor = scaling_factor + self.dynamic_args = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # Such controlflows may add some overhead. + if SYSTEM == "cuda": + rotary_dim = cos.shape[-1] + q1 = query[..., :rotary_dim] + q2 = query[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + + k1 = key[..., :rotary_dim] + k2 = key[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + elif SYSTEM == "rocm": + # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. + # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 + + head_size = query.shape[-1] + + # Inplace operation, updating query and key. + pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) + elif SYSTEM == "xpu": + ipex.llm.functional.rotary_embedding( + query, key, sin, cos, query.size(-1), True + ) + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
+ ) + + @classmethod + def static(cls, config, dim, base, device): + inv_freq = _create_inv_freq(dim, base, device) + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + scaling_factor = rope_scaling["factor"] + return DynamicPositionRotaryEmbedding( + dim=dim, + max_position_embeddings=config.max_position_embeddings, + base=base, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + scaling_factor = rope_scaling["factor"] + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + elif rope_scaling["type"] == "su": + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = max_position_embeddings / original_max_position_embeddings + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(original_max_position_embeddings) + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + @classmethod + def load(cls, config, prefix, weights): + # XXX: Always load this in float32 ! 
+ dtype = weights.dtype + weights.dtype = torch.float32 + inv_freq = weights.get_tensor(f"{prefix}.inv_freq") + weights.dtype = dtype + + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + scaling_factor = rope_scaling["factor"] + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + return DynamicPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=config.max_position_embeddings, + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): + """ + Return cos and sin for the asked position ids + """ + if SYSTEM == "rocm": + # For RoCm, we always use float cos/sin to avoid a cast. + # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 + # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. + dtype = torch.float32 + + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + + cos = torch.index_select(self._cos_cached, 0, position_ids) + sin = torch.index_select(self._sin_cached, 0, position_ids) + + # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
+ return cos.unsqueeze(1), sin.unsqueeze(1) + + +class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + if seqlen > self.original_max_position_embeddings: + inv_freq = self.long_inv_freq + else: + inv_freq = self.short_inv_freq + t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + newbase = self.base * ( + (self.scaling_factor * seqlen / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + self.inv_freq = _create_inv_freq( + self.dim, newbase, self.inv_freq.device + ) + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +# Inverse dim formula to find dim based on number of rotations +import math + + +def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 +): + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = 
(torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def get_mscale(scale=1): + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + +class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings, + base, + device, + scaling_factor, + *, + extrapolation_factor, + attn_factor, + beta_fast, + beta_slow, + ): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + inv_freq_extrapolation = _create_inv_freq( + self.dim, self.base, self.inv_freq.device + ) + freqs = 1.0 / inv_freq_extrapolation + inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) + low, high = find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.base, + self.max_position_embeddings, + ) + inv_freq_mask = ( + 1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + + self.inv_freq = inv_freq + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py new file mode 100644 index 00000000..663f8c2e --- /dev/null +++ b/server/text_generation_server/layers/speculative.py @@ -0,0 +1,35 @@ +import torch +from typing import Tuple, Optional +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.tensor_parallel import TensorParallelHead + + +class SpeculativeHead(torch.nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + use_medusa = config.use_medusa + if use_medusa: + lm_head = None + try: + medusa = MedusaHeadV1.load(config, prefix, weights) + except: + medusa = MedusaHeadV2(config, prefix, weights) + else: + lm_head = TensorParallelHead.load(config, prefix, weights) + medusa = None + return SpeculativeHead(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.medusa is not 
None: + return self.medusa(input) + + assert self.head is not None + logits = self.head(input) + return logits, None diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py new file mode 100644 index 00000000..34b9c51e --- /dev/null +++ b/server/text_generation_server/layers/tensor_parallel.py @@ -0,0 +1,188 @@ +import torch +from torch.nn import functional as F +from typing import List +from text_generation_server.layers.linear import get_linear, FastLinear + + +class SuperLayer(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear.forward(x) + + +class TensorParallelHead(SuperLayer): + def __init__(self, linear, process_group, should_gather: bool): + super().__init__(linear) + self.process_group = process_group + self.should_gather = should_gather + + @staticmethod + def load(config, prefix: str, weights): + if weights.process_group.size() > 1: + try: + weight = weights.get_sharded(f"{prefix}.weight", dim=0) + should_gather = True + except AssertionError: + # If the vocab size is not divisible by number of shards + # just load the entire thing. + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + else: + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + + # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) + if config.quantize in ["gptq", "awq", "eetq"]: + quantize = None + else: + quantize = config.quantize + return TensorParallelHead( + get_linear(weight, bias=None, quantize=quantize), + process_group=weights.process_group, + should_gather=should_gather, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if not self.should_gather: + return super().forward(input) + + world_size = self.process_group.size() + if len(input.shape) == 2 and isinstance(self.linear, FastLinear): + out_dim = self.linear.weight.shape[0] + + if input.shape[0] == 1: + world_out = input.new_empty(1, out_dim * world_size) + local_out = input.new_empty(1, out_dim) + gather_input = local_out + else: + world_out = input.new_empty(out_dim * world_size, input.shape[0]) + gather_input = input.new_empty(out_dim, input.shape[0]) + local_out = gather_input.T + + torch.mm(input, self.linear.weight.T, out=local_out) + + torch.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) + + if input.shape[0] == 1: + return world_out + return world_out.T + + output = super().forward(input) + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + return world_output + + +class TensorParallelColumnLinear(SuperLayer): + @classmethod + def load_gate_up(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_gate_up( + prefix, quantize=config.quantize + ) + if bias: + raise NotImplementedError("packed_gate_up only implemented without bias") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load_qkv(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) + if bias: + raise NotImplementedError("packed_qkv only implemented for 
baichuan") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + return cls.load_multi(config, [prefix], weights, bias, dim=0) + + @classmethod + def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): + weight = weights.get_multi_weights_col( + prefixes, quantize=config.quantize, dim=dim + ) + + if bias: + b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] + bias = torch.cat(b, dim=dim) + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + +class TensorParallelRowLinear(SuperLayer): + def __init__(self, linear, process_group): + super().__init__(linear) + self.process_group = process_group + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls( + get_linear(weight, bias, config.quantize), + process_group=weights.process_group, + ) + + def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: + out = super().forward(input) + if self.process_group.size() > 1 and reduce: + torch.distributed.all_reduce(out, group=self.process_group) + return out + + +class TensorParallelEmbedding(torch.nn.Module): + def __init__(self, prefix: str, weights, reduce=True): + super().__init__() + weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) + num_embeddings = weights.get_shape(f"{prefix}.weight")[0] + + process_group = weights.process_group + + world_size = process_group.size() + rank = process_group.rank() + + block_size = (num_embeddings + world_size - 1) // world_size + self.min_id = rank * block_size + self.max_id = min(num_embeddings, (rank + 1) * block_size) + self.null_idx = weight.shape[ + 0 + ] # Usually block_size, might be less in non even vocab_size. 
+ self.process_group = weights.process_group + self.reduce = reduce + + """Additional 0 entry used for masking""" + self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1))) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # default all out of bounds values to `self.null_idx` that will then be mapped to 0 + # translate for [0, self.max_id - self.min_id[ + input = torch.where( + (self.min_id > input) | (input >= self.max_id), + self.null_idx, + input - self.min_id, + ) + out = torch.nn.functional.embedding(input, self.weight) + if self.reduce and self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + return out diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py index 4c65e2dd..c7705fe8 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -2,7 +2,7 @@ import math import torch from typing import Optional, List, Tuple -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM BLOCK_SIZE: int = 16 # Will be set in warmup @@ -25,7 +25,7 @@ class CacheManager: self.repeat_slots = repeat_slots element_size = torch.tensor([], dtype=dtype).element_size() - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": x = 1 else: x = self.block_size // element_size diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py index c8f02bca..0d8a1b59 100644 --- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py +++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py @@ -32,7 +32,7 @@ from transformers.modeling_outputs import ( ) from transformers import BloomConfig, PreTrainedModel -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/clip.py b/server/text_generation_server/models/custom_modeling/clip.py index c4917733..56618bf1 100644 --- a/server/text_generation_server/models/custom_modeling/clip.py +++ b/server/text_generation_server/models/custom_modeling/clip.py @@ -15,7 +15,7 @@ from transformers.modeling_outputs import ( ) from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 56d9a966..8c423eaf 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -26,18 +26,22 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM -from text_generation_server.utils.layers import ( +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - 
PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm else: dropout_layer_norm = None @@ -52,7 +56,7 @@ class CohereRotary(PositionRotaryEmbedding): sin: torch.Tensor, ): # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": import rotary_emb q1 = query[..., ::2] @@ -64,7 +68,7 @@ class CohereRotary(PositionRotaryEmbedding): k2 = key[..., 1::2] rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import pos_encoding_ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. @@ -90,7 +94,7 @@ class CohereLayerNorm(nn.Module): self.eps = eps def forward(self, hidden_states): - if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: + if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm": hidden_states = hidden_states.reshape( -1, self.weight.shape[0], self.weight.shape[1] ) diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index d0978bef..9d652b67 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -21,21 +21,26 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any from loguru import logger -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe + from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastLayerNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) from text_generation_server.utils.log import log_once @@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights): bits, groupsize, desc_act, quant_method = weights._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act @@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights): log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
)
-            from text_generation_server.utils.awq.conversion_utils import (
+            from text_generation_server.layers.awq.conversion_utils import (
                 fast_awq_to_gptq,
             )

diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
index bd7596db..43b90bdd 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple

 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )

diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 6fa85d4e..a7969494 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple

 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )

diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
index c2445cda..3e13c26d 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple

 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )

diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 3f6c8e03..be2d6c45 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,9 +24,9 @@ import torch.distributed
 import numpy as np

 from torch import nn
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import 
SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig @@ -34,16 +34,20 @@ from typing import Optional, List, Tuple from loguru import logger from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastRMSNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class MixtralConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index ee062d3d..d45cab2e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -29,15 +29,19 @@ from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) def load_row(config, prefix: str, weights, bias: bool): diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index cfe447a7..f2efb538 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -7,15 +7,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class PhiConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 94023b33..3a6d2db5 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -6,13 +6,15 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, 
SpeculativeHead, get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index a9127d1f..52ea3ae1 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -8,15 +8,19 @@ from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) def load_row(config, prefix: str, weights, bias: bool): diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index bbb603a7..d2f6d9af 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -6,14 +6,16 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, SpeculativeHead, TensorParallelEmbedding, - FastLayerNorm, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) def load_multi_mqa( @@ -80,13 +82,13 @@ def _load_multi_mqa_gptq( g_idx = g_idx.to(device=weights.device) elif quant_method == "awq": g_idx = None - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = HAS_EXLLAMA weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index ed77af78..3e2ce4f9 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -27,15 +27,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, - FastRMSNorm, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, + FastRMSNorm, +) +from 
text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index cb2ee7db..935f049b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -29,7 +29,7 @@ from text_generation_server.models.custom_modeling.vlm import ( ) from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index ee4cdb08..ec3f900b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -47,20 +47,22 @@ from text_generation_server.models.custom_modeling.idefics_vision import ( from text_generation_server.models.custom_modeling.idefics_perceiver import ( IdeficsPerceiverResampler, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, SpeculativeHead, - PositionRotaryEmbedding, FastLinear, ) -from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils.import_utils import SYSTEM -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm -elif IS_ROCM_SYSTEM: +elif SYSTEM == "rocm": from vllm import layernorm_ops +else: + raise RuntimeError(f"Unsupported system {SYSTEM}") @dataclass @@ -373,7 +375,7 @@ class IdeficsRMSNorm(nn.Module): hidden_states = hidden_states.to(self.weight.dtype) return self.weight * hidden_states - elif IS_CUDA_SYSTEM: + elif SYSTEM == "cuda": # faster post attention rms norm unwrap = False if len(hidden_states.shape) > 2: @@ -405,7 +407,7 @@ class IdeficsRMSNorm(nn.Module): normed_hidden_states = normed_hidden_states.view(*shape) return normed_hidden_states - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. 
if residual is not None: hidden_states += residual diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py index 477d4d70..af44490b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py +++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py @@ -41,7 +41,7 @@ from typing import Optional, Tuple import torch import torch.nn as nn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py index c521dd0a..30c5997f 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_vision.py +++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -28,7 +28,7 @@ from transformers.utils import ( ModelOutput, logging, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 0d93791f..a049f756 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -27,7 +27,7 @@ from text_generation_server.models.custom_modeling.vlm import ( load_text_model, load_vision_model, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, ) diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py index c58a617f..293051c2 100644 --- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -8,12 +8,12 @@ from typing import Optional, Tuple, Any from transformers.configuration_utils import PretrainedConfig import torch.nn.functional as F -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( SpeculativeHead, TensorParallelEmbedding, - FastRMSNorm, FastLinear, ) +from text_generation_server.layers.layernorm import FastRMSNorm from einops import rearrange from causal_conv1d import causal_conv1d_fn, causal_conv1d_update diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py index 9b0f8b92..f7981bf5 100644 --- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py @@ -17,7 +17,7 @@ from transformers.modeling_outputs import ( ) from einops import rearrange from packaging import version -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index f060ec0e..fcad32fa 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ 
b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -40,7 +40,7 @@ from transformers.modeling_outputs import ( from transformers.modeling_utils import PreTrainedModel from transformers import GPTNeoXConfig from loguru import logger -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py index 7a5cf917..83d62dea 100644 --- a/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -27,7 +27,7 @@ from transformers.modeling_outputs import ( ) from transformers.modeling_utils import PreTrainedModel from transformers import OPTConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/phi_modeling.py b/server/text_generation_server/models/custom_modeling/phi_modeling.py index 1571f9fd..04b470eb 100644 --- a/server/text_generation_server/models/custom_modeling/phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/phi_modeling.py @@ -9,7 +9,7 @@ from typing import Optional, List, Tuple, Any from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py index 2773fb15..0b899fba 100644 --- a/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -38,7 +38,7 @@ from transformers.utils import ( is_torch_fx_proxy, ) from transformers import T5Config -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index a6d0204f..f567bea9 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -12,7 +12,6 @@ from dataclasses import dataclass from opentelemetry import trace from transformers import PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type, Dict - from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.speculate import get_speculate @@ -32,13 +31,14 @@ from text_generation_server.models.globals import MEM_POOL, CUDA_GRAPHS from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION -tracer = trace.get_tracer(__name__) from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, + empty_cache, + synchronize, + get_free_memory, ) +tracer = trace.get_tracer(__name__) + @dataclass class 
FlashCausalLMBatch(Batch): @@ -757,10 +757,8 @@ class FlashCausalLM(Model): def warmup(self, batch: FlashCausalLMBatch): # The warmup batch is the biggest batch we could ever receive - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.empty_cache() - elif IS_XPU_SYSTEM: - torch.xpu.empty_cache() + empty_cache() + try: cache_manager = set_cache_manager( batch.blocks, @@ -780,10 +778,7 @@ class FlashCausalLM(Model): f"You need to decrease `--max-batch-prefill-tokens`" ) from e - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.synchronize(self.device) - elif IS_XPU_SYSTEM: - torch.xpu.synchronize(self.device) + synchronize(self.device) # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) # Calculate the number of blocks that can be allocated with the free memory @@ -791,20 +786,7 @@ class FlashCausalLM(Model): cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - total_free_memory, _ = torch.cuda.mem_get_info(self.device) - total_gpu_memory = torch.cuda.get_device_properties( - self.device - ).total_memory - - free_memory = max( - 0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory - ) - elif IS_XPU_SYSTEM: - total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory - free_memory = int(total_gpu_memory * 0.5) - else: - raise NotImplementedError("FlashModel is only available on GPU") + free_memory = get_free_memory(self.device, MEMORY_FRACTION) num_blocks = ( # Leave 5% for some wiggle room diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 609a188d..8ea70713 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -18,7 +18,7 @@ from text_generation_server.utils import ( tracer = trace.get_tracer(__name__) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM class FlashLlama(FlashCausalLM): @@ -35,7 +35,7 @@ class FlashLlama(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 85e93543..48304ad8 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -33,7 +33,7 @@ tracer = trace.get_tracer(__name__) # Will be set in init SLIDING_WINDOW: Optional[int] = None SLIDING_WINDOW_BLOCKS: Optional[int] = None -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None @@ -322,7 +322,7 @@ class BaseFlashMistral(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index f82e27db..1119bdae 100644 --- 
a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -14,7 +14,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -33,7 +33,7 @@ class FlashNeoXSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index ccf38a0c..33298e1a 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -15,7 +15,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -34,7 +34,7 @@ class FlashRWSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index e66f1bf8..66698a3a 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -18,7 +18,7 @@ from text_generation_server.utils import ( Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -37,7 +37,7 @@ class FlashSantacoderSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 158966e3..9d0571a6 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -85,7 +85,7 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): # When using GPTQ, Exllama kernels need some global kernels # For which we have the finale shapes only after the model has loaded # This will allocate those buffers. 
- from text_generation_server.utils.layers import ( + from text_generation_server.layers.gptq import ( create_exllama_buffers, set_device, ) diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 583a8f91..0830656d 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -2,13 +2,8 @@ import os import torch from loguru import logger -import math -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false": raise ImportError("`USE_FLASH_ATTENTION` is false.") @@ -16,83 +11,22 @@ HAS_FLASH_ATTN = True HAS_FLASH_ATTN_V2_CUDA = False HAS_FLASH_ATTN_V2_ROCM = False -if IS_XPU_SYSTEM: +if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex -if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - if not torch.cuda.is_available(): - raise ImportError("CUDA is not available") + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") - major, minor = torch.cuda.get_device_capability() - is_sm75 = major == 7 and minor == 5 - is_sm8x = major == 8 and minor >= 0 - is_sm90 = major == 9 and minor == 0 - - HAS_FLASH_ATTN = False - HAS_FLASH_ATTN_V2_CUDA = False - HAS_FLASH_ATTN_V2_ROCM = False - try: - try: - import flash_attn_2_cuda - except ImportError: - architecture_suffix = "" - if IS_CUDA_SYSTEM: - architecture_suffix = "-cuda" - elif IS_ROCM_SYSTEM: - architecture_suffix = "-rocm" - raise ImportError( - "Flash Attention V2 is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" - ) - if not (is_sm8x or is_sm90): - raise ImportError( - f"GPU with CUDA capability {major} {minor} is not supported for " - "Flash Attention V2" - ) - HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM - HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM - except ImportError as e: - try: - import flash_attn_cuda - except ImportError: - raise ImportError( - "Flash Attention is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - "or install flash attention with `cd server && make install install-flash-attention`" - ) from e - - if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90): - raise ImportError( - f"GPU with CUDA capability {major} {minor} is not supported" - ) from e - elif IS_ROCM_SYSTEM: - for idx in range(torch.cuda.device_count()): - if "MI210" not in torch.cuda.get_device_name( - idx - ) and "MI250" not in torch.cuda.get_device_name(idx): - raise ImportError( - f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" - ) - - logger.warning(f"Unable to use Flash Attention V2: {e}") - HAS_FLASH_ATTN = True - - -def attention( - q, - k, - v, - out, - cu_seqlens, - max_s, - softmax_scale, - window_size_left=-1, -): - if window_size_left <= 0 and window_size_left != -1: - raise ValueError("`window_size_left` must be > 0 or -1") - - if IS_XPU_SYSTEM: if window_size_left != -1: raise ValueError( f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." 
@@ -114,7 +48,77 @@ def attention( None, ) - if HAS_FLASH_ATTN_V2_CUDA: + +if SYSTEM in {"cuda", "rocm"}: + if not torch.cuda.is_available(): + raise ImportError("CUDA is not available") + + major, minor = torch.cuda.get_device_capability() + is_sm75 = major == 7 and minor == 5 + is_sm8x = major == 8 and minor >= 0 + is_sm90 = major == 9 and minor == 0 + + HAS_FLASH_ATTN = False + HAS_FLASH_ATTN_V2_CUDA = False + HAS_FLASH_ATTN_V2_ROCM = False + try: + try: + import flash_attn_2_cuda + except ImportError: + architecture_suffix = f"-{SYSTEM}" + raise ImportError( + "Flash Attention V2 is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" + ) + if not (is_sm8x or is_sm90): + raise ImportError( + f"GPU with CUDA capability {major} {minor} is not supported for " + "Flash Attention V2" + ) + HAS_FLASH_ATTN_V2_CUDA = SYSTEM == "cuda" + HAS_FLASH_ATTN_V2_ROCM = SYSTEM == "rocm" + except ImportError as e: + try: + import flash_attn_cuda + except ImportError: + raise ImportError( + "Flash Attention is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + "or install flash attention with `cd server && make install install-flash-attention`" + ) from e + + if SYSTEM == "cuda" and not (is_sm75 or is_sm8x or is_sm90): + raise ImportError( + f"GPU with CUDA capability {major} {minor} is not supported" + ) from e + elif SYSTEM == "rocm": + for idx in range(torch.cuda.device_count()): + if "MI210" not in torch.cuda.get_device_name( + idx + ) and "MI250" not in torch.cuda.get_device_name(idx): + raise ImportError( + f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" + ) + + logger.warning(f"Unable to use Flash Attention V2: {e}") + HAS_FLASH_ATTN = True + + +if HAS_FLASH_ATTN_V2_CUDA: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") return flash_attn_2_cuda.varlen_fwd( q, k, @@ -136,7 +140,21 @@ def attention( False, None, ) - elif HAS_FLASH_ATTN_V2_ROCM: + +elif HAS_FLASH_ATTN_V2_ROCM: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") if window_size_left != -1: raise ValueError( f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." 
@@ -159,7 +177,19 @@ def attention( False, None, ) - elif HAS_FLASH_ATTN: + +elif HAS_FLASH_ATTN: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): if window_size_left != -1: raise NotImplementedError( "window_size_left is only available with flash attn v2" @@ -209,4 +239,5 @@ def attention( None, ) +else: raise NotImplementedError("flash attention is not installed") diff --git a/server/text_generation_server/utils/gptq/quant_linear.py b/server/text_generation_server/utils/gptq/quant_linear.py deleted file mode 100644 index a832f755..00000000 --- a/server/text_generation_server/utils/gptq/quant_linear.py +++ /dev/null @@ -1,359 +0,0 @@ -import math -import numpy as np -import torch -import torch.nn as nn -from torch.cuda.amp import custom_bwd, custom_fwd - -try: - import triton - import triton.language as tl - from . import custom_autotune - - # code based https://github.com/fpgaminer/GPTQ-triton - @custom_autotune.autotune( - configs=[ - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=2, - num_warps=8, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 8, - }, - num_stages=3, - num_warps=8, - ), - triton.Config( - { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, - }, - num_stages=2, - num_warps=4, - ), - ], - key=["M", "N", "K"], - nearest_power_of_two=True, - prune_configs_by={ - "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, - "perf_model": None, - "top_k": None, - }, - ) - @triton.jit - def matmul_248_kernel( - a_ptr, - b_ptr, - c_ptr, - scales_ptr, - zeros_ptr, - g_ptr, - M, - N, - K, - bits, - maxq, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - stride_scales, - stride_zeros, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - ): - """ - Compute the matrix multiplication C = A x B. 
- A is of shape (M, K) float16 - B is of shape (K//8, N) int32 - C is of shape (M, N) float16 - scales is of shape (G, N) float16 - zeros is of shape (G, N) float16 - g_ptr is of shape (K) int32 - """ - infearure_per_bits = 32 // bits - - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + ( - offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak - ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) - a_mask = offs_am[:, None] < M - # b_ptrs is set up such that it repeats elements along the K axis 8 times - b_ptrs = b_ptr + ( - (offs_k[:, None] // infearure_per_bits) * stride_bk - + offs_bn[None, :] * stride_bn - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) - g_ptrs = g_ptr + offs_k - # shifter is used to extract the N bits of each element in the 32-bit word from B - scales_ptrs = scales_ptr + offs_bn[None, :] - zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) - - shifter = (offs_k % infearure_per_bits) * bits - zeros_shifter = (offs_bn % infearure_per_bits) * bits - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - for k in range(0, num_pid_k): - g_idx = tl.load(g_ptrs) - - # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop - scales = tl.load( - scales_ptrs + g_idx[:, None] * stride_scales - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - zeros = tl.load( - zeros_ptrs + g_idx[:, None] * stride_zeros - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - - zeros = (zeros >> zeros_shifter[None, :]) & maxq - zeros = (zeros + 1) & maxq # eventually avoid overflow - - a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) - b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated - - # Now we need to unpack b (which is N-bit values) into 32-bit values - b = (b >> shifter[:, None]) & maxq # Extract the N-bit values - b = (b - zeros) * scales # Scale and shift - - accumulator += tl.dot(a, b) - a_ptrs += BLOCK_SIZE_K - b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk - g_ptrs += BLOCK_SIZE_K - - c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] - c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) - tl.store(c_ptrs, accumulator, mask=c_mask) - -except: - print("triton not installed.") - - -def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): - with torch.cuda.device(input.device): - output = torch.empty( - (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 - ) - grid = lambda META: ( - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) - * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), - ) - matmul_248_kernel[grid]( - input, - qweight, - output, - scales, - qzeros, - g_idx, - input.shape[0], - qweight.shape[1], - input.shape[1], - bits, - maxq, - input.stride(0), - input.stride(1), - qweight.stride(0), - qweight.stride(1), - output.stride(0), - output.stride(1), - scales.stride(0), - qzeros.stride(0), - ) - return output - - -class QuantLinearFunction(torch.autograd.Function): - @staticmethod - 
@custom_fwd(cast_inputs=torch.float16) - def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): - output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) - return output - - -class QuantLinear(nn.Module): - def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): - super().__init__() - self.register_buffer("qweight", qweight) - self.register_buffer("qzeros", qzeros) - self.register_buffer("scales", scales) - self.register_buffer("g_idx", g_idx) - if bias is not None: - self.register_buffer("bias", bias) - else: - self.bias = None - if bits not in [2, 4, 8]: - raise NotImplementedError("Only 2,4,8 bits are supported.") - self.bits = bits - self.maxq = 2**self.bits - 1 - self.groupsize = groupsize - - self.outfeatures = qweight.shape[1] - self.infeatures = qweight.shape[0] * 32 // bits - - @classmethod - def new(cls, bits, groupsize, infeatures, outfeatures, bias): - if bits not in [2, 4, 8]: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) - qzeros = torch.zeros( - (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), - dtype=torch.int32, - ) - scales = torch.zeros( - (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 - ) - g_idx = torch.tensor( - [i // groupsize for i in range(infeatures)], dtype=torch.int32 - ) - if bias: - bias = torch.zeros((outfeatures), dtype=torch.float16) - else: - bias = None - return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) - - def pack(self, linear, scales, zeros, g_idx=None): - self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx - - scales = scales.t().contiguous() - zeros = zeros.t().contiguous() - scale_zeros = zeros * scales - self.scales = scales.clone().half() - if linear.bias is not None: - self.bias = linear.bias.clone().half() - - intweight = [] - for idx in range(self.infeatures): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) - / self.scales[self.g_idx[idx]] - ).to(torch.int)[:, None] - ) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.numpy().astype(np.uint32) - qweight = np.zeros( - (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 - ) - i = 0 - row = 0 - while row < qweight.shape[0]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (32 // self.bits)): - qweight[row] |= intweight[j] << (self.bits * (j - i)) - i += 32 // self.bits - row += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qweight = qweight.astype(np.int32) - self.qweight = torch.from_numpy(qweight) - - zeros -= 1 - zeros = zeros.numpy().astype(np.uint32) - qzeros = np.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 - ) - i = 0 - col = 0 - while col < qzeros.shape[1]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (32 // self.bits)): - qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) - i += 32 // self.bits - col += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qzeros = qzeros.astype(np.int32) - self.qzeros = torch.from_numpy(qzeros) - - def forward(self, x): - out_shape = x.shape[:-1] + (self.outfeatures,) - out = QuantLinearFunction.apply( - x.reshape(-1, x.shape[-1]), - self.qweight, - self.scales, - self.qzeros, - self.g_idx, - self.bits, - self.maxq, - ) - out = out + self.bias if self.bias is not None else out - return out.reshape(out_shape) diff 
--git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index db205f4d..f54987eb 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -10,6 +10,41 @@ def is_xpu_available():
     return hasattr(torch, "xpu") and torch.xpu.is_available()


-IS_ROCM_SYSTEM = torch.version.hip is not None
-IS_CUDA_SYSTEM = torch.version.cuda is not None
-IS_XPU_SYSTEM = is_xpu_available()
+def get_cuda_free_memory(device, memory_fraction):
+    total_free_memory, _ = torch.cuda.mem_get_info(device)
+    total_gpu_memory = torch.cuda.get_device_properties(device).total_memory
+    free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory)
+    return free_memory
+
+
+def get_xpu_free_memory(device, memory_fraction):
+    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
+    free_memory = int(total_gpu_memory * 0.5)
+    return free_memory
+
+
+SYSTEM = None
+if torch.version.hip is not None:
+    SYSTEM = "rocm"
+    empty_cache = torch.cuda.empty_cache
+    synchronize = torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif torch.version.cuda is not None and torch.cuda.is_available():
+    SYSTEM = "cuda"
+    empty_cache = torch.cuda.empty_cache
+    synchronize = torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif is_xpu_available():
+    SYSTEM = "xpu"
+    empty_cache = torch.xpu.empty_cache
+    synchronize = torch.xpu.synchronize
+    get_free_memory = get_xpu_free_memory
+else:
+    SYSTEM = "cpu"
+
+    def noop(*args, **kwargs):
+        pass
+
+    empty_cache = noop
+    synchronize = noop
+    get_free_memory = noop
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
deleted file mode 100644
index 7d339fe5..00000000
--- a/server/text_generation_server/utils/layers.py
+++ /dev/null
@@ -1,1374 +0,0 @@
-import os
-import torch
-import torch.distributed
-
-from torch import nn
-from torch.nn import functional as F
-from typing import List, Tuple, Optional
-from loguru import logger
-from functools import lru_cache
-
-from text_generation_server.utils.speculate import get_speculate
-
-HAS_BITS_AND_BYTES = True
-try:
-    import bitsandbytes as bnb
-    from bitsandbytes.nn import Int8Params, Params4bit
-except ImportError:
-    HAS_BITS_AND_BYTES = False
-
-from accelerate import init_empty_weights
-
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from text_generation_server.utils.import_utils import (
-    IS_CUDA_SYSTEM,
-    IS_ROCM_SYSTEM,
-    IS_XPU_SYSTEM,
-)
-
-if IS_XPU_SYSTEM:
-    import intel_extension_for_pytorch as ipex
-
-HAS_AWQ = True
-try:
-    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
-except ImportError:
-    HAS_AWQ = False
-
-try:
-    major, _minor = torch.cuda.get_device_capability()
-except Exception:
-    major = 1
-
-HAS_EXLLAMA = False
-CAN_EXLLAMA = major >= 8 or IS_ROCM_SYSTEM
-V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-
-if os.getenv("DISABLE_EXLLAMA") == "True":
-    HAS_EXLLAMA = False
-elif CAN_EXLLAMA:
-    try:
-        if V2:
-            from text_generation_server.utils.gptq.exllamav2 import (
-                QuantLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "2"
-        else:
-            from text_generation_server.utils.gptq.exllama import (
-                Ex4bitLinear as ExllamaQuantLinear,
-                create_exllama_buffers,
-                set_device,
-            )
-
-            HAS_EXLLAMA = "1"
-
-    except ImportError:
-        pass
-
-HAS_EETQ = False
-try:
-    from EETQ import quant_weights, w8_a16_gemm
-
-    HAS_EETQ = True
-except 
ImportError: - pass - - -# Monkey patching -@classmethod -def load_layer_norm(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = nn.Parameter(bias) - return ln - - -@classmethod -def load_layer_norm_no_bias(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = None - return ln - - -@classmethod -def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = nn.Parameter(bias) - return conv2d - - -@classmethod -def load_conv2d_no_bias( - cls, prefix, weights, in_channels, out_channels, kernel_size, stride -): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = None - return conv2d - - -torch.nn.Conv2d.load = load_conv2d -torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias -torch.nn.LayerNorm.load = load_layer_norm -torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias - - -class FastLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.weight = nn.Parameter(weight) - if bias is not None: - self.bias = nn.Parameter(bias) - else: - self.bias = None - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_tensor(f"{prefix}.weight") - if bias: - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls(weight, bias) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, self.weight, self.bias) - - -class EETQLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - device = weight.device - if weight.dtype != torch.float16: - weight = weight.to(dtype=torch.float16) - weight = torch.t(weight).contiguous().cpu() - weight, scale = quant_weights(weight, torch.int8, False) - - self.weight = weight.cuda(device) - self.scale = scale.cuda(device) - self.bias = bias.cuda(device) if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - output = w8_a16_gemm(input, self.weight, self.scale) - output = output + self.bias if self.bias is not None else output - return output - - -def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): - device = weight.device - # weight, scale = quant_weights(weight, torch.int8, False) - finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = finfo.max / weight.abs().max().clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() - return qweight, scale - - -class 
Fp8Linear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.dtype = weight.dtype - self.qweight, self.scale = fp8_quantize(weight) - - self.bias = bias if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - qinput, scale = fp8_quantize(input) - output, _ = torch._scaled_mm( - qinput, - self.qweight.t(), - out_dtype=self.dtype, - scale_a=scale, - scale_b=self.scale, - bias=self.bias, - ) - return output - - -class Linear8bitLt(nn.Module): - def __init__( - self, - weight, - bias, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - ): - super().__init__() - assert ( - not memory_efficient_backward - ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" - self.state = bnb.MatmulLtState() - self.index = index - - # Necessary for stacked layers - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params( - weight.data, - has_fp16_weights=has_fp16_weights, - requires_grad=has_fp16_weights, - ) - self.weight.cuda(weight.device) - self.bias = bias - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x: torch.Tensor): - self.state.is_training = self.training - if self.weight.CB is not None: - self.init_8bit_state() - - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - self.bias.data = self.bias.data.to(x.dtype) - - out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) - - if not self.state.has_fp16_weights: - if self.state.CB is not None and self.state.CxB is not None: - # we converted 8-bit row major to turing/ampere format in the first inference pass - # we no longer need the row-major weight - del self.state.CB - self.weight.data = self.state.CxB - return out - - -class Linear4bit(nn.Module): - def __init__(self, weight, bias, quant_type): - super().__init__() - self.weight = Params4bit( - weight.data, - requires_grad=False, - compress_statistics=True, - quant_type=quant_type, - ) - self.compute_dtype = None - self.weight.cuda(weight.device) - self.bias = bias - - def forward(self, x: torch.Tensor): - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - self.bias.data = self.bias.data.to(x.dtype) - - if getattr(self.weight, "quant_state", None) is None: - print( - "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." 
- ) - inp_dtype = x.dtype - if self.compute_dtype is not None: - x = x.to(self.compute_dtype) - - bias = None if self.bias is None else self.bias.to(self.compute_dtype) - out = bnb.matmul_4bit( - x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state - ) - - out = out.to(inp_dtype) - - return out - - -@lru_cache(1) -def warn_deprecate_bnb(): - logger.warning( - "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce" - ) - - -def get_linear(weight, bias, quantize): - if quantize is None: - linear = FastLinear(weight, bias) - elif quantize == "eetq": - if HAS_EETQ: - linear = EETQLinear(weight, bias) - else: - raise ImportError( - "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" - ) - elif quantize == "fp8": - linear = Fp8Linear(weight, bias) - elif quantize == "bitsandbytes": - warn_deprecate_bnb() - linear = Linear8bitLt( - weight, - bias, - has_fp16_weights=False, - threshold=6.0, - ) - if bias is not None: - linear.bias = nn.Parameter(bias) - elif quantize == "bitsandbytes-fp4": - linear = Linear4bit( - weight, - bias, - quant_type="fp4", - ) - elif quantize == "bitsandbytes-nf4": - linear = Linear4bit( - weight, - bias, - quant_type="nf4", - ) - elif quantize == "gptq": - try: - qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `gptq` compatible, loader needs to be updated." - ) - - if use_exllama: - linear = ExllamaQuantLinear( - qweight, qzeros, scales, g_idx, bias, bits, groupsize - ) - else: - linear = QuantLinear( - qweight, - qzeros, - scales, - g_idx, - bias, - bits, - groupsize, - ) - elif quantize == "awq": - try: - qweight, qzeros, scales, _, bits, groupsize, _ = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `awq` compatible, loader needs to be updated." - ) - if IS_ROCM_SYSTEM: - raise NotImplementedError( - "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " - "to use Exllama/GPTQ kernels for AWQ inference." 
- ) - if not HAS_AWQ: - raise NotImplementedError( - "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" - ) - linear = WQLinear( - w_bit=bits, - group_size=groupsize, - qweight=qweight, - qzeros=qzeros, - scales=scales, - bias=bias is not None, - ) - else: - raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") - return linear - - -class SuperLayer(nn.Module): - def __init__(self, linear): - super().__init__() - self.linear = linear - - def forward(self, x): - return self.linear.forward(x) - - -class ResBlock(torch.nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.linear = FastLinear.load( - config, prefix=f"{prefix}.linear", weights=weights, bias=True - ) - self.act = torch.nn.SiLU() - - def forward(self, x): - return x + self.act(self.linear(x)) - - -class MedusaModel(torch.nn.Module): - def __init__(self, config, medusa_config, weights): - super().__init__() - self.heads = torch.nn.ModuleList( - [ - MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) - for i in range(get_speculate()) - ] - ) - - def forward(self, x): - speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) - return speculative_logits - - -class MedusaHead(torch.nn.Module): - def __init__(self, config, medusa_config, prefix, weights): - super().__init__() - self.blocks = torch.nn.ModuleList( - [ - ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) - for i in range(medusa_config["medusa_num_layers"]) - ] - ) - n = len(self.blocks) - self.out = FastLinear.load( - config, prefix=f"{prefix}.{n}", weights=weights, bias=False - ) - - def forward(self, x): - for block in self.blocks: - x = block(x) - x = self.out(x) - return x - - -class MedusaHeadV1(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.lm_head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - medusa = MedusaModel(config, medusa_config, weights) - lm_head = TensorParallelHead.load(config, prefix, weights) - return MedusaHeadV1(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - logits = self.lm_head(input) - # If we have too many tokens, we skip speculative logits - if input.shape[0] > 128: - return logits, None - - speculative_logits = self.medusa(input) - return logits, speculative_logits - - -class MedusaHeadV2(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with 
safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - self.n_medusa_heads = get_speculate() - - assert medusa_config["medusa_num_layers"] == 1 - self.linear = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], - dim=0, - weights=weights, - bias=True, - ) - self.process_group = weights.process_group - self.world_size = self.process_group.size() - self.rank = self.process_group.rank() - - self.act = torch.nn.SiLU() - - self.lm_head = TensorParallelHead.load(config, prefix, weights) - - def forward(self, x): - # If we have too many tokens, we skip speculative logits - if x.shape[0] > 128: - logits = self.lm_head(x) - return logits, None - - size = x.shape[-1] - block_size = (size + self.world_size - 1) // self.world_size - start = self.rank * block_size - stop = (self.rank + 1) * block_size - - x_block = x[:, start:stop] - - # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 - medusa_res = self.act(self.linear(x)).reshape( - *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] - ) - - # Apply all residual medusa heads - output = x[:, start:stop].unsqueeze(-2) + medusa_res - - # Gather medusa heads - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - - # Stack x and medusa residual x - stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) - - # Compute lm head on x + medusa residual x - logits = self.lm_head(stacked_x) - - # Finally, split logits from speculative logits - logits, speculative_logits = torch.split( - logits, [1, self.n_medusa_heads], dim=-2 - ) - # Squeeze added dimension - logits = logits.squeeze(-2) - - return logits, speculative_logits - - -class SpeculativeHead(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - use_medusa = config.use_medusa - if use_medusa: - lm_head = None - try: - medusa = MedusaHeadV1.load(config, prefix, weights) - except: - medusa = MedusaHeadV2(config, prefix, weights) - else: - lm_head = TensorParallelHead.load(config, prefix, weights) - medusa = None - return SpeculativeHead(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.medusa is not None: - return self.medusa(input) - - assert self.head is not None - logits = self.head(input) - return logits, None - - -class TensorParallelHead(SuperLayer): - def __init__(self, linear, process_group, should_gather: bool): - super().__init__(linear) - self.process_group = process_group - self.should_gather = should_gather - - @staticmethod - def load(config, prefix: str, weights): - if weights.process_group.size() > 1: - try: - weight = weights.get_sharded(f"{prefix}.weight", dim=0) - should_gather = True - except AssertionError: - # If the vocab size is not divisible by number of shards - # just load the entire thing. 
- weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - else: - weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - - # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) - if config.quantize in ["gptq", "awq", "eetq"]: - quantize = None - else: - quantize = config.quantize - return TensorParallelHead( - get_linear(weight, bias=None, quantize=quantize), - process_group=weights.process_group, - should_gather=should_gather, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if not self.should_gather: - return super().forward(input) - - world_size = self.process_group.size() - if len(input.shape) == 2 and isinstance(self.linear, FastLinear): - out_dim = self.linear.weight.shape[0] - - if input.shape[0] == 1: - world_out = input.new_empty(1, out_dim * world_size) - local_out = input.new_empty(1, out_dim) - gather_input = local_out - else: - world_out = input.new_empty(out_dim * world_size, input.shape[0]) - gather_input = input.new_empty(out_dim, input.shape[0]) - local_out = gather_input.T - - torch.mm(input, self.linear.weight.T, out=local_out) - - torch.distributed.all_gather_into_tensor( - world_out, gather_input, group=self.process_group - ) - - if input.shape[0] == 1: - return world_out - return world_out.T - - output = super().forward(input) - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - return world_output - - -class TensorParallelColumnLinear(SuperLayer): - @classmethod - def load_gate_up(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_gate_up( - prefix, quantize=config.quantize - ) - if bias: - raise NotImplementedError("packed_gate_up only implemented without bias") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load_qkv(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) - if bias: - raise NotImplementedError("packed_qkv only implemented for baichuan") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - return cls.load_multi(config, [prefix], weights, bias, dim=0) - - @classmethod - def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): - weight = weights.get_multi_weights_col( - prefixes, quantize=config.quantize, dim=dim - ) - - if bias: - b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] - bias = torch.cat(b, dim=dim) - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - -class TensorParallelRowLinear(SuperLayer): - def __init__(self, linear, process_group): - super().__init__(linear) - self.process_group = process_group - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) - - if bias and weights.process_group.rank() == 0: - # Rank is only on the first rank process - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls( - get_linear(weight, bias, config.quantize), - process_group=weights.process_group, - ) - - def 
forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: - out = super().forward(input) - if self.process_group.size() > 1 and reduce: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -class TensorParallelEmbedding(nn.Module): - def __init__(self, prefix: str, weights, reduce=True): - super().__init__() - weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) - num_embeddings = weights.get_shape(f"{prefix}.weight")[0] - - process_group = weights.process_group - - world_size = process_group.size() - rank = process_group.rank() - - block_size = (num_embeddings + world_size - 1) // world_size - self.min_id = rank * block_size - self.max_id = min(num_embeddings, (rank + 1) * block_size) - self.null_idx = weight.shape[ - 0 - ] # Usually block_size, might be less in non even vocab_size. - self.process_group = weights.process_group - self.reduce = reduce - - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - (self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = torch.nn.functional.embedding(input, self.weight) - if self.reduce and self.process_group.size() > 1: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -try: - if IS_CUDA_SYSTEM: - import dropout_layer_norm - elif IS_ROCM_SYSTEM: - from vllm import layernorm_ops - else: - dropout_layer_norm = None - - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - res_out = hidden_states - out = ipex.llm.functional.add_layer_norm( - residual, hidden_states, self.weight, self.bias, self.eps, True - ) - if residual is not None: - res_out = residual - return out, res_out - elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - - class FastRMSNorm(nn.Module): - def __init__(self, weight: torch.Tensor, eps: float): - super().__init__() - - self.weight = nn.Parameter(weight) - self.variance_epsilon = eps - - @classmethod - def load(cls, prefix, weights, eps=1e-6): - weight = weights.get_tensor(f"{prefix}.weight") - return cls(weight, eps) - - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - residual_out = hidden_states - out = ipex.llm.functional.add_rms_norm( - residual, - hidden_states, - self.weight, - None, - self.variance_epsilon, - True, - ) - if residual is not None: - residual_out = residual - return out, residual_out - elif hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt( - variance + self.variance_epsilon - ) - - # convert into half-precision if necessary - if 
self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states, residual - elif IS_CUDA_SYSTEM: - # faster post attention rms norm - ( - normed_hidden_states, - res, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - None, - None, - None, - None, - None, - 0.0, - self.variance_epsilon, - 1.0, - 0, - None, - False, - True, # Activate RMSNorm - ) - if res is None: - res = hidden_states - - return normed_hidden_states, res - elif IS_ROCM_SYSTEM: - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. - if residual is not None: - hidden_states += residual - residual = hidden_states - - out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - return out, residual - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." - ) - -except ImportError: - pass - -try: - if IS_CUDA_SYSTEM: - from flash_attn.layers.rotary import RotaryEmbedding - import rotary_emb - elif IS_ROCM_SYSTEM: - from vllm import pos_encoding_ops - - def _create_inv_freq(dim, base, device): - inv_freq = 1.0 / ( - base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) - ) - return inv_freq - - def _get_rope_config(config): - if os.getenv("ROPE_SCALING", None) is not None: - rope_scaling = { - "type": os.environ["ROPE_SCALING"], - "factor": float(os.environ["ROPE_FACTOR"]), - } - return rope_scaling - return getattr(config, "rope_scaling", None) - - class PositionRotaryEmbedding(nn.Module): - def __init__(self, inv_freq, scaling_factor): - super().__init__() - self.inv_freq = inv_freq - self._seq_len_cached = 0 - self._cos_cached = None - self._sin_cached = None - self._cos_k_cached = None - self._sin_k_cached = None - self.scaling_factor = scaling_factor - self.dynamic_args = None - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ): - # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: - rotary_dim = cos.shape[-1] - q1 = query[..., :rotary_dim] - q2 = query[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., :rotary_dim] - k2 = key[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. - pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) - elif IS_XPU_SYSTEM: - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), True - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) - - @classmethod - def static(cls, config, dim, base, device): - inv_freq = _create_inv_freq(dim, base, device) - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - scaling_factor = rope_scaling["factor"] - return DynamicPositionRotaryEmbedding( - dim=dim, - max_position_embeddings=config.max_position_embeddings, - base=base, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - scaling_factor = rope_scaling["factor"] - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - elif rope_scaling["type"] == "su": - short_factor = torch.tensor( - rope_scaling["short_factor"], dtype=torch.float32, device=device - ) - short_inv_freq = 1.0 / ( - short_factor - * base - ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.float32) - / dim - ) - ) - long_factor = torch.tensor( - rope_scaling["long_factor"], dtype=torch.float32, device=device - ) - long_inv_freq = 1.0 / ( - long_factor - * base - ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.float32) - / dim - ) - ) - - original_max_position_embeddings = ( - config.original_max_position_embeddings - ) - max_position_embeddings = config.max_position_embeddings - if max_position_embeddings <= original_max_position_embeddings: - scaling_factor = 1.0 - else: - scale = ( - max_position_embeddings / original_max_position_embeddings - ) - scaling_factor = math.sqrt( - 1 - + math.log(scale) - / math.log(original_max_position_embeddings) - ) - - return SuRotaryEmbedding( - short_inv_freq=short_inv_freq, - long_inv_freq=long_inv_freq, - scaling_factor=scaling_factor, - original_max_position_embeddings=original_max_position_embeddings, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - @classmethod - def load(cls, config, prefix, weights): - # XXX: Always load this in float32 ! 
- dtype = weights.dtype - weights.dtype = torch.float32 - inv_freq = weights.get_tensor(f"{prefix}.inv_freq") - weights.dtype = dtype - - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - return DynamicPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=config.max_position_embeddings, - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - def get_cos_sin( - self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype - ): - """ - Return cos and sin for the asked position ids - """ - if IS_ROCM_SYSTEM: - # For RoCm, we always use float cos/sin to avoid a cast. - # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 - # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. - dtype = torch.float32 - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) - - cos = torch.index_select(self._cos_cached, 0, position_ids) - sin = torch.index_select(self._sin_cached, 0, position_ids) - - # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
- return cos.unsqueeze(1), sin.unsqueeze(1) - - class SuRotaryEmbedding(PositionRotaryEmbedding): - def __init__( - self, - short_inv_freq, - long_inv_freq, - scaling_factor, - original_max_position_embeddings, - ): - super(PositionRotaryEmbedding, self).__init__() - self.short_inv_freq = short_inv_freq - self.long_inv_freq = long_inv_freq - self.scaling_factor = scaling_factor - self.original_max_position_embeddings = original_max_position_embeddings - self._seq_len_cached = 0 - self._cos_cached = None - self._sin_cached = None - self._cos_k_cached = None - self._sin_k_cached = None - self.dynamic_args = None - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - if seqlen > self.original_max_position_embeddings: - inv_freq = self.long_inv_freq - else: - inv_freq = self.short_inv_freq - t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - newbase = self.base * ( - (self.scaling_factor * seqlen / self.max_position_embeddings) - - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - self.inv_freq = _create_inv_freq( - self.dim, newbase, self.inv_freq.device - ) - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - # Inverse dim formula to find dim based on number of rotations - import math - - def find_correction_dim( - num_rotations, dim, base=10000, max_position_embeddings=2048 - ): - return ( - dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi)) - ) / (2 * math.log(base)) - - # Find dim range bounds based on rotations - def find_correction_range( - low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 - ): - low = math.floor( - find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - def linear_ramp_mask(min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = 
(torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def get_mscale(scale=1): - if scale <= 1: - return 1.0 - return 0.1 * math.log(scale) + 1.0 - - class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings, - base, - device, - scaling_factor, - *, - extrapolation_factor, - attn_factor, - beta_fast, - beta_slow, - ): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - inv_freq_extrapolation = _create_inv_freq( - self.dim, self.base, self.inv_freq.device - ) - freqs = 1.0 / inv_freq_extrapolation - inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) - low, high = find_correction_range( - self.beta_fast, - self.beta_slow, - self.dim, - self.base, - self.max_position_embeddings, - ) - inv_freq_mask = ( - 1 - - linear_ramp_mask(low, high, self.dim // 2).float().to(device) - ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation - inv_freq = ( - inv_freq_interpolation * (1 - inv_freq_mask) - + inv_freq_extrapolation * inv_freq_mask - ) - - self.inv_freq = inv_freq - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) - self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) - -except ImportError: - pass diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py index 62c0c893..1b31f7e7 100644 --- a/server/text_generation_server/utils/paged_attention.py +++ b/server/text_generation_server/utils/paged_attention.py @@ -1,13 +1,9 @@ import torch -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM _PARTITION_SIZE = 512 -if IS_XPU_SYSTEM: +if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex @@ -18,17 +14,17 @@ def reshape_and_cache( value_cache: torch.Tensor, slots: torch.Tensor, ): - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import cache_ops cache_ops.reshape_and_cache( key, value, key_cache, value_cache, slots, "auto", 1.0 ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import cache_ops cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots) - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": 
ipex.llm.modules.PagedAttention.reshape_and_cache( key, value, key_cache, value_cache, slots ) @@ -68,7 +64,7 @@ def attention( block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": query = query.contiguous() return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, @@ -91,7 +87,7 @@ def attention( # to parallelize. use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import ops ops.paged_attention_v1( @@ -109,7 +105,7 @@ def attention( "auto", 1.0, ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import attention_ops attention_ops.paged_attention_v1( @@ -143,7 +139,7 @@ def attention( ) max_logits = torch.empty_like(exp_sums) - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import ops ops.paged_attention_v2( @@ -164,7 +160,7 @@ def attention( "auto", 1.0, ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import attention_ops attention_ops.paged_attention_v2( diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index da7aed1a..6af7d3fb 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -171,7 +171,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -227,7 +227,7 @@ class Weights: bits, groupsize, desc_act, quant_method = self._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act @@ -242,7 +242,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -321,7 +321,7 @@ class Weights: # it would require to reorder input activations that are split unto several GPUs use_exllama = False - from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA, CAN_EXLLAMA if use_exllama: if not HAS_EXLLAMA: @@ -348,7 +348,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) From d348d2b28feeaab7a8f6bd44cc8924b6b4ae7868 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 13 May 2024 13:46:29 +0200 Subject: [PATCH 08/12] Granite support? (#1882) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. 
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/poetry.lock | 502 +++++++++--------- server/requirements_cuda.txt | 8 +- server/requirements_rocm.txt | 8 +- .../custom_modeling/flash_llama_modeling.py | 57 +- 4 files changed, 276 insertions(+), 299 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index 3a554af0..684713f8 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -642,69 +642,61 @@ testing = ["protobuf (>=4.21.9)"] [[package]] name = "grpcio" -version = "1.62.2" +version = "1.63.0" description = "HTTP/2-based RPC framework" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "grpcio-1.62.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:66344ea741124c38588a664237ac2fa16dfd226964cca23ddc96bd4accccbde5"}, - {file = "grpcio-1.62.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:5dab7ac2c1e7cb6179c6bfad6b63174851102cbe0682294e6b1d6f0981ad7138"}, - {file = "grpcio-1.62.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:3ad00f3f0718894749d5a8bb0fa125a7980a2f49523731a9b1fabf2b3522aa43"}, - {file = "grpcio-1.62.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e72ddfee62430ea80133d2cbe788e0d06b12f865765cb24a40009668bd8ea05"}, - {file = "grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53d3a59a10af4c2558a8e563aed9f256259d2992ae0d3037817b2155f0341de1"}, - {file = "grpcio-1.62.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1511a303f8074f67af4119275b4f954189e8313541da7b88b1b3a71425cdb10"}, - {file = "grpcio-1.62.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b94d41b7412ef149743fbc3178e59d95228a7064c5ab4760ae82b562bdffb199"}, - {file = "grpcio-1.62.2-cp310-cp310-win32.whl", hash = "sha256:a75af2fc7cb1fe25785be7bed1ab18cef959a376cdae7c6870184307614caa3f"}, - {file = "grpcio-1.62.2-cp310-cp310-win_amd64.whl", hash = "sha256:80407bc007754f108dc2061e37480238b0dc1952c855e86a4fc283501ee6bb5d"}, - {file = "grpcio-1.62.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:c1624aa686d4b36790ed1c2e2306cc3498778dffaf7b8dd47066cf819028c3ad"}, - {file = "grpcio-1.62.2-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:1c1bb80299bdef33309dff03932264636450c8fdb142ea39f47e06a7153d3063"}, - {file = "grpcio-1.62.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:db068bbc9b1fa16479a82e1ecf172a93874540cb84be69f0b9cb9b7ac3c82670"}, - {file = "grpcio-1.62.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc8a308780edbe2c4913d6a49dbdb5befacdf72d489a368566be44cadaef1a"}, - {file = "grpcio-1.62.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0695ae31a89f1a8fc8256050329a91a9995b549a88619263a594ca31b76d756"}, - {file = "grpcio-1.62.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:88b4f9ee77191dcdd8810241e89340a12cbe050be3e0d5f2f091c15571cd3930"}, - {file = "grpcio-1.62.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a0204532aa2f1afd467024b02b4069246320405bc18abec7babab03e2644e75"}, - {file = 
"grpcio-1.62.2-cp311-cp311-win32.whl", hash = "sha256:6e784f60e575a0de554ef9251cbc2ceb8790914fe324f11e28450047f264ee6f"}, - {file = "grpcio-1.62.2-cp311-cp311-win_amd64.whl", hash = "sha256:112eaa7865dd9e6d7c0556c8b04ae3c3a2dc35d62ad3373ab7f6a562d8199200"}, - {file = "grpcio-1.62.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:65034473fc09628a02fb85f26e73885cf1ed39ebd9cf270247b38689ff5942c5"}, - {file = "grpcio-1.62.2-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:d2c1771d0ee3cf72d69bb5e82c6a82f27fbd504c8c782575eddb7839729fbaad"}, - {file = "grpcio-1.62.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:3abe6838196da518863b5d549938ce3159d809218936851b395b09cad9b5d64a"}, - {file = "grpcio-1.62.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5ffeb269f10cedb4f33142b89a061acda9f672fd1357331dbfd043422c94e9e"}, - {file = "grpcio-1.62.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404d3b4b6b142b99ba1cff0b2177d26b623101ea2ce51c25ef6e53d9d0d87bcc"}, - {file = "grpcio-1.62.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:262cda97efdabb20853d3b5a4c546a535347c14b64c017f628ca0cc7fa780cc6"}, - {file = "grpcio-1.62.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:17708db5b11b966373e21519c4c73e5a750555f02fde82276ea2a267077c68ad"}, - {file = "grpcio-1.62.2-cp312-cp312-win32.whl", hash = "sha256:b7ec9e2f8ffc8436f6b642a10019fc513722858f295f7efc28de135d336ac189"}, - {file = "grpcio-1.62.2-cp312-cp312-win_amd64.whl", hash = "sha256:aa787b83a3cd5e482e5c79be030e2b4a122ecc6c5c6c4c42a023a2b581fdf17b"}, - {file = "grpcio-1.62.2-cp37-cp37m-linux_armv7l.whl", hash = "sha256:cfd23ad29bfa13fd4188433b0e250f84ec2c8ba66b14a9877e8bce05b524cf54"}, - {file = "grpcio-1.62.2-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:af15e9efa4d776dfcecd1d083f3ccfb04f876d613e90ef8432432efbeeac689d"}, - {file = "grpcio-1.62.2-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:f4aa94361bb5141a45ca9187464ae81a92a2a135ce2800b2203134f7a1a1d479"}, - {file = "grpcio-1.62.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82af3613a219512a28ee5c95578eb38d44dd03bca02fd918aa05603c41018051"}, - {file = "grpcio-1.62.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55ddaf53474e8caeb29eb03e3202f9d827ad3110475a21245f3c7712022882a9"}, - {file = "grpcio-1.62.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79b518c56dddeec79e5500a53d8a4db90da995dfe1738c3ac57fe46348be049"}, - {file = "grpcio-1.62.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a5eb4844e5e60bf2c446ef38c5b40d7752c6effdee882f716eb57ae87255d20a"}, - {file = "grpcio-1.62.2-cp37-cp37m-win_amd64.whl", hash = "sha256:aaae70364a2d1fb238afd6cc9fcb10442b66e397fd559d3f0968d28cc3ac929c"}, - {file = "grpcio-1.62.2-cp38-cp38-linux_armv7l.whl", hash = "sha256:1bcfe5070e4406f489e39325b76caeadab28c32bf9252d3ae960c79935a4cc36"}, - {file = "grpcio-1.62.2-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:da6a7b6b938c15fa0f0568e482efaae9c3af31963eec2da4ff13a6d8ec2888e4"}, - {file = "grpcio-1.62.2-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:41955b641c34db7d84db8d306937b72bc4968eef1c401bea73081a8d6c3d8033"}, - {file = "grpcio-1.62.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c772f225483905f675cb36a025969eef9712f4698364ecd3a63093760deea1bc"}, - {file = "grpcio-1.62.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07ce1f775d37ca18c7a141300e5b71539690efa1f51fe17f812ca85b5e73262f"}, - {file = 
"grpcio-1.62.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:26f415f40f4a93579fd648f48dca1c13dfacdfd0290f4a30f9b9aeb745026811"}, - {file = "grpcio-1.62.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:db707e3685ff16fc1eccad68527d072ac8bdd2e390f6daa97bc394ea7de4acea"}, - {file = "grpcio-1.62.2-cp38-cp38-win32.whl", hash = "sha256:589ea8e75de5fd6df387de53af6c9189c5231e212b9aa306b6b0d4f07520fbb9"}, - {file = "grpcio-1.62.2-cp38-cp38-win_amd64.whl", hash = "sha256:3c3ed41f4d7a3aabf0f01ecc70d6b5d00ce1800d4af652a549de3f7cf35c4abd"}, - {file = "grpcio-1.62.2-cp39-cp39-linux_armv7l.whl", hash = "sha256:162ccf61499c893831b8437120600290a99c0bc1ce7b51f2c8d21ec87ff6af8b"}, - {file = "grpcio-1.62.2-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:f27246d7da7d7e3bd8612f63785a7b0c39a244cf14b8dd9dd2f2fab939f2d7f1"}, - {file = "grpcio-1.62.2-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:2507006c8a478f19e99b6fe36a2464696b89d40d88f34e4b709abe57e1337467"}, - {file = "grpcio-1.62.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a90ac47a8ce934e2c8d71e317d2f9e7e6aaceb2d199de940ce2c2eb611b8c0f4"}, - {file = "grpcio-1.62.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99701979bcaaa7de8d5f60476487c5df8f27483624f1f7e300ff4669ee44d1f2"}, - {file = "grpcio-1.62.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:af7dc3f7a44f10863b1b0ecab4078f0a00f561aae1edbd01fd03ad4dcf61c9e9"}, - {file = "grpcio-1.62.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fa63245271920786f4cb44dcada4983a3516be8f470924528cf658731864c14b"}, - {file = "grpcio-1.62.2-cp39-cp39-win32.whl", hash = "sha256:c6ad9c39704256ed91a1cffc1379d63f7d0278d6a0bad06b0330f5d30291e3a3"}, - {file = "grpcio-1.62.2-cp39-cp39-win_amd64.whl", hash = "sha256:16da954692fd61aa4941fbeda405a756cd96b97b5d95ca58a92547bba2c1624f"}, - {file = "grpcio-1.62.2.tar.gz", hash = "sha256:c77618071d96b7a8be2c10701a98537823b9c65ba256c0b9067e0594cdbd954d"}, + {file = "grpcio-1.63.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2e93aca840c29d4ab5db93f94ed0a0ca899e241f2e8aec6334ab3575dc46125c"}, + {file = "grpcio-1.63.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:91b73d3f1340fefa1e1716c8c1ec9930c676d6b10a3513ab6c26004cb02d8b3f"}, + {file = "grpcio-1.63.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:b3afbd9d6827fa6f475a4f91db55e441113f6d3eb9b7ebb8fb806e5bb6d6bd0d"}, + {file = "grpcio-1.63.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f3f6883ce54a7a5f47db43289a0a4c776487912de1a0e2cc83fdaec9685cc9f"}, + {file = "grpcio-1.63.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf8dae9cc0412cb86c8de5a8f3be395c5119a370f3ce2e69c8b7d46bb9872c8d"}, + {file = "grpcio-1.63.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:08e1559fd3b3b4468486b26b0af64a3904a8dbc78d8d936af9c1cf9636eb3e8b"}, + {file = "grpcio-1.63.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5c039ef01516039fa39da8a8a43a95b64e288f79f42a17e6c2904a02a319b357"}, + {file = "grpcio-1.63.0-cp310-cp310-win32.whl", hash = "sha256:ad2ac8903b2eae071055a927ef74121ed52d69468e91d9bcbd028bd0e554be6d"}, + {file = "grpcio-1.63.0-cp310-cp310-win_amd64.whl", hash = "sha256:b2e44f59316716532a993ca2966636df6fbe7be4ab6f099de6815570ebe4383a"}, + {file = "grpcio-1.63.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:f28f8b2db7b86c77916829d64ab21ff49a9d8289ea1564a2b2a3a8ed9ffcccd3"}, + {file = "grpcio-1.63.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:65bf975639a1f93bee63ca60d2e4951f1b543f498d581869922910a476ead2f5"}, + {file = "grpcio-1.63.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:b5194775fec7dc3dbd6a935102bb156cd2c35efe1685b0a46c67b927c74f0cfb"}, + {file = "grpcio-1.63.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4cbb2100ee46d024c45920d16e888ee5d3cf47c66e316210bc236d5bebc42b3"}, + {file = "grpcio-1.63.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ff737cf29b5b801619f10e59b581869e32f400159e8b12d7a97e7e3bdeee6a2"}, + {file = "grpcio-1.63.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cd1e68776262dd44dedd7381b1a0ad09d9930ffb405f737d64f505eb7f77d6c7"}, + {file = "grpcio-1.63.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:93f45f27f516548e23e4ec3fbab21b060416007dbe768a111fc4611464cc773f"}, + {file = "grpcio-1.63.0-cp311-cp311-win32.whl", hash = "sha256:878b1d88d0137df60e6b09b74cdb73db123f9579232c8456f53e9abc4f62eb3c"}, + {file = "grpcio-1.63.0-cp311-cp311-win_amd64.whl", hash = "sha256:756fed02dacd24e8f488f295a913f250b56b98fb793f41d5b2de6c44fb762434"}, + {file = "grpcio-1.63.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:93a46794cc96c3a674cdfb59ef9ce84d46185fe9421baf2268ccb556f8f81f57"}, + {file = "grpcio-1.63.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:a7b19dfc74d0be7032ca1eda0ed545e582ee46cd65c162f9e9fc6b26ef827dc6"}, + {file = "grpcio-1.63.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:8064d986d3a64ba21e498b9a376cbc5d6ab2e8ab0e288d39f266f0fca169b90d"}, + {file = "grpcio-1.63.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:219bb1848cd2c90348c79ed0a6b0ea51866bc7e72fa6e205e459fedab5770172"}, + {file = "grpcio-1.63.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2d60cd1d58817bc5985fae6168d8b5655c4981d448d0f5b6194bbcc038090d2"}, + {file = "grpcio-1.63.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9e350cb096e5c67832e9b6e018cf8a0d2a53b2a958f6251615173165269a91b0"}, + {file = "grpcio-1.63.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:56cdf96ff82e3cc90dbe8bac260352993f23e8e256e063c327b6cf9c88daf7a9"}, + {file = "grpcio-1.63.0-cp312-cp312-win32.whl", hash = "sha256:3a6d1f9ea965e750db7b4ee6f9fdef5fdf135abe8a249e75d84b0a3e0c668a1b"}, + {file = "grpcio-1.63.0-cp312-cp312-win_amd64.whl", hash = "sha256:d2497769895bb03efe3187fb1888fc20e98a5f18b3d14b606167dacda5789434"}, + {file = "grpcio-1.63.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:fdf348ae69c6ff484402cfdb14e18c1b0054ac2420079d575c53a60b9b2853ae"}, + {file = "grpcio-1.63.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a3abfe0b0f6798dedd2e9e92e881d9acd0fdb62ae27dcbbfa7654a57e24060c0"}, + {file = "grpcio-1.63.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:6ef0ad92873672a2a3767cb827b64741c363ebaa27e7f21659e4e31f4d750280"}, + {file = "grpcio-1.63.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b416252ac5588d9dfb8a30a191451adbf534e9ce5f56bb02cd193f12d8845b7f"}, + {file = "grpcio-1.63.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3b77eaefc74d7eb861d3ffbdf91b50a1bb1639514ebe764c47773b833fa2d91"}, + {file = "grpcio-1.63.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b005292369d9c1f80bf70c1db1c17c6c342da7576f1c689e8eee4fb0c256af85"}, + {file = "grpcio-1.63.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cdcda1156dcc41e042d1e899ba1f5c2e9f3cd7625b3d6ebfa619806a4c1aadda"}, + {file = "grpcio-1.63.0-cp38-cp38-win32.whl", hash = 
"sha256:01799e8649f9e94ba7db1aeb3452188048b0019dc37696b0f5ce212c87c560c3"}, + {file = "grpcio-1.63.0-cp38-cp38-win_amd64.whl", hash = "sha256:6a1a3642d76f887aa4009d92f71eb37809abceb3b7b5a1eec9c554a246f20e3a"}, + {file = "grpcio-1.63.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:75f701ff645858a2b16bc8c9fc68af215a8bb2d5a9b647448129de6e85d52bce"}, + {file = "grpcio-1.63.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cacdef0348a08e475a721967f48206a2254a1b26ee7637638d9e081761a5ba86"}, + {file = "grpcio-1.63.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:0697563d1d84d6985e40ec5ec596ff41b52abb3fd91ec240e8cb44a63b895094"}, + {file = "grpcio-1.63.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6426e1fb92d006e47476d42b8f240c1d916a6d4423c5258ccc5b105e43438f61"}, + {file = "grpcio-1.63.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e48cee31bc5f5a31fb2f3b573764bd563aaa5472342860edcc7039525b53e46a"}, + {file = "grpcio-1.63.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:50344663068041b34a992c19c600236e7abb42d6ec32567916b87b4c8b8833b3"}, + {file = "grpcio-1.63.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:259e11932230d70ef24a21b9fb5bb947eb4703f57865a404054400ee92f42f5d"}, + {file = "grpcio-1.63.0-cp39-cp39-win32.whl", hash = "sha256:a44624aad77bf8ca198c55af811fd28f2b3eaf0a50ec5b57b06c034416ef2d0a"}, + {file = "grpcio-1.63.0-cp39-cp39-win_amd64.whl", hash = "sha256:166e5c460e5d7d4656ff9e63b13e1f6029b122104c1633d5f37eaea348d7356d"}, + {file = "grpcio-1.63.0.tar.gz", hash = "sha256:f3023e14805c61bc439fb40ca545ac3d5740ce66120a678a3c6c2c55b70343d1"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.62.2)"] +protobuf = ["grpcio-tools (>=1.63.0)"] [[package]] name = "grpcio-reflection" @@ -959,13 +951,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.3" +version = "3.1.4" description = "A very fast and expressive template engine." 
optional = true python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, - {file = "Jinja2-3.1.3.tar.gz", hash = "sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -976,24 +968,24 @@ i18n = ["Babel (>=2.7)"] [[package]] name = "joblib" -version = "1.4.0" +version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = true python-versions = ">=3.8" files = [ - {file = "joblib-1.4.0-py3-none-any.whl", hash = "sha256:42942470d4062537be4d54c83511186da1fc14ba354961a2114da91efa9a4ed7"}, - {file = "joblib-1.4.0.tar.gz", hash = "sha256:1eb0dc091919cd384490de890cb5dfd538410a6d4b3b54eef09fb8c50b409b1c"}, + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, ] [[package]] name = "jsonschema" -version = "4.21.1" +version = "4.22.0" description = "An implementation of JSON Schema validation for Python" optional = true python-versions = ">=3.8" files = [ - {file = "jsonschema-4.21.1-py3-none-any.whl", hash = "sha256:7996507afae316306f9e2290407761157c6f78002dcf7419acb99822143d1c6f"}, - {file = "jsonschema-4.21.1.tar.gz", hash = "sha256:85727c00279f5fa6bedbe6238d2aa6403bedd8b4864ab11207d07df3cc1b2ee5"}, + {file = "jsonschema-4.22.0-py3-none-any.whl", hash = "sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"}, + {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"}, ] [package.dependencies] @@ -2307,13 +2299,13 @@ files = [ [[package]] name = "referencing" -version = "0.35.0" +version = "0.35.1" description = "JSON Referencing + Python" optional = true python-versions = ">=3.8" files = [ - {file = "referencing-0.35.0-py3-none-any.whl", hash = "sha256:8080727b30e364e5783152903672df9b6b091c926a146a759080b62ca3126cd6"}, - {file = "referencing-0.35.0.tar.gz", hash = "sha256:191e936b0c696d0af17ad7430a3dc68e88bc11be6514f4757dc890f04ab05889"}, + {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, + {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, ] [package.dependencies] @@ -2322,90 +2314,90 @@ rpds-py = ">=0.7.0" [[package]] name = "regex" -version = "2024.4.28" +version = "2024.5.10" description = "Alternative regular expression module, to replace re." 
optional = false python-versions = ">=3.8" files = [ - {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, - {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, - {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, - {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, - {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, - {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, - {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, - {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, - {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, - {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, - {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, - {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, - {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, - {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, - {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, - {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, - {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, - {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, - {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, - {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, - {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, - {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, - {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, - {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, - {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, - {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, - {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, - {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, - {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, - {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, - {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, - {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, - {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, - {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, - {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, - {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, - {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, - {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, - {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, - {file = 
"regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, - {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, - {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, - {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, - {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, - {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, - {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, - {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, - {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, - {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, - {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, - {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, - {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, - {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, - {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, - {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, - {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, - {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, - {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, - {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, - {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, - {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, - {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, - {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, - {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, + {file = "regex-2024.5.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:eda3dd46df535da787ffb9036b5140f941ecb91701717df91c9daf64cabef953"}, + {file = "regex-2024.5.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d5bd666466c8f00a06886ce1397ba8b12371c1f1c6d1bef11013e9e0a1464a8"}, + {file = "regex-2024.5.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32e5f3b8e32918bfbdd12eca62e49ab3031125c454b507127ad6ecbd86e62fca"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:534efd2653ebc4f26fc0e47234e53bf0cb4715bb61f98c64d2774a278b58c846"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:193b7c6834a06f722f0ce1ba685efe80881de7c3de31415513862f601097648c"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:160ba087232c5c6e2a1e7ad08bd3a3f49b58c815be0504d8c8aacfb064491cd8"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951be1eae7b47660412dc4938777a975ebc41936d64e28081bf2e584b47ec246"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8a0f0ab5453e409586b11ebe91c672040bc804ca98d03a656825f7890cbdf88"}, + {file = "regex-2024.5.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9e6d4d6ae1827b2f8c7200aaf7501c37cf3f3896c86a6aaf2566448397c823dd"}, + {file = "regex-2024.5.10-cp310-cp310-musllinux_1_1_aarch64.whl", hash 
= "sha256:161a206c8f3511e2f5fafc9142a2cc25d7fe9a1ec5ad9b4ad2496a7c33e1c5d2"}, + {file = "regex-2024.5.10-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:44b3267cea873684af022822195298501568ed44d542f9a2d9bebc0212e99069"}, + {file = "regex-2024.5.10-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:560278c9975694e1f0bc50da187abf2cdc1e4890739ea33df2bc4a85eeef143e"}, + {file = "regex-2024.5.10-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:70364a097437dd0a90b31cd77f09f7387ad9ac60ef57590971f43b7fca3082a5"}, + {file = "regex-2024.5.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:42be5de7cc8c1edac55db92d82b68dc8e683b204d6f5414c5a51997a323d7081"}, + {file = "regex-2024.5.10-cp310-cp310-win32.whl", hash = "sha256:9a8625849387b9d558d528e263ecc9c0fbde86cfa5c2f0eef43fff480ae24d71"}, + {file = "regex-2024.5.10-cp310-cp310-win_amd64.whl", hash = "sha256:903350bf44d7e4116b4d5898b30b15755d61dcd3161e3413a49c7db76f0bee5a"}, + {file = "regex-2024.5.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bf9596cba92ce7b1fd32c7b07c6e3212c7eed0edc271757e48bfcd2b54646452"}, + {file = "regex-2024.5.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:45cc13d398b6359a7708986386f72bd156ae781c3e83a68a6d4cee5af04b1ce9"}, + {file = "regex-2024.5.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ad45f3bccfcb00868f2871dce02a755529838d2b86163ab8a246115e80cfb7d6"}, + {file = "regex-2024.5.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33d19f0cde6838c81acffff25c7708e4adc7dd02896c9ec25c3939b1500a1778"}, + {file = "regex-2024.5.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0a9f89d7db5ef6bdf53e5cc8e6199a493d0f1374b3171796b464a74ebe8e508a"}, + {file = "regex-2024.5.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c6c71cf92b09e5faa72ea2c68aa1f61c9ce11cb66fdc5069d712f4392ddfd00"}, + {file = "regex-2024.5.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7467ad8b0eac0b28e52679e972b9b234b3de0ea5cee12eb50091d2b68145fe36"}, + {file = "regex-2024.5.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bc0db93ad039fc2fe32ccd3dd0e0e70c4f3d6e37ae83f0a487e1aba939bd2fbd"}, + {file = "regex-2024.5.10-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fa9335674d7c819674467c7b46154196c51efbaf5f5715187fd366814ba3fa39"}, + {file = "regex-2024.5.10-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7dda3091838206969c2b286f9832dff41e2da545b99d1cfaea9ebd8584d02708"}, + {file = "regex-2024.5.10-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:504b5116e2bd1821efd815941edff7535e93372a098e156bb9dffde30264e798"}, + {file = "regex-2024.5.10-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:91b53dea84415e8115506cc62e441a2b54537359c63d856d73cb1abe05af4c9a"}, + {file = "regex-2024.5.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1a3903128f9e17a500618e80c68165c78c741ebb17dd1a0b44575f92c3c68b02"}, + {file = "regex-2024.5.10-cp311-cp311-win32.whl", hash = "sha256:236cace6c1903effd647ed46ce6dd5d76d54985fc36dafc5256032886736c85d"}, + {file = "regex-2024.5.10-cp311-cp311-win_amd64.whl", hash = "sha256:12446827f43c7881decf2c126762e11425de5eb93b3b0d8b581344c16db7047a"}, + {file = "regex-2024.5.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14905ed75c7a6edf423eb46c213ed3f4507c38115f1ed3c00f4ec9eafba50e58"}, + {file = "regex-2024.5.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:4fad420b14ae1970a1f322e8ae84a1d9d89375eb71e1b504060ab2d1bfe68f3c"}, + {file = "regex-2024.5.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c46a76a599fcbf95f98755275c5527304cc4f1bb69919434c1e15544d7052910"}, + {file = "regex-2024.5.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0faecb6d5779753a6066a3c7a0471a8d29fe25d9981ca9e552d6d1b8f8b6a594"}, + {file = "regex-2024.5.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aab65121229c2ecdf4a31b793d99a6a0501225bd39b616e653c87b219ed34a49"}, + {file = "regex-2024.5.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50e7e96a527488334379e05755b210b7da4a60fc5d6481938c1fa053e0c92184"}, + {file = "regex-2024.5.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba034c8db4b264ef1601eb33cd23d87c5013b8fb48b8161debe2e5d3bd9156b0"}, + {file = "regex-2024.5.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:031219782d97550c2098d9a68ce9e9eaefe67d2d81d8ff84c8354f9c009e720c"}, + {file = "regex-2024.5.10-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62b5f7910b639f3c1d122d408421317c351e213ca39c964ad4121f27916631c6"}, + {file = "regex-2024.5.10-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cd832bd9b6120d6074f39bdfbb3c80e416848b07ac72910f1c7f03131a6debc3"}, + {file = "regex-2024.5.10-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:e91b1976358e17197157b405cab408a5f4e33310cda211c49fc6da7cffd0b2f0"}, + {file = "regex-2024.5.10-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:571452362d552de508c37191b6abbbb660028b8b418e2d68c20779e0bc8eaaa8"}, + {file = "regex-2024.5.10-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5253dcb0bfda7214523de58b002eb0090cb530d7c55993ce5f6d17faf953ece7"}, + {file = "regex-2024.5.10-cp312-cp312-win32.whl", hash = "sha256:2f30a5ab8902f93930dc6f627c4dd5da2703333287081c85cace0fc6e21c25af"}, + {file = "regex-2024.5.10-cp312-cp312-win_amd64.whl", hash = "sha256:3799e36d60a35162bb35b2246d8bb012192b7437dff807ef79c14e7352706306"}, + {file = "regex-2024.5.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:bbdc5db2c98ac2bf1971ffa1410c87ca7a15800415f788971e8ba8520fc0fda9"}, + {file = "regex-2024.5.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6ccdeef4584450b6f0bddd5135354908dacad95425fcb629fe36d13e48b60f32"}, + {file = "regex-2024.5.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:29d839829209f3c53f004e1de8c3113efce6d98029f044fa5cfee666253ee7e6"}, + {file = "regex-2024.5.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0709ba544cf50bd5cb843df4b8bb6701bae2b70a8e88da9add8386cbca5c1385"}, + {file = "regex-2024.5.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:972b49f2fe1047b9249c958ec4fa1bdd2cf8ce305dc19d27546d5a38e57732d8"}, + {file = "regex-2024.5.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9cdbb1998da94607d5eec02566b9586f0e70d6438abf1b690261aac0edda7ab6"}, + {file = "regex-2024.5.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf7c8ee4861d9ef5b1120abb75846828c811f932d63311596ad25fa168053e00"}, + {file = "regex-2024.5.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d35d4cc9270944e95f9c88af757b0c9fc43f396917e143a5756608462c5223b"}, + {file = 
"regex-2024.5.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8722f72068b3e1156a4b2e1afde6810f1fc67155a9fa30a4b9d5b4bc46f18fb0"}, + {file = "regex-2024.5.10-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:696639a73ca78a380acfaa0a1f6dd8220616a99074c05bba9ba8bb916914b224"}, + {file = "regex-2024.5.10-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea057306ab469130167014b662643cfaed84651c792948891d003cf0039223a5"}, + {file = "regex-2024.5.10-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:b43b78f9386d3d932a6ce5af4b45f393d2e93693ee18dc4800d30a8909df700e"}, + {file = "regex-2024.5.10-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c43395a3b7cc9862801a65c6994678484f186ce13c929abab44fb8a9e473a55a"}, + {file = "regex-2024.5.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0bc94873ba11e34837bffd7e5006703abeffc4514e2f482022f46ce05bd25e67"}, + {file = "regex-2024.5.10-cp38-cp38-win32.whl", hash = "sha256:1118ba9def608250250f4b3e3f48c62f4562ba16ca58ede491b6e7554bfa09ff"}, + {file = "regex-2024.5.10-cp38-cp38-win_amd64.whl", hash = "sha256:458d68d34fb74b906709735c927c029e62f7d06437a98af1b5b6258025223210"}, + {file = "regex-2024.5.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:15e593386ec6331e0ab4ac0795b7593f02ab2f4b30a698beb89fbdc34f92386a"}, + {file = "regex-2024.5.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ca23b41355ba95929e9505ee04e55495726aa2282003ed9b012d86f857d3e49b"}, + {file = "regex-2024.5.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2c8982ee19ccecabbaeac1ba687bfef085a6352a8c64f821ce2f43e6d76a9298"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7117cb7d6ac7f2e985f3d18aa8a1728864097da1a677ffa69e970ca215baebf1"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b66421f8878a0c82fc0c272a43e2121c8d4c67cb37429b764f0d5ad70b82993b"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224a9269f133564109ce668213ef3cb32bc72ccf040b0b51c72a50e569e9dc9e"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab98016541543692a37905871a5ffca59b16e08aacc3d7d10a27297b443f572d"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51d27844763c273a122e08a3e86e7aefa54ee09fb672d96a645ece0454d8425e"}, + {file = "regex-2024.5.10-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:853cc36e756ff673bf984e9044ccc8fad60b95a748915dddeab9488aea974c73"}, + {file = "regex-2024.5.10-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e7eaf9df15423d07b6050fb91f86c66307171b95ea53e2d87a7993b6d02c7f7"}, + {file = "regex-2024.5.10-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:169fd0acd7a259f58f417e492e93d0e15fc87592cd1e971c8c533ad5703b5830"}, + {file = "regex-2024.5.10-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:334b79ce9c08f26b4659a53f42892793948a613c46f1b583e985fd5a6bf1c149"}, + {file = "regex-2024.5.10-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:f03b1dbd4d9596dd84955bb40f7d885204d6aac0d56a919bb1e0ff2fb7e1735a"}, + {file = "regex-2024.5.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfa6d61a76c77610ba9274c1a90a453062bdf6887858afbe214d18ad41cf6bde"}, + {file = "regex-2024.5.10-cp39-cp39-win32.whl", hash = 
"sha256:249fbcee0a277c32a3ce36d8e36d50c27c968fdf969e0fbe342658d4e010fbc8"}, + {file = "regex-2024.5.10-cp39-cp39-win_amd64.whl", hash = "sha256:0ce56a923f4c01d7568811bfdffe156268c0a7aae8a94c902b92fe34c4bde785"}, + {file = "regex-2024.5.10.tar.gz", hash = "sha256:304e7e2418146ae4d0ef0e9ffa28f881f7874b45b4994cc2279b21b6e7ae50c8"}, ] [[package]] @@ -2431,110 +2423,110 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rpds-py" -version = "0.18.0" +version = "0.18.1" description = "Python bindings to Rust's persistent data structures (rpds)" optional = true python-versions = ">=3.8" files = [ - {file = "rpds_py-0.18.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:5b4e7d8d6c9b2e8ee2d55c90b59c707ca59bc30058269b3db7b1f8df5763557e"}, - {file = "rpds_py-0.18.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c463ed05f9dfb9baebef68048aed8dcdc94411e4bf3d33a39ba97e271624f8f7"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01e36a39af54a30f28b73096dd39b6802eddd04c90dbe161c1b8dbe22353189f"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d62dec4976954a23d7f91f2f4530852b0c7608116c257833922a896101336c51"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dd18772815d5f008fa03d2b9a681ae38d5ae9f0e599f7dda233c439fcaa00d40"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:923d39efa3cfb7279a0327e337a7958bff00cc447fd07a25cddb0a1cc9a6d2da"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39514da80f971362f9267c600b6d459bfbbc549cffc2cef8e47474fddc9b45b1"}, - {file = "rpds_py-0.18.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a34d557a42aa28bd5c48a023c570219ba2593bcbbb8dc1b98d8cf5d529ab1434"}, - {file = "rpds_py-0.18.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:93df1de2f7f7239dc9cc5a4a12408ee1598725036bd2dedadc14d94525192fc3"}, - {file = "rpds_py-0.18.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:34b18ba135c687f4dac449aa5157d36e2cbb7c03cbea4ddbd88604e076aa836e"}, - {file = "rpds_py-0.18.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c0b5dcf9193625afd8ecc92312d6ed78781c46ecbf39af9ad4681fc9f464af88"}, - {file = "rpds_py-0.18.0-cp310-none-win32.whl", hash = "sha256:c4325ff0442a12113a6379af66978c3fe562f846763287ef66bdc1d57925d337"}, - {file = "rpds_py-0.18.0-cp310-none-win_amd64.whl", hash = "sha256:7223a2a5fe0d217e60a60cdae28d6949140dde9c3bcc714063c5b463065e3d66"}, - {file = "rpds_py-0.18.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3a96e0c6a41dcdba3a0a581bbf6c44bb863f27c541547fb4b9711fd8cf0ffad4"}, - {file = "rpds_py-0.18.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30f43887bbae0d49113cbaab729a112251a940e9b274536613097ab8b4899cf6"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fcb25daa9219b4cf3a0ab24b0eb9a5cc8949ed4dc72acb8fa16b7e1681aa3c58"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d68c93e381010662ab873fea609bf6c0f428b6d0bb00f2c6939782e0818d37bf"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b34b7aa8b261c1dbf7720b5d6f01f38243e9b9daf7e6b8bc1fd4657000062f2c"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2e6d75ab12b0bbab7215e5d40f1e5b738aa539598db27ef83b2ec46747df90e1"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b8612cd233543a3781bc659c731b9d607de65890085098986dfd573fc2befe5"}, - {file = "rpds_py-0.18.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aec493917dd45e3c69d00a8874e7cbed844efd935595ef78a0f25f14312e33c6"}, - {file = "rpds_py-0.18.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:661d25cbffaf8cc42e971dd570d87cb29a665f49f4abe1f9e76be9a5182c4688"}, - {file = "rpds_py-0.18.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1df3659d26f539ac74fb3b0c481cdf9d725386e3552c6fa2974f4d33d78e544b"}, - {file = "rpds_py-0.18.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1ce3ba137ed54f83e56fb983a5859a27d43a40188ba798993812fed73c70836"}, - {file = "rpds_py-0.18.0-cp311-none-win32.whl", hash = "sha256:69e64831e22a6b377772e7fb337533c365085b31619005802a79242fee620bc1"}, - {file = "rpds_py-0.18.0-cp311-none-win_amd64.whl", hash = "sha256:998e33ad22dc7ec7e030b3df701c43630b5bc0d8fbc2267653577e3fec279afa"}, - {file = "rpds_py-0.18.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7f2facbd386dd60cbbf1a794181e6aa0bd429bd78bfdf775436020172e2a23f0"}, - {file = "rpds_py-0.18.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1d9a5be316c15ffb2b3c405c4ff14448c36b4435be062a7f578ccd8b01f0c4d8"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd5bf1af8efe569654bbef5a3e0a56eca45f87cfcffab31dd8dde70da5982475"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5417558f6887e9b6b65b4527232553c139b57ec42c64570569b155262ac0754f"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:56a737287efecafc16f6d067c2ea0117abadcd078d58721f967952db329a3e5c"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8f03bccbd8586e9dd37219bce4d4e0d3ab492e6b3b533e973fa08a112cb2ffc9"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4457a94da0d5c53dc4b3e4de1158bdab077db23c53232f37a3cb7afdb053a4e3"}, - {file = "rpds_py-0.18.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0ab39c1ba9023914297dd88ec3b3b3c3f33671baeb6acf82ad7ce883f6e8e157"}, - {file = "rpds_py-0.18.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9d54553c1136b50fd12cc17e5b11ad07374c316df307e4cfd6441bea5fb68496"}, - {file = "rpds_py-0.18.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0af039631b6de0397ab2ba16eaf2872e9f8fca391b44d3d8cac317860a700a3f"}, - {file = "rpds_py-0.18.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:84ffab12db93b5f6bad84c712c92060a2d321b35c3c9960b43d08d0f639d60d7"}, - {file = "rpds_py-0.18.0-cp312-none-win32.whl", hash = "sha256:685537e07897f173abcf67258bee3c05c374fa6fff89d4c7e42fb391b0605e98"}, - {file = "rpds_py-0.18.0-cp312-none-win_amd64.whl", hash = "sha256:e003b002ec72c8d5a3e3da2989c7d6065b47d9eaa70cd8808b5384fbb970f4ec"}, - {file = "rpds_py-0.18.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:08f9ad53c3f31dfb4baa00da22f1e862900f45908383c062c27628754af2e88e"}, - {file = "rpds_py-0.18.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0013fe6b46aa496a6749c77e00a3eb07952832ad6166bd481c74bda0dcb6d58"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e32a92116d4f2a80b629778280103d2a510a5b3f6314ceccd6e38006b5e92dcb"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e541ec6f2ec456934fd279a3120f856cd0aedd209fc3852eca563f81738f6861"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bed88b9a458e354014d662d47e7a5baafd7ff81c780fd91584a10d6ec842cb73"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2644e47de560eb7bd55c20fc59f6daa04682655c58d08185a9b95c1970fa1e07"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e8916ae4c720529e18afa0b879473049e95949bf97042e938530e072fde061d"}, - {file = "rpds_py-0.18.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:465a3eb5659338cf2a9243e50ad9b2296fa15061736d6e26240e713522b6235c"}, - {file = "rpds_py-0.18.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ea7d4a99f3b38c37eac212dbd6ec42b7a5ec51e2c74b5d3223e43c811609e65f"}, - {file = "rpds_py-0.18.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:67071a6171e92b6da534b8ae326505f7c18022c6f19072a81dcf40db2638767c"}, - {file = "rpds_py-0.18.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:41ef53e7c58aa4ef281da975f62c258950f54b76ec8e45941e93a3d1d8580594"}, - {file = "rpds_py-0.18.0-cp38-none-win32.whl", hash = "sha256:fdea4952db2793c4ad0bdccd27c1d8fdd1423a92f04598bc39425bcc2b8ee46e"}, - {file = "rpds_py-0.18.0-cp38-none-win_amd64.whl", hash = "sha256:7cd863afe7336c62ec78d7d1349a2f34c007a3cc6c2369d667c65aeec412a5b1"}, - {file = "rpds_py-0.18.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5307def11a35f5ae4581a0b658b0af8178c65c530e94893345bebf41cc139d33"}, - {file = "rpds_py-0.18.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:77f195baa60a54ef9d2de16fbbfd3ff8b04edc0c0140a761b56c267ac11aa467"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39f5441553f1c2aed4de4377178ad8ff8f9d733723d6c66d983d75341de265ab"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a00312dea9310d4cb7dbd7787e722d2e86a95c2db92fbd7d0155f97127bcb40"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f2fc11e8fe034ee3c34d316d0ad8808f45bc3b9ce5857ff29d513f3ff2923a1"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:586f8204935b9ec884500498ccc91aa869fc652c40c093bd9e1471fbcc25c022"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddc2f4dfd396c7bfa18e6ce371cba60e4cf9d2e5cdb71376aa2da264605b60b9"}, - {file = "rpds_py-0.18.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ddcba87675b6d509139d1b521e0c8250e967e63b5909a7e8f8944d0f90ff36f"}, - {file = "rpds_py-0.18.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7bd339195d84439cbe5771546fe8a4e8a7a045417d8f9de9a368c434e42a721e"}, - {file = "rpds_py-0.18.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:d7c36232a90d4755b720fbd76739d8891732b18cf240a9c645d75f00639a9024"}, - {file = "rpds_py-0.18.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6b0817e34942b2ca527b0e9298373e7cc75f429e8da2055607f4931fded23e20"}, - {file = "rpds_py-0.18.0-cp39-none-win32.whl", hash = "sha256:99f70b740dc04d09e6b2699b675874367885217a2e9f782bdf5395632ac663b7"}, - {file = "rpds_py-0.18.0-cp39-none-win_amd64.whl", hash = 
"sha256:6ef687afab047554a2d366e112dd187b62d261d49eb79b77e386f94644363294"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ad36cfb355e24f1bd37cac88c112cd7730873f20fb0bdaf8ba59eedf8216079f"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:36b3ee798c58ace201289024b52788161e1ea133e4ac93fba7d49da5fec0ef9e"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8a2f084546cc59ea99fda8e070be2fd140c3092dc11524a71aa8f0f3d5a55ca"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e4461d0f003a0aa9be2bdd1b798a041f177189c1a0f7619fe8c95ad08d9a45d7"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8db715ebe3bb7d86d77ac1826f7d67ec11a70dbd2376b7cc214199360517b641"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:793968759cd0d96cac1e367afd70c235867831983f876a53389ad869b043c948"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e6a3af5a75363d2c9a48b07cb27c4ea542938b1a2e93b15a503cdfa8490795"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ef0befbb5d79cf32d0266f5cff01545602344eda89480e1dd88aca964260b18"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d4acf42190d449d5e89654d5c1ed3a4f17925eec71f05e2a41414689cda02d1"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:a5f446dd5055667aabaee78487f2b5ab72e244f9bc0b2ffebfeec79051679984"}, - {file = "rpds_py-0.18.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9dbbeb27f4e70bfd9eec1be5477517365afe05a9b2c441a0b21929ee61048124"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:22806714311a69fd0af9b35b7be97c18a0fc2826e6827dbb3a8c94eac6cf7eeb"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b34ae4636dfc4e76a438ab826a0d1eed2589ca7d9a1b2d5bb546978ac6485461"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c8370641f1a7f0e0669ddccca22f1da893cef7628396431eb445d46d893e5cd"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8362467a0fdeccd47935f22c256bec5e6abe543bf0d66e3d3d57a8fb5731863"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11a8c85ef4a07a7638180bf04fe189d12757c696eb41f310d2426895356dcf05"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b316144e85316da2723f9d8dc75bada12fa58489a527091fa1d5a612643d1a0e"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf1ea2e34868f6fbf070e1af291c8180480310173de0b0c43fc38a02929fc0e3"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e546e768d08ad55b20b11dbb78a745151acbd938f8f00d0cfbabe8b0199b9880"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4901165d170a5fde6f589acb90a6b33629ad1ec976d4529e769c6f3d885e3e80"}, - {file = "rpds_py-0.18.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:618a3d6cae6ef8ec88bb76dd80b83cfe415ad4f1d942ca2a903bf6b6ff97a2da"}, - 
{file = "rpds_py-0.18.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ed4eb745efbff0a8e9587d22a84be94a5eb7d2d99c02dacf7bd0911713ed14dd"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c81e5f372cd0dc5dc4809553d34f832f60a46034a5f187756d9b90586c2c307"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:43fbac5f22e25bee1d482c97474f930a353542855f05c1161fd804c9dc74a09d"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d7faa6f14017c0b1e69f5e2c357b998731ea75a442ab3841c0dbbbfe902d2c4"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:08231ac30a842bd04daabc4d71fddd7e6d26189406d5a69535638e4dcb88fe76"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:044a3e61a7c2dafacae99d1e722cc2d4c05280790ec5a05031b3876809d89a5c"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f26b5bd1079acdb0c7a5645e350fe54d16b17bfc5e71f371c449383d3342e17"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:482103aed1dfe2f3b71a58eff35ba105289b8d862551ea576bd15479aba01f66"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1374f4129f9bcca53a1bba0bb86bf78325a0374577cf7e9e4cd046b1e6f20e24"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:635dc434ff724b178cb192c70016cc0ad25a275228f749ee0daf0eddbc8183b1"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:bc362ee4e314870a70f4ae88772d72d877246537d9f8cb8f7eacf10884862432"}, - {file = "rpds_py-0.18.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:4832d7d380477521a8c1644bbab6588dfedea5e30a7d967b5fb75977c45fd77f"}, - {file = "rpds_py-0.18.0.tar.gz", hash = "sha256:42821446ee7a76f5d9f71f9e33a4fb2ffd724bb3e7f93386150b61a43115788d"}, + {file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"}, + {file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"}, + {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"}, + {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"}, + {file = 
"rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"}, + {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"}, + {file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"}, + {file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"}, + {file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"}, + {file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"}, + {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"}, + {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"}, + {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"}, + {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"}, + {file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = "sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"}, + {file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"}, + {file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"}, + {file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"}, + {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"}, + {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"}, + {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"}, + {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"}, + {file = 
"rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"}, + {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"}, + {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"}, + {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"}, + {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"}, + {file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"}, + {file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"}, + {file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"}, + {file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"}, + {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"}, + {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"}, + {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"}, + {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"}, + {file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"}, + {file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"}, + {file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"}, + {file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"}, + {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"}, + {file = 
"rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"}, + {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"}, + {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"}, + {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"}, + {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"}, + {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"}, + {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"}, + {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"}, + {file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"}, + {file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"}, + {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"}, + {file = 
"rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"}, + {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"}, + {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"}, + {file = "rpds_py-0.18.1.tar.gz", hash = 
"sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"}, ] [[package]] @@ -3004,13 +2996,13 @@ optree = ["optree (>=0.9.1)"] [[package]] name = "tqdm" -version = "4.66.2" +version = "4.66.4" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, - {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, + {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"}, + {file = "tqdm-4.66.4.tar.gz", hash = "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb"}, ] [package.dependencies] @@ -3024,13 +3016,13 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.40.1" +version = "4.40.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.40.1-py3-none-any.whl", hash = "sha256:9d5ee0c8142a60501faf9e49a0b42f8e9cb8611823bce4f195a9325a6816337e"}, - {file = "transformers-4.40.1.tar.gz", hash = "sha256:55e1697e6f18b58273e7117bb469cdffc11be28995462d8d5e422fef38d2de36"}, + {file = "transformers-4.40.2-py3-none-any.whl", hash = "sha256:71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d"}, + {file = "transformers-4.40.2.tar.gz", hash = "sha256:657b6054a2097671398d976ad46e60836e7e15f9ea9551631a96e33cb9240649"}, ] [package.dependencies] diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index c2714764..7f0efded 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -11,7 +11,7 @@ googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" @@ -32,15 +32,15 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and 
python_version < "3.13" +transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index c2714764..7f0efded 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -11,7 +11,7 @@ googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" @@ -32,15 +32,15 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index a7969494..6a6b2e0a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -23,7 +23,6 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn @@ -32,7 +31,6 @@ from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - get_linear, ) from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( @@ -41,22 +39,29 @@ from 
text_generation_server.layers.layernorm import ( def load_attention(config, prefix, weights): + bias = config.attention_bias if config.num_attention_heads != config.num_key_value_heads: - return _load_gqa(config, prefix, weights) + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=bias, + ) else: if config.model_type == "baichuan": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.W_pack", weights=weights, - bias=False, + bias=bias, ) elif config.model_type == "phi3": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.qkv_proj", weights=weights, - bias=False, + bias=bias, ) else: return TensorParallelColumnLinear.load_multi( @@ -64,36 +69,10 @@ def load_attention(config, prefix, weights): prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], dim=0, weights=weights, - bias=False, + bias=bias, ) -def _load_gqa(config, prefix: str, weights): - assert config.hidden_size % config.num_attention_heads == 0 - assert config.num_attention_heads % weights.process_group.size() == 0 - - weight = weights.get_multi_weights_col( - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], - quantize=config.quantize, - dim=0, - ) - - if config.quantize not in ["gptq", "awq"]: - weight = weight.to(dtype=weights.dtype).to(device=weights.device) - - head_size = config.hidden_size // config.num_attention_heads - num_heads = config.num_attention_heads // weights.process_group.size() - num_key_value_heads = config.num_key_value_heads // weights.process_group.size() - assert list(weight.shape) == [ - (num_heads + 2 * num_key_value_heads) * head_size, - config.hidden_size, - ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) - - class FlashLlamaAttention(torch.nn.Module): def __init__( self, @@ -214,12 +193,13 @@ class LlamaMLP(nn.Module): ) ) # Fuse gate and up proj + bias = getattr(config, "mlp_bias", False) if config.model_type == "phi3": self.gate_up_proj = TensorParallelColumnLinear.load_gate_up( config, prefix=f"{prefix}.gate_up_proj", weights=weights, - bias=False, + bias=bias, ) else: self.gate_up_proj = TensorParallelColumnLinear.load_multi( @@ -227,13 +207,13 @@ class LlamaMLP(nn.Module): prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], weights=weights, dim=0, - bias=False, + bias=bias, ) self.down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, - bias=False, + bias=bias, ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() ) @@ -385,9 +365,14 @@ class FlashLlamaForCausalLM(torch.nn.Module): weights=weights, ) self.model = FlashLlamaModel(prefix, config, weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + self.lm_head = SpeculativeHead.load( config, - prefix="lm_head" if not prefix else f"{prefix}.lm_head", + prefix=suffix if not prefix else f"{prefix}.{suffix}", weights=weights, ) From 3136f27f361b1a31ccd469998cc9e99249e759b8 Mon Sep 17 00:00:00 2001 From: Nilabhra Roy Chowdhury Date: Tue, 14 May 2024 10:06:02 +0200 Subject: [PATCH 09/12] Add: Support for the Falcon2 11B architecture (#1886) # What does this PR do? Adds support for the Falcon2 11B model architecture.
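The core modeling change makes the number of layer norms feeding the parallel attention/MLP block configurable via `num_ln_in_parallel_attn`: either one shared `input_layernorm`, or the separate `ln_attn`/`ln_mlp` pair that the existing large-Falcon layers use, with `rope_theta` also read from the config. The snippet below is a minimal, self-contained sketch of that dispatch; it uses plain `torch.nn.LayerNorm` and a made-up constructor for illustration, not the `FlashRWLayerNorm` module this patch adds (which also threads a residual through TGI's `FastLayerNorm`).

```python
# Illustrative sketch only (hypothetical class, not part of this patch):
# a layer holds either one shared layer norm or a separate attention/MLP pair,
# chosen by a config value, mirroring the num_ln_in_parallel_attn dispatch.
import torch
from torch import nn


class ParallelAttnLayerNorm(nn.Module):
    def __init__(self, hidden_size: int, num_ln: int, eps: float = 1e-5):
        super().__init__()
        self.num_ln = num_ln
        if num_ln == 1:
            self.input_ln = nn.LayerNorm(hidden_size, eps=eps)
        elif num_ln == 2:
            self.ln_attn = nn.LayerNorm(hidden_size, eps=eps)
            self.ln_mlp = nn.LayerNorm(hidden_size, eps=eps)
        else:
            raise ValueError("Number of layer norms can either be 1 or 2.")

    def forward(self, hidden_states: torch.Tensor):
        if self.num_ln == 1:
            # One norm feeds both the attention branch and the MLP branch.
            ln = self.input_ln(hidden_states)
            return ln, ln
        # Two norms: one per branch, matching the pre-existing two-norm layout.
        return self.ln_attn(hidden_states), self.ln_mlp(hidden_states)


if __name__ == "__main__":
    x = torch.randn(2, 8, 64)
    print(ParallelAttnLayerNorm(64, num_ln=1)(x)[0].shape)  # torch.Size([2, 8, 64])
    print(ParallelAttnLayerNorm(64, num_ln=2)(x)[1].shape)  # torch.Size([2, 8, 64])
```

With two norms the behavior matches the layers that already exist, so previously supported checkpoints are unaffected; the single-norm path is the new option this architecture support introduces.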
## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --------- Signed-off-by: Raphael Glon Signed-off-by: Wang, Yi A Co-authored-by: OlivierDehaene Co-authored-by: Nicolas Patry Co-authored-by: oOraph <13552058+oOraph@users.noreply.github.com> Co-authored-by: Raphael Glon Co-authored-by: Julien Chaumond Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: abhishek thakur <1183441+abhishekkrthakur@users.noreply.github.com> Co-authored-by: Dong Shin Co-authored-by: Christof Weickhardt Co-authored-by: Ikko Eltociear Ashimine Co-authored-by: drbh Co-authored-by: Lucain Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> Co-authored-by: Moritz Laurer <41862082+MoritzLaurer@users.noreply.github.com> Co-authored-by: dr3s Co-authored-by: Wang, Yi Co-authored-by: Morgan Funtowicz Co-authored-by: Maziyar Panahi Co-authored-by: Brandon Royal <2762697+brandonroyal@users.noreply.github.com> Co-authored-by: Mishig Co-authored-by: Martin Iglesias Goyanes Co-authored-by: martini --- .../custom_modeling/flash_llama_modeling.py | 3 +- .../custom_modeling/flash_rw_modeling.py | 101 ++++++++++++------ 2 files changed, 72 insertions(+), 32 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 6a6b2e0a..40ccb576 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -18,9 +18,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Optional, Tuple + import torch import torch.distributed - from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index 52ea3ae1..fa463a19 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -1,26 +1,21 @@ +from typing import List, Optional, Tuple + import torch import torch.distributed - from torch import nn -from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig -from typing import Optional, List, Tuple +from transformers.modeling_utils import PreTrainedModel -from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.flash_attn import attention from text_generation_server.layers import ( - TensorParallelRowLinear, + SpeculativeHead, TensorParallelColumnLinear, TensorParallelEmbedding, - SpeculativeHead, + TensorParallelRowLinear, get_linear, ) -from text_generation_server.layers.layernorm import ( - FastLayerNorm, -) -from text_generation_server.layers.rotary import ( - PositionRotaryEmbedding, -) +from text_generation_server.layers.layernorm import FastLayerNorm +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils import flash_attn, paged_attention def load_row(config, prefix: str, weights, bias: bool): @@ -52,6 +47,7 @@ class RWConfig(PretrainedConfig): hidden_size=64, num_hidden_layers=None, num_attention_heads=None, + num_ln_in_prallel_attention=None, layer_norm_epsilon=1e-5, initializer_range=0.02, use_cache=True, @@ -65,6 +61,7 @@ class RWConfig(PretrainedConfig): new_decoder_architecture=None, bias=False, parallel_attn=False, + rope_theta=10_000.0, **kwargs, ): if alibi: @@ -75,6 +72,7 @@ class RWConfig(PretrainedConfig): self.model_type = model_type self.alibi = False self.rotary = True + self.rope_theta = rope_theta self.vocab_size = vocab_size # Backward compatibility with n_embed kwarg @@ -91,6 +89,7 @@ class RWConfig(PretrainedConfig): else kwargs.pop("n_head", 8) ) self.layer_norm_epsilon = layer_norm_epsilon + self.num_ln_in_parallel_attention = num_ln_in_prallel_attention self.initializer_range = initializer_range self.use_cache = use_cache self.hidden_dropout = hidden_dropout @@ -132,9 +131,13 @@ class FlashRWAttention(torch.nn.Module): self.num_heads_kv = config.n_head_kv self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -244,9 +247,13 @@ class FlashRWLargeAttention(torch.nn.Module): self.hidden_size = hidden_size self.head_size = hidden_size // num_heads self.num_groups = num_groups + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -257,7 +264,7 @@ class 
FlashRWLargeAttention(torch.nn.Module): if process_group.size() > self.num_groups: raise NotImplementedError( - f"Tensor Parallelism is not implemented for world_size > n groups" + "Tensor Parallelism is not implemented for world_size > n groups" ) if self.num_groups % process_group.size() != 0: raise NotImplementedError( @@ -459,29 +466,61 @@ class FlashRWLayer(nn.Module): max_s, ) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual - ) + if self.post_attention_layernorm is not None: + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) mlp_output = self.mlp(hidden_states) return mlp_output, residual +class FlashRWLayerNorm(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.num_ln = config.num_ln_in_parallel_attn + + if self.num_ln == 1: + self.input_ln = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + elif self.num_ln == 2: + self.ln_attn = FastLayerNorm.load( + prefix=f"{prefix}.ln_attn", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.ln_mlp = FastLayerNorm.load( + prefix=f"{prefix}.ln_mlp", + weights=weights, + eps=config.layer_norm_epsilon, + ) + else: + raise ValueError("Number of layer norms can either be 1 or 2.") + + def forward( + self, + hidden_states, + residual, + ): + if self.num_ln == 1: + ln_hidden_states, residual = self.input_ln(hidden_states, residual) + return ln_hidden_states, ln_hidden_states, residual + elif self.num_ln == 2: + ln_attn, residual = self.ln_attn(hidden_states, residual) + ln_mlp, _ = self.ln_mlp(residual) + return ln_attn, ln_mlp, residual + + class FlashRWLargeLayer(nn.Module): def __init__(self, layer_id, config, weights): super().__init__() prefix = f"transformer.h.{layer_id}" - self.ln_attn = FastLayerNorm.load( - prefix=f"{prefix}.ln_attn", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.ln_mlp = FastLayerNorm.load( - prefix=f"{prefix}.ln_mlp", - weights=weights, - eps=config.layer_norm_epsilon, - ) + + self.ln_layer = FlashRWLayerNorm(config, prefix, weights) self.self_attention = FlashRWLargeAttention( config, @@ -507,8 +546,8 @@ class FlashRWLargeLayer(nn.Module): input_lengths, max_s, ): - ln_attn, residual = self.ln_attn(hidden_states, residual) - ln_mlp, _ = self.ln_mlp(residual) + # Layer norm. + ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual) # Self attention. attn_output = self.self_attention( From e3d765645a914c7adda801594a3011546e4dccf9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 14 May 2024 12:33:18 +0200 Subject: [PATCH 10/12] MLPSpeculator. (#1865) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? 
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --------- Co-authored-by: Joshua Rosenkranz --- .../text_generation_server/layers/__init__.py | 2 +- .../text_generation_server/layers/medusa.py | 35 ++-- server/text_generation_server/layers/mlp.py | 176 ++++++++++++++++++ .../layers/speculative.py | 43 +++-- .../text_generation_server/models/__init__.py | 148 ++++++++++----- server/text_generation_server/models/bloom.py | 4 +- .../models/causal_lm.py | 6 +- .../models/custom_modeling/idefics2.py | 4 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/flash_causal_lm.py | 2 + .../models/flash_cohere.py | 4 +- .../models/flash_dbrx.py | 4 +- .../models/flash_gemma.py | 4 +- .../models/flash_llama.py | 4 +- .../models/flash_mistral.py | 8 +- .../models/flash_mixtral.py | 4 +- .../models/flash_neox.py | 4 +- .../models/flash_phi.py | 16 +- .../models/flash_qwen2.py | 4 +- .../text_generation_server/models/flash_rw.py | 4 +- .../models/flash_santacoder.py | 4 +- .../models/flash_starcoder2.py | 4 +- .../models/galactica.py | 4 +- .../text_generation_server/models/gpt_neox.py | 4 +- .../text_generation_server/models/idefics.py | 4 +- .../text_generation_server/models/idefics2.py | 4 +- .../models/llava_next.py | 4 +- server/text_generation_server/models/mamba.py | 4 +- server/text_generation_server/models/mpt.py | 4 +- server/text_generation_server/models/opt.py | 4 +- server/text_generation_server/models/phi.py | 4 +- server/text_generation_server/models/rw.py | 4 +- .../models/santacoder.py | 2 +- .../models/seq2seq_lm.py | 6 +- server/text_generation_server/models/t5.py | 4 +- 35 files changed, 399 insertions(+), 139 deletions(-) create mode 100644 server/text_generation_server/layers/mlp.py diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py index c3a6c921..c29dd092 100644 --- a/server/text_generation_server/layers/__init__.py +++ b/server/text_generation_server/layers/__init__.py @@ -3,11 +3,11 @@ from text_generation_server.layers.tensor_parallel import ( TensorParallelRowLinear, TensorParallelEmbedding, ) -from text_generation_server.layers.speculative import SpeculativeHead from text_generation_server.layers.linear import ( get_linear, FastLinear, ) +from text_generation_server.layers.speculative import SpeculativeHead # Just to add the `load` methods. 
from text_generation_server.layers.layernorm import load_layer_norm diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py index 4ac86978..2e9a010f 100644 --- a/server/text_generation_server/layers/medusa.py +++ b/server/text_generation_server/layers/medusa.py @@ -69,21 +69,24 @@ class MedusaHeadV1(nn.Module): from safetensors import safe_open import json - use_medusa = config.use_medusa + speculator = config.speculator - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + path = speculator["path"] + medusa_config = str(Path(path) / "config.json") - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename + for fname in speculator["model_paths"]: + filename = str(Path(path) / fname) + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename medusa = MedusaModel(config, medusa_config, weights) lm_head = TensorParallelHead.load(config, prefix, weights) @@ -108,10 +111,10 @@ class MedusaHeadV2(nn.Module): from safetensors import safe_open import json - use_medusa = config.use_medusa + speculator = config.speculator - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + medusa_config = str(Path(speculator) / "config.json") + filename = str(Path(speculator) / "medusa_lm_head.safetensors") with open(medusa_config, "r") as f: medusa_config = json.load(f) diff --git a/server/text_generation_server/layers/mlp.py b/server/text_generation_server/layers/mlp.py new file mode 100644 index 00000000..f08cb673 --- /dev/null +++ b/server/text_generation_server/layers/mlp.py @@ -0,0 +1,176 @@ +import torch +import math +from torch import nn +from torch.nn import functional as F +from typing import Optional, Tuple +from text_generation_server.layers import TensorParallelEmbedding, FastLinear +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.utils.speculate import get_speculate + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + elementwise_scale_weight : torch.Tensor + learned scaling term after normalization? + elementwise_shift_bias : torch.Tensor + learned bias term after normalization? + eps : float + Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8). 
+ """ + + def __init__( + self, + prefix, + config, + weights, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = weights.get_tensor(f"{prefix}.weight") + self.bias = weights.get_tensor(f"{prefix}.bias") + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculatorModel(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.config = config + self.n_predict = get_speculate() + self.hidden_size = config.hidden_size + self.emb = nn.ModuleList( + [ + TensorParallelEmbedding(f"{prefix}.emb.{i}", weights) + for i in range(self.n_predict) + ] + ) + self.proj = [ + FastLinear.load( + config, + prefix=f"{prefix}.proj.{i}", + weights=weights, + bias=False, + ) + for i in range(self.n_predict) + ] + self.head = nn.ModuleList( + [ + FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False) + for i in range(self.n_predict) + ] + ) + self.ln = nn.ModuleList( + [ + MLPSpeculatorLayerNorm( + prefix=f"{prefix}.ln.{i}", + config=config, + weights=weights, + ) + for i in range(self.n_predict) + ] + ) + + # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation + self.state_weight = 0.5 ** (0.5 / self.n_predict) + self.emb_weight = math.sqrt(1 - self.state_weight**2) + self.activation = nn.GELU() + # TODO + self.vsize = config.vocab_size + self.inner_dim = config.speculator_config["inner_dim"] + self.top_k_tokens_per_head = [1] * self.n_predict + + def forward( + self, + hidden_states: torch.Tensor, + input_ids: torch.Tensor, + ): + top_k_tokens_per_head = self.top_k_tokens_per_head + + # k indicates # of candidates + # h indicates # of generated tokens + state = hidden_states + b = state.size(0) + ind = input_ids.unsqueeze(0) + all_probs = torch.empty( + b, self.n_predict, self.vsize, device=state.device + ) # b k h v + assert ( + len(top_k_tokens_per_head) == self.n_predict + ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)" + for i in range(self.n_predict): + # Project and predict + z = self.emb[i](ind) + z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2)) # b k d + state = self.proj[i](state) * self.state_weight + z + state = self.activation(self.ln[i](state)) # b k d + probs = F.log_softmax(self.head[i](state), dim=-1) # b k v + _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1) # b k k' + + # Update candidate set with new predictions + + # Update distribution set with new logits + all_probs[:, i] = probs.exp() + + # Update state, log_probs and ind for new predictions + state = state.unsqueeze(2).expand( + -1, -1, top_k_tokens_per_head[i], -1 + ) # b k k' d + state = state.reshape(-1, b, state.size(3)) # b kk' d + ind = preds.view(-1, b) # b kk' + + speculative_logits = all_probs + return speculative_logits + + +class MLPSpeculatorHead(nn.Module): + def __init__(self, lm_head, mlp_speculator): + super().__init__() + self.lm_head = lm_head + self.mlp_speculator = mlp_speculator + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + input_ids = logits.argmax(dim=-1) + speculative_logits = self.mlp_speculator(input, input_ids) + return logits, speculative_logits + + 
@staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + + speculator_path = config.speculator["path"] + + for fname in config.speculator["model_paths"]: + filename = str(Path(speculator_path) / fname) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + mlp_speculator = MLPSpeculatorModel(config, "speculator", weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MLPSpeculatorHead(lm_head, mlp_speculator) diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py index 663f8c2e..4b977a56 100644 --- a/server/text_generation_server/layers/speculative.py +++ b/server/text_generation_server/layers/speculative.py @@ -1,34 +1,51 @@ import torch +import json from typing import Tuple, Optional -from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.mlp import MLPSpeculatorHead class SpeculativeHead(torch.nn.Module): - def __init__(self, lm_head, medusa): + def __init__(self, lm_head, speculator): super().__init__() self.head = lm_head - self.medusa = medusa + self.speculator = speculator @staticmethod def load(config, prefix: str, weights): - use_medusa = config.use_medusa - if use_medusa: - lm_head = None + speculator = config.speculator + if speculator: + speculator_path = config.speculator["path"] + speculator_config = str(speculator_path / "config.json") + + with open(speculator_config, "r") as f: + speculator_config = json.load(f) + + config.speculator_config = speculator_config try: - medusa = MedusaHeadV1.load(config, prefix, weights) - except: - medusa = MedusaHeadV2(config, prefix, weights) + architecture = speculator_config["architectures"][0] + + if architecture == "MLPSpeculatorPreTrainedModel": + speculator = MLPSpeculatorHead.load(config, prefix, weights) + else: + speculator = None + except KeyError: + try: + speculator = MedusaHeadV1.load(config, prefix, weights) + except: + speculator = MedusaHeadV2(config, prefix, weights) + lm_head = None else: lm_head = TensorParallelHead.load(config, prefix, weights) - medusa = None - return SpeculativeHead(lm_head, medusa) + speculator = None + return SpeculativeHead(lm_head, speculator) def forward( self, input: torch.Tensor ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.medusa is not None: - return self.medusa(input) + if self.speculator is not None: + return self.speculator(input) assert self.head is not None logits = self.head(input) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index b52765d7..e9761dfe 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1,9 +1,10 @@ import torch +import os from loguru import logger from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import modeling_auto -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download, HfApi from typing import Optional from pathlib import Path @@ -135,8 +136,9 @@ def get_model( config_dict, _ 
= PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + model_type = config_dict.get("model_type", None) - use_medusa = None + speculator = None if "medusa_num_heads" in config_dict: medusa_model_id = model_id medusa_revision = revision @@ -156,6 +158,8 @@ def get_model( config_dict, _ = PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) is_local = Path(medusa_model_id).exists() if not is_local: medusa_config = hf_hub_download( @@ -166,11 +170,70 @@ def get_model( revision=medusa_revision, filename="medusa_lm_head.safetensors", ) - use_medusa = Path(medusa_config).parent + speculator = { + "path": Path(medusa_config).parent, + "model_paths": ["medusa_lm_head.safetensors"], + } else: - use_medusa = Path(medusa_model_id) + speculator = { + "path": Path(medusa_model_id), + "model_paths": ["medusa_lm_head.safetensors"], + } method = "medusa" + elif model_type == "mlp_speculator": + mlp_model_id = model_id + mlp_revision = revision + model_id = config_dict["base_model_name_or_path"] + revision = "main" + speculate_mlp = config_dict["n_predict"] + if speculate is not None: + if speculate > speculate_mlp: + raise RuntimeError( + f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match" + ) + else: + set_speculate(speculate) + else: + set_speculate(speculate_mlp) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) + is_local = Path(mlp_model_id).exists() + extension = ".safetensors" + if not is_local: + mlp_speculator_config = hf_hub_download( + mlp_model_id, revision=mlp_revision, filename="config.json" + ) + api = HfApi() + info = api.model_info(mlp_model_id, revision=mlp_revision) + filenames = [ + s.rfilename + for s in info.siblings + if s.rfilename.endswith(extension) + and len(s.rfilename.split("/")) == 1 + and "arguments" not in s.rfilename + and "args" not in s.rfilename + and "training" not in s.rfilename + ] + for filename in filenames: + hf_hub_download( + mlp_model_id, + revision=mlp_revision, + filename=filename, + ) + speculator = { + "path": Path(mlp_speculator_config).parent, + "model_paths": filenames, + } + else: + speculator = Path(mlp_model_id) + filenames = [p for p in os.listdir(speculator) if p.endswith(extension)] + speculator = {"path": speculator, "model_paths": filenames} + method = "mlp_speculator" else: method = "n-gram" @@ -178,7 +241,6 @@ def get_model( if speculate > 0: logger.info(f"Using speculation {method} with {speculate} input ids.") - model_type = config_dict.get("model_type", None) if model_type is None: # TODO: fix how we determine model type for Mamba if "ssm_cfg" in config_dict: @@ -202,7 +264,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -212,7 +274,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -227,7 +289,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -240,7 +302,7 @@ def get_model( model_id, revision, 
quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -250,7 +312,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -259,7 +321,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -270,7 +332,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -279,7 +341,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -288,7 +350,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -299,7 +361,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -308,7 +370,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -323,7 +385,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -334,7 +396,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -345,7 +407,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -355,7 +417,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -366,7 +428,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -377,7 +439,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -388,7 +450,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -399,7 +461,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -410,7 +472,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -424,7 +486,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -435,7 +497,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -444,7 +506,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -458,7 +520,7 @@ def get_model( model_id, revision, quantize=quantize, - 
use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -469,7 +531,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -483,7 +545,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -494,7 +556,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -520,7 +582,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -544,7 +606,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -554,7 +616,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -564,7 +626,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -574,7 +636,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -586,7 +648,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -599,7 +661,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -623,7 +685,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -632,7 +694,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -644,7 +706,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -653,7 +715,7 @@ def get_model( model_id, revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 67129ec3..1e3dd10c 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -42,7 +42,7 @@ class BLOOMSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -71,7 +71,7 @@ class BLOOMSharded(CausalLM): ) config.pad_token_id = 3 config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 93ec6ba4..81a02163 100644 --- 
a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -482,12 +482,12 @@ class CausalLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") if torch.cuda.is_available(): device = torch.device("cuda") diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 935f049b..51fd7c02 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -683,9 +683,9 @@ class Idefics2ForConditionalGeneration(nn.Module): def __init__(self, prefix, config, weights): super().__init__() config.vision_config.quantize = config.quantize - config.vision_config.use_medusa = config.use_medusa + config.vision_config.speculator = config.speculator config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa + config.text_config.speculator = config.speculator vision_config = config.vision_config self.text_model = load_text_model( diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index a049f756..de9673aa 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -135,7 +135,7 @@ class LlavaNextForConditionalGeneration(nn.Module): self.vocab_size = config.text_config.vocab_size self.config = config config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa + config.text_config.speculator = config.speculator self.language_model = load_text_model( prefix="language_model" if not prefix else f"{prefix}.language_model", config=config.text_config, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index f567bea9..5aa7a568 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1101,6 +1101,8 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 + logger.info(f"Accepted ids {n_accepted_ids}") + current_stopped = False for j in range(index, index + n_accepted_ids): # Generated token diff --git a/server/text_generation_server/models/flash_cohere.py b/server/text_generation_server/models/flash_cohere.py index f85c7722..b907ee08 100644 --- a/server/text_generation_server/models/flash_cohere.py +++ b/server/text_generation_server/models/flash_cohere.py @@ -24,7 +24,7 @@ class FlashCohere(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -49,7 +49,7 @@ class FlashCohere(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git 
a/server/text_generation_server/models/flash_dbrx.py b/server/text_generation_server/models/flash_dbrx.py index 367d3db0..d5eb1a6e 100644 --- a/server/text_generation_server/models/flash_dbrx.py +++ b/server/text_generation_server/models/flash_dbrx.py @@ -26,7 +26,7 @@ class FlashDbrx(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -74,7 +74,7 @@ class FlashDbrx(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_gemma.py b/server/text_generation_server/models/flash_gemma.py index 7259b820..9c00a056 100644 --- a/server/text_generation_server/models/flash_gemma.py +++ b/server/text_generation_server/models/flash_gemma.py @@ -25,7 +25,7 @@ class FlashGemma(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -50,7 +50,7 @@ class FlashGemma(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 8ea70713..796fbd47 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -27,7 +27,7 @@ class FlashLlama(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -71,7 +71,7 @@ class FlashLlama(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 48304ad8..b83f49a4 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -313,7 +313,7 @@ class BaseFlashMistral(FlashCausalLM): config_cls=AutoConfig, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, tokenizer_class=AutoTokenizer, @@ -340,7 +340,7 @@ class BaseFlashMistral(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if getattr(config, "sliding_window", None) is not None: @@ -567,7 +567,7 @@ class FlashMistral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: 
bool = False, ): @@ -577,7 +577,7 @@ class FlashMistral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_mixtral.py b/server/text_generation_server/models/flash_mixtral.py index 2ee35e82..587d423f 100644 --- a/server/text_generation_server/models/flash_mixtral.py +++ b/server/text_generation_server/models/flash_mixtral.py @@ -15,7 +15,7 @@ class FlashMixtral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -25,7 +25,7 @@ class FlashMixtral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index 1119bdae..adefaeb2 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -25,7 +25,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -51,7 +51,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/flash_phi.py b/server/text_generation_server/models/flash_phi.py index cb55f9e6..32b573a9 100644 --- a/server/text_generation_server/models/flash_phi.py +++ b/server/text_generation_server/models/flash_phi.py @@ -25,7 +25,7 @@ class FlashPhi(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class FlashPhi(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) @@ -58,7 +58,7 @@ class FlashPhi(FlashCausalLM): weights._set_gptq_params(model_id, revision) model = FlashPhiForCausalLM(config, weights) - if use_medusa: + if speculator: from text_generation_server.utils.medusa import MedusaModel from huggingface_hub import hf_hub_download import json @@ -66,19 +66,19 @@ class FlashPhi(FlashCausalLM): from pathlib import Path is_local_model = ( - Path(use_medusa).exists() and Path(use_medusa).is_dir() + Path(speculator).exists() and Path(speculator).is_dir() ) or os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None if not is_local_model: medusa_config = hf_hub_download( - use_medusa, revision=revision, filename="config.json" + speculator, revision=revision, filename="config.json" ) medusa_head = hf_hub_download( - use_medusa, revision=revision, filename="medusa_lm_head.pt" + speculator, revision=revision, 
filename="medusa_lm_head.pt" ) else: - medusa_config = str(Path(use_medusa) / "config.json") - medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt") + medusa_config = str(Path(speculator) / "config.json") + medusa_head = str(Path(speculator) / "medusa_lm_head.pt") with open(medusa_config, "r") as f: config = json.load(f) diff --git a/server/text_generation_server/models/flash_qwen2.py b/server/text_generation_server/models/flash_qwen2.py index cb3cf6b0..59064b30 100644 --- a/server/text_generation_server/models/flash_qwen2.py +++ b/server/text_generation_server/models/flash_qwen2.py @@ -30,7 +30,7 @@ class FlashQwen2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class FlashQwen2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index 33298e1a..e6350611 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -26,7 +26,7 @@ class FlashRWSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -66,7 +66,7 @@ class FlashRWSharded(FlashCausalLM): ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator if config.quantize == "gptq": weights._set_gptq_params(model_id, revision) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 66698a3a..2ad36b93 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -29,7 +29,7 @@ class FlashSantacoderSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -57,7 +57,7 @@ class FlashSantacoderSharded(FlashCausalLM): trust_remote_code=True, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.transpose = config.architectures[0].startswith("GPT2") torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_starcoder2.py b/server/text_generation_server/models/flash_starcoder2.py index 68e726d8..dc5d49be 100644 --- a/server/text_generation_server/models/flash_starcoder2.py +++ b/server/text_generation_server/models/flash_starcoder2.py @@ -29,7 +29,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = 
speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index a46f86be..4656fd45 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -167,7 +167,7 @@ class GalacticaSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -195,7 +195,7 @@ class GalacticaSharded(CausalLM): ) config.quantize = quantize tokenizer.pad_token_id = config.pad_token_id - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 1c4cfe7d..c0e1adf2 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -24,7 +24,7 @@ class GPTNeoxSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -51,7 +51,7 @@ class GPTNeoxSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py index 30bf4aa6..c1fe03e4 100644 --- a/server/text_generation_server/models/idefics.py +++ b/server/text_generation_server/models/idefics.py @@ -31,7 +31,7 @@ class IDEFICSSharded(IdeficsCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.vision_config.quantize = quantize tokenizer = LlamaTokenizerFast.from_pretrained( diff --git a/server/text_generation_server/models/idefics2.py b/server/text_generation_server/models/idefics2.py index e831af89..314c0500 100644 --- a/server/text_generation_server/models/idefics2.py +++ b/server/text_generation_server/models/idefics2.py @@ -18,7 +18,7 @@ class Idefics2(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -35,7 +35,7 @@ class Idefics2(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/llava_next.py b/server/text_generation_server/models/llava_next.py index 3983bc85..effe8b91 100644 --- a/server/text_generation_server/models/llava_next.py +++ 
b/server/text_generation_server/models/llava_next.py @@ -18,7 +18,7 @@ class LlavaNext(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -30,7 +30,7 @@ class LlavaNext(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py index 0884317e..b28b744f 100644 --- a/server/text_generation_server/models/mamba.py +++ b/server/text_generation_server/models/mamba.py @@ -408,7 +408,7 @@ class Mamba(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -445,7 +445,7 @@ class Mamba(Model): tokenizer.pad_token = tokenizer.eos_token config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py index 6b3f29a6..8d8b4909 100644 --- a/server/text_generation_server/models/mpt.py +++ b/server/text_generation_server/models/mpt.py @@ -43,7 +43,7 @@ class MPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -76,7 +76,7 @@ class MPTSharded(CausalLM): config = json.load(f) config = PretrainedConfig(**config) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 703e5b58..5b84f4ff 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -22,7 +22,7 @@ class OPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class OPTSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator tokenizer.pad_token_id = config.pad_token_id torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/phi.py b/server/text_generation_server/models/phi.py index cc4e2505..d68866c1 100644 --- a/server/text_generation_server/models/phi.py +++ b/server/text_generation_server/models/phi.py @@ -22,7 +22,7 @@ class Phi(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class Phi(CausalLM): tokenizer.pad_token = tokenizer.eos_token config.quantize = 
quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py index 92c93542..d4764ded 100644 --- a/server/text_generation_server/models/rw.py +++ b/server/text_generation_server/models/rw.py @@ -12,11 +12,11 @@ class RW(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: + if speculator: raise RuntimeError("Medusa decoding is not enabled for AutoModel") if torch.cuda.is_available(): diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index 73c21cce..323e4324 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -19,7 +19,7 @@ class SantaCoder(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index e55a661c..6a0c812f 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -532,12 +532,12 @@ class Seq2SeqLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") if torch.cuda.is_available(): device = torch.device("cuda") diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 3f3cb965..8e0735e5 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -25,7 +25,7 @@ class T5Sharded(Seq2SeqLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -43,7 +43,7 @@ class T5Sharded(Seq2SeqLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator tokenizer = AutoTokenizer.from_pretrained( model_id, From 33bc7212afd2bbf168c6735cd66c96a6cbf69c4e Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 14 May 2024 18:15:56 +0200 Subject: [PATCH 11/12] Fixing truncation. (#1890) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? 
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- router/src/validation.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/router/src/validation.rs b/router/src/validation.rs index be4bef00..f85b169c 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -119,7 +119,11 @@ impl Validation { // If we have a fast tokenizer if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? { // Create response channel - let input_length = encoding.len(); + let input_length = if let Some(truncate) = truncate { + std::cmp::min(encoding.len(), truncate) + } else { + encoding.len() + }; // Get total tokens let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens { From 92f1338b84d12e9a0fd870c37a29706c3416a53d Mon Sep 17 00:00:00 2001 From: Brandon Lockaby Date: Tue, 14 May 2024 14:23:39 -0400 Subject: [PATCH 12/12] Correct 'using guidance' link (#1892) Fix typo in link to 'using guidance' article --- docs/source/conceptual/guidance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conceptual/guidance.md b/docs/source/conceptual/guidance.md index 0ce34f2f..a566c4a6 100644 --- a/docs/source/conceptual/guidance.md +++ b/docs/source/conceptual/guidance.md @@ -76,7 +76,7 @@ There are two main ways to use guidance; you can either use the `/generate` endp Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools. -Please refer to [using guidance](../basic_tutorial/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL. +Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL. ### Getting the most out of guidance