diff --git a/server/text_generation_server/layers/gptq/exllamav2.py.rej b/server/text_generation_server/layers/gptq/exllamav2.py.rej
deleted file mode 100644
index ff13179b..00000000
--- a/server/text_generation_server/layers/gptq/exllamav2.py.rej
+++ /dev/null
@@ -1,253 +0,0 @@
-diff a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py (rejected hunks)
-@@ -1,10 +1,15 @@
- # Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
- 
-+from dataclasses import dataclass
-+from typing import Optional
- import torch
- import torch.nn as nn
- 
- from loguru import logger
- 
-+from text_generation_server.layers.exl2 import Exl2Weight
-+from text_generation_server.layers.gptq import GPTQWeight
-+
- try:
-     from exllamav2_kernels import make_q_matrix, gemm_half_q_half
- except ImportError:
-@@ -15,6 +20,15 @@ except ImportError:
- none_tensor = torch.empty((1, 1), device="meta")
- 
- 
-+@dataclass
-+class _ExtraTensors:
-+    """Additional generated quantizer tensors."""
-+
-+    q_group_map: Optional[torch.Tensor] = None
-+    q_invperm: Optional[torch.Tensor] = None
-+    q_perm: Optional[torch.Tensor] = None
-+
-+
- def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
-     """Matrix multiplication, returns x @ q4"""
-     output_shape = x.shape[:-1] + (q4_width,)
-@@ -24,11 +38,7 @@ def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
-     return output.view(output_shape)
- 
- 
--# Group map needed for irregular group sizes
--
--
--def make_group_map(q_groups, num_qrows):
--
-+def make_group_map(q_groups: torch.Tensor, num_qrows: int):
-     gr = q_groups.tolist()
-     group_map = []
-     num_groups = len(gr) // 2
-@@ -50,72 +60,72 @@ def make_group_map(q_groups, num_qrows):
- # Create Q matrix
- 
- 
--def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
-+def ext_make_q_matrix(
-+    w: Exl2Weight | GPTQWeight,
-+    extra: _ExtraTensors,
-+    temp_dq,
-+    key: Optional[str] = None,
-+):
-     """
-     Create Q matrix
-     """
-     # EXL2
--    # won't work as the moment because the tensors are not the same.
--    if "q_weight" in w:
--        w["q_scale_max"] /= 256
--        w["q_perm"] = w["q_perm"].short()
--        w["q_invperm"] = w["q_invperm"].short()
--
--        if "q_group_map" not in w:
--            w["q_group_map"] = make_group_map(w["q_groups"], w["q_weight"].shape[0])
-+    if isinstance(w, Exl2Weight):
-+        extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
-+        extra.q_perm = torch.argsort(w.q_invperm).short()
- 
-         return make_q_matrix(
--            w["q_weight"],
--            w["q_perm"],
--            w["q_invperm"],
--            w["q_scale"],
--            w["q_scale_max"],
--            w["q_groups"],
--            w["q_group_map"],
-+            w.q_weight,
-+            extra.q_perm,
-+            w.q_invperm,
-+            w.q_scale,
-+            w.q_scale_max,
-+            w.q_groups,
-+            extra.q_group_map,
-             none_tensor,
-             none_tensor,
-             none_tensor,
-             temp_dq,
-         )
-     # GPTQ
--    elif "qweight" in w:
--        if w["scales"].dtype == torch.float:
--            w["scales"] = w["scales"].half()
-+    elif isinstance(w, GPTQWeight):
-+        if w.scales.dtype == torch.float:
-+            w.scales = w.scales.half()
- 
-         # GPTQ with g_idx (act_order)
--        if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
--            w["q_perm"] = torch.empty(
--                (w["qweight"].shape[0] * 8,),
-+        if w.g_idx is not None and not (w.g_idx == 0).all().item():
-+            extra.q_perm = torch.empty(
-+                (w.qweight.shape[0] * 8,),
-                 dtype=torch.short,
--                device=w["qweight"].device,
-+                device=w.qweight.device,
-             )
--            w["q_invperm"] = torch.empty_like(w["q_perm"])
-+            extra.q_invperm = torch.empty_like(extra.q_perm)
-             # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
-             return make_q_matrix(
--                w["qweight"],
--                w["q_perm"],
--                w["q_invperm"],
-+                w.qweight,
-+                extra.q_perm,
-+                extra.q_invperm,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
--                w["qzeros"],
--                w["scales"],
--                w["g_idx"].cpu(),
-+                w.qzeros,
-+                w.scales,
-+                w.g_idx.cpu(),
-                 temp_dq,
-             )
-         # GPTQ without g_idx
-         else:
-             return make_q_matrix(
--                w["qweight"],
-+                w.qweight,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
-                 none_tensor,
--                w["qzeros"],
--                w["scales"],
-+                w.qzeros,
-+                w.scales,
-                 none_tensor,
-                 temp_dq,
-             )
-@@ -124,7 +134,6 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
- 
- 
- DEVICE = None
--FIXED_BYTES = 0
- LAYERS = []
- 
- 
-@@ -134,8 +143,13 @@ def set_device(device):
- 
- 
- def create_exllama_buffers(max_total_tokens: int):
--    global FIXED_BYTES, LAYERS, DEVICE
--    temp_dq = ExLlamaV2DeviceTensors(DEVICE, FIXED_BYTES)
-+    global LAYERS, DEVICE
-+
-+    # Find the size of the scratch space.
-+    scratch_bytes = max(
-+        layer.scratch_space_fixed(max_input_len=max_total_tokens) for layer in LAYERS
-+    )
-+    temp_dq = ExLlamaV2DeviceTensors(DEVICE, scratch_bytes)
- 
-     for layer in LAYERS:
-         layer.post_init(temp_dq)
-@@ -146,49 +160,48 @@ class QuantLinear(nn.Module):
- 
-     """Linear layer implementation with per-group 4-bit quantization of the weights"""
- 
--    # def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
--    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-+    def __init__(
-+        self,
-+        weight: Exl2Weight | GPTQWeight,
-+        bias: torch.Tensor,
-+    ):
-         super().__init__()
--        if bits != 4:
--            raise ValueError(
--                f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
--            )
-+
-         self.q_handle = None
--        self.q_tensors = None
--        self.bits = bits
--        self.maxq = 2**self.bits - 1
--        self.infeatures = qweight.shape[0] // self.bits * 32
--        self.outfeatures = qweight.shape[1]
-+        self.q_tensors = weight
-+        self.extra_tensors = _ExtraTensors()
-+
-+        if isinstance(weight, Exl2Weight):
-+            self.infeatures = weight.q_invperm.shape[0]
-+            self.outfeatures = weight.q_weight.shape[1]
-+        elif isinstance(weight, GPTQWeight):
-+            if weight.bits != 4:
-+                raise ValueError(
-+                    f"Exllamav2 kernel supports only bits=4, requested bits={weight.bits}. Something is wrong in the model initialization."
-+                )
-+
-+            self.infeatures = weight.qweight.shape[0] // weight.bits * 32
-+            self.outfeatures = weight.qweight.shape[1]
-+
-         self.padding = -self.outfeatures % 32
-         self.outfeatures = self.outfeatures + self.padding
- 
--        self.device = qweight.device
--        self.qweight = qweight
--        self.qzeros = qzeros
--        self.scales = scales
--        self.g_idx = g_idx
-+        self.device = weight.device
-         self.bias = bias if bias is not None else None
--        self.group_size = groupsize
- 
--        global FIXED_BYTES, LAYERS
--        FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
-+        global LAYERS
-         LAYERS.append(self)
- 
-     def post_init(self, temp_dq):
--        assert self.qweight.device.type == "cuda"
--        assert self.qweight.device.index is not None
--        self.q_tensors = {
--            "qweight": self.qweight,
--            "qzeros": self.qzeros,
--            "scales": self.scales,
--            "g_idx": self.g_idx,
--        }
-+        device = self.q_tensors.device
-+        assert device.type == "cuda"
-+        assert device.index is not None
-         temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
- 
-         # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us,
-         # and `Memory access fault by GPU node-2` will EAT you.
-         self.temp_dq = temp_dq
--        self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
-+        self.q_handle = ext_make_q_matrix(self.q_tensors, self.extra_tensors, temp_dq)
- 
-     def forward(self, x, force_cuda=False):
-         output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)