From 97d9ff3a7134d03ecf6a12d30af6b1aa3d955573 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 26 Jan 2024 10:27:17 +0000
Subject: [PATCH] Try bringing back exl2 + TP>1; the issue might have been memory reclaimed by the torch allocator

---
 .../text_generation_server/utils/gptq/exllamav2.py |  1 +
 server/text_generation_server/utils/layers.py      | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/server/text_generation_server/utils/gptq/exllamav2.py b/server/text_generation_server/utils/gptq/exllamav2.py
index a24e834b..25ff508b 100644
--- a/server/text_generation_server/utils/gptq/exllamav2.py
+++ b/server/text_generation_server/utils/gptq/exllamav2.py
@@ -185,6 +185,7 @@ class QuantLinear(nn.Module):
             "g_idx": self.g_idx,
         }
         temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
+        self.temp_dq = temp_dq
         self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
 
     def forward(self, x, force_cuda=False):
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 5a0de0d7..c9393d99 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -35,12 +35,12 @@ except Exception:
     HAS_EXLLAMA = False
 CAN_EXLLAMA = major >= 8
 V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
-    V2 = False
-    log_once(
-        logger.warning,
-        "Disabling exllama v2 and using v1 instead because there are issues when sharding",
-    )
+# if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
+#     V2 = False
+#     log_once(
+#         logger.warning,
+#         "Disabling exllama v2 and using v1 instead because there are issues when sharding",
+#     )
 if os.getenv("DISABLE_EXLLAMA") == "True":
     HAS_EXLLAMA = False
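
A note on the exllamav2.py hunk, since the subject only hints at the mechanism: ext_make_q_matrix presumably stores the raw device pointer of the temp_dq scratch slice on the C++ side rather than holding a Python reference, so while the slice lived only in a local variable, torch's caching allocator was free to recycle its block once __init__ returned, leaving the handle pointing at reused memory. Assigning the slice to self.temp_dq pins the buffer for the lifetime of the layer. The sketch below reproduces the reuse pattern in isolation; extension_style_handle is a hypothetical stand-in (not a TGI or exllamav2 API), and whether the final comparison prints True depends on the allocator's behavior.

    import torch

    # Prefer CUDA when available: the CUDA caching allocator is what the
    # commit subject points at, though CPU malloc often reuses blocks too.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def extension_style_handle(t: torch.Tensor) -> int:
        # Hypothetical stand-in for an extension call that keeps only the
        # raw address of a buffer, not a reference that keeps it alive.
        return t.data_ptr()

    # Record the address of a scratch buffer, then let the tensor die.
    dangling = extension_style_handle(
        torch.empty(1 << 20, dtype=torch.uint8, device=device)
    )

    # With no live Python reference left, the allocator may hand the same
    # block to the next allocation, so the saved address aliases fresh data.
    reuse = torch.empty(1 << 20, dtype=torch.uint8, device=device)
    print(reuse.data_ptr() == dangling)  # frequently True: block recycled

    # The patch sidesteps this by keeping a reference on the module
    # (self.temp_dq = temp_dq) for the lifetime of the QuantLinear layer.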