Going back to EXL2 + TP>1:

the issue might have been the scratch buffer's memory being reclaimed by the torch allocator
Nicolas Patry 2024-01-26 10:27:17 +00:00
parent 16958fe312
commit 97d9ff3a71
2 changed files with 7 additions and 6 deletions


@@ -185,6 +185,7 @@ class QuantLinear(nn.Module):
             "g_idx": self.g_idx,
         }
         temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
+        self.temp_dq = temp_dq
         self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
 
     def forward(self, x, force_cuda=False):
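
A minimal sketch of the hypothesis behind this one-line fix (the class and method names below are illustrative, not repo code): it assumes ext_make_q_matrix keeps only the raw device pointer of temp_dq on the C++ side, so unless a Python reference to the tensor survives, PyTorch's caching allocator is free to recycle that storage for unrelated tensors, corrupting the scratch space.

import torch

class QuantLinearSketch:
    def post_init_buggy(self, size):
        temp_dq = torch.empty(size)
        self.raw_ptr = temp_dq.data_ptr()   # the C++ side keeps only this pointer
        # temp_dq is dropped when this method returns; the caching allocator
        # may now hand the same memory to another tensor

    def post_init_fixed(self, size):
        temp_dq = torch.empty(size)
        self.raw_ptr = temp_dq.data_ptr()
        self.temp_dq = temp_dq              # keep the buffer alive (this commit)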


@@ -35,12 +35,12 @@ except Exception:
     HAS_EXLLAMA = False
 CAN_EXLLAMA = major >= 8
 V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
-if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
-    V2 = False
-    log_once(
-        logger.warning,
-        "Disabling exllama v2 and using v1 instead because there are issues when sharding",
-    )
+# if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
+#     V2 = False
+#     log_once(
+#         logger.warning,
+#         "Disabling exllama v2 and using v1 instead because there are issues when sharding",
+#     )
 
 if os.getenv("DISABLE_EXLLAMA") == "True":
     HAS_EXLLAMA = False
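
With the sharding guard commented out, exllama v2 is no longer forced back to v1 when running tensor parallel. A standalone repro of the resulting flag logic (the env values here are assumptions for illustration):

import os

os.environ["EXLLAMA_VERSION"] = "2"
os.environ["WORLD_SIZE"] = "4"          # sharded across 4 GPUs

V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
# The WORLD_SIZE > 1 guard above is now disabled, so V2 stays True under TP.
assert V2 and int(os.getenv("WORLD_SIZE", "1")) > 1
print("exllama v2 remains enabled under tensor parallelism")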