From 65db02f192a3753f59cc690bd71f445ced55e209 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Tue, 9 Jan 2024 17:59:16 +0100
Subject: [PATCH] fix: use TORCH_NCCL_AVOID_RECORD_STREAMS=1

---
 launcher/src/main.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 4e230205a..115858ab3 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -477,6 +477,7 @@ fn shard_manager(
     envs.push(("MASTER_ADDR".into(), master_addr.into()));
     envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
     envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
+    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));
 
     // CUDA memory fraction
     envs.push((