diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 4e230205a..115858ab3 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -477,6 +477,8 @@ fn shard_manager(
     envs.push(("MASTER_ADDR".into(), master_addr.into()));
     envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
     envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
+    // NOTE(review): presumably read by PyTorch's NCCL backend in the shard process - confirm against the PyTorch docs.
+    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));
 
     // CUDA memory fraction
     envs.push((