fix: use TORCH_NCCL_AVOID_RECORD_STREAMS=1

2025-09-11 12:24:53 +00:00 · 2024-01-09 17:59:16 +01:00 · 2024-01-09 17:59:16 +01:00 · 1d929a243a
commit 1d929a243a
parent 532146338b
1 changed file with 1 addition and 0 deletions
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -489,6 +489,7 @@ fn shard_manager(
    envs.push(("MASTER_ADDR".into(), master_addr.into()));
    envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
    envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
+    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));

    // CUDA memory fraction
    envs.push((