fix: use TORCH_NCCL_AVOID_RECORD_STREAMS=1

2025-08-01 04:40:17 +00:00 · 2024-01-09 17:59:16 +01:00 · 2024-01-09 17:59:16 +01:00 · 65db02f192
commit 65db02f192
parent 91d7267534
1 changed file with 1 addition and 0 deletions
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -477,6 +477,7 @@ fn shard_manager(
    envs.push(("MASTER_ADDR".into(), master_addr.into()));
    envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
    envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
+    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));

    // CUDA memory fraction
    envs.push((