autoparallel/graph_passes/debug_helpers.py (14 additions, 5 deletions)
@@ -125,11 +125,20 @@ def custom_runtime_estimation(node: torch.fx.Node, override_size=None):
     target = node.target
     if target == torch.ops._c10d_functional.wait_tensor.default:
         return 0
-    # TODO: figure out mesh without reading from global scope
-    mesh_topo = MeshTopoInfo.build_from_mesh(mesh)
-    groups_name = tuple(g.group_name for g in mesh.get_all_groups())
+    # Resolve group_name to a mesh and dimension. Collectives may
+    # use per-dimension groups or the flattened mesh group created
+    # by mesh._flatten() in _apply_placement_common.
     group_name = get_group_name(node)
-    mesh_dim = groups_name.index(group_name)
+    groups_name = tuple(g.group_name for g in mesh.get_all_groups())
+    if group_name in groups_name:
+        cost_mesh = mesh
+        mesh_dim = groups_name.index(group_name)
+    elif mesh.ndim > 1:
+        cost_mesh = mesh._flatten()
+        mesh_dim = 0
+    else:
+        return 0
+    mesh_topo = MeshTopoInfo.build_from_mesh(cost_mesh)
     t = node.args[0].meta["val"]  # type: ignore[union-attr]
     comm_bytes_gb = t.numel() * t.itemsize / 2**30
     if override_size is not None:
@@ -138,7 +147,7 @@ def custom_runtime_estimation(node: torch.fx.Node, override_size=None):
         torch.ops._c10d_functional.all_gather_into_tensor.default,
         torch.ops._c10d_functional.all_gather_into_tensor_out.default,
     }:
-        comm_bytes_gb *= mesh.shape[mesh_dim]
+        comm_bytes_gb *= cost_mesh.shape[mesh_dim]
         return allgather_cost(comm_bytes_gb, mesh_topo, mesh_dim)
     elif target == torch.ops._c10d_functional.reduce_scatter_tensor.default:
         return reduce_scatter_cost(comm_bytes_gb, mesh_topo, mesh_dim)
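
For context, here is a minimal, self-contained sketch of the group-name resolution that the first hunk introduces, factored into a standalone helper. The helper name resolve_cost_mesh and the example import path are assumptions made for illustration; get_group_name, DeviceMesh.get_all_groups(), ProcessGroup.group_name, and the private mesh._flatten() are used exactly as the diff itself uses them.

# Sketch only: map a collective's process-group name to the (mesh, dim)
# pair used for cost modeling, mirroring the new branch above.
# resolve_cost_mesh is a hypothetical helper name, not part of the PR.
from typing import Optional, Tuple

from torch.distributed.device_mesh import DeviceMesh


def resolve_cost_mesh(mesh: DeviceMesh, group_name: str) -> Optional[Tuple[DeviceMesh, int]]:
    """Return the mesh and mesh dimension whose process group matches group_name."""
    groups_name = tuple(g.group_name for g in mesh.get_all_groups())
    if group_name in groups_name:
        # The collective runs on one of the mesh's per-dimension groups.
        return mesh, groups_name.index(group_name)
    if mesh.ndim > 1:
        # Fall back to the flattened group that spans the whole mesh
        # (created by mesh._flatten() in _apply_placement_common); the
        # flattened mesh is 1-D, so the collective's dimension is 0.
        return mesh._flatten(), 0
    # Unknown group on a 1-D mesh: the estimator above gives it zero cost.
    return None

With cost_mesh resolved this way, the second hunk scales all_gather traffic by cost_mesh.shape[mesh_dim] rather than mesh.shape[mesh_dim], since the number of ranks taking part in the collective is the size of the resolved mesh along that dimension, which may be the flattened mesh's full size rather than any single dimension of the original mesh.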