diff --git a/primus/tools/preflight/gpu/gpu_topology.py b/primus/tools/preflight/gpu/gpu_topology.py index 168121fa1..e86198764 100644 --- a/primus/tools/preflight/gpu/gpu_topology.py +++ b/primus/tools/preflight/gpu/gpu_topology.py @@ -7,6 +7,7 @@ from __future__ import annotations import os +from collections import Counter from typing import Any, Dict, List, Optional from .gpu_probe import probe_gpus @@ -109,7 +110,10 @@ def run_gpu_standard_checks(*, force_topology: bool = False) -> Dict[str, Any]: findings.append(Finding("warn", "NUMA mapping unavailable (amd-smi not found); skipped", {})) else: nodes = [x.get("numa_node") for x in numa.get("gpus", []) if x.get("numa_node") is not None] - imbalance = len(set(nodes)) > 1 if nodes else False + imbalance = False + if nodes: + counts = Counter(nodes).values() + imbalance = len(set(counts)) > 1 findings.append( Finding("info", "GPU↔NUMA mapping", {"mapping": numa.get("gpus", []), "imbalance": imbalance}) )