From 020578fd99b6f38239deed23a10b16490994e36f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 9 Oct 2025 06:42:42 +0000 Subject: [PATCH 1/8] Initial plan From 16ce1d8de8c148c125d627238e98bf190c74641e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 9 Oct 2025 06:49:29 +0000 Subject: [PATCH 2/8] Add preamble function to reset locks in examples 10 and 11 Co-authored-by: neoblizz <9790745+neoblizz@users.noreply.github.com> --- .../10_gemm_all_scatter_wg_specialization/benchmark.py | 7 ++++++- .../11_gemm_all_scatter_producer_consumer/benchmark.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/10_gemm_all_scatter_wg_specialization/benchmark.py b/examples/10_gemm_all_scatter_wg_specialization/benchmark.py index bb49bacb..515acd40 100755 --- a/examples/10_gemm_all_scatter_wg_specialization/benchmark.py +++ b/examples/10_gemm_all_scatter_wg_specialization/benchmark.py @@ -135,6 +135,11 @@ def _worker(local_rank: int, world_size: int, init_url: str, args: dict): # Allocate Timestamps timestamps = Timestamps(num_tiles=total_tiles) + def preamble(): + shmem.barrier() + locks.zero_() + shmem.barrier() + def run_experiment(): nonlocal local_C nonlocal global_C @@ -222,7 +227,7 @@ def run_experiment(): matmul.set_debug(False) shmem.info("Benchmarking...") perf = lambda ms: 2 * args["M"] * args["N"] * args["K"] * 1e-12 / (ms * 1e-3) - triton_ms = iris.do_bench(run_experiment, shmem.barrier) + triton_ms = iris.do_bench(run_experiment, shmem.barrier, preamble) triton_tflops = perf(triton_ms) algo_string = "all_scatter" shmem.info( diff --git a/examples/11_gemm_all_scatter_producer_consumer/benchmark.py b/examples/11_gemm_all_scatter_producer_consumer/benchmark.py index 8059c26f..264d5f79 100755 --- a/examples/11_gemm_all_scatter_producer_consumer/benchmark.py +++ b/examples/11_gemm_all_scatter_producer_consumer/benchmark.py @@ -144,6 +144,11 @@ def _worker(local_rank: int, world_size: int, init_url: str, args: dict): # Allocate Timestamps timestamps = Timestamps(num_tiles=total_tiles) + def preamble(): + shmem.barrier() + locks.zero_() + shmem.barrier() + def run_experiment(): nonlocal C nonlocal kernel_timing @@ -253,7 +258,7 @@ def run_experiment(): matmul.set_debug(False) shmem.info("Benchmarking...") perf = lambda ms: 2 * args["M"] * args["N"] * args["K"] * 1e-12 / (ms * 1e-3) - triton_ms = iris.do_bench(run_experiment, shmem.barrier) + triton_ms = iris.do_bench(run_experiment, shmem.barrier, preamble) triton_tflops = perf(triton_ms) algo_string = "all_scatter" shmem.info( From 8d15de7602af8c0f51beb25c3621420d13aa742f Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Thu, 9 Oct 2025 15:28:07 -0700 Subject: [PATCH 3/8] Use proper synchronization instead of loads and stores (#215) --- examples/10_gemm_all_scatter_wg_specialization/benchmark.py | 2 +- .../gemm_all_scatter_wg_specialization.py | 5 ++--- examples/11_gemm_all_scatter_producer_consumer/benchmark.py | 2 +- .../gemm_all_scatter_producer_consumer.py | 5 ++--- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/10_gemm_all_scatter_wg_specialization/benchmark.py b/examples/10_gemm_all_scatter_wg_specialization/benchmark.py index af44fd8a..1b8417dc 100755 --- a/examples/10_gemm_all_scatter_wg_specialization/benchmark.py +++ b/examples/10_gemm_all_scatter_wg_specialization/benchmark.py @@ -136,7 +136,7 @@ def 
_worker(local_rank: int, world_size: int, init_url: str, args: dict): total_blocks_N = triton.cdiv(args["n"], args["BLK_N"]) total_tiles = total_blocks_M * total_blocks_N - locks = shmem.zeros((total_tiles,), device="cuda", dtype=torch.int8) + locks = shmem.zeros((total_tiles,), device="cuda", dtype=torch.int32) bias = None diff --git a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py index ac2d2e35..cf6b1ca5 100644 --- a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py +++ b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py @@ -142,8 +142,7 @@ def persistent_gemm_all_scatter_wg_specialization( tl.atomic_max(mm_end_timestamp_ptr + tile_id, timestamp) tl.store(c_global + global_offset, c, mask=sub_mask, cache_modifier=".wt") - tl.debug_barrier() - tl.store(locks + tile_id, 1, cache_modifier=".wt") + tl.atomic_xchg(locks + tile_id, 1, sem="release", scope="gpu") else: # pid >= GEMM_SMS COMM_SMS = NUM_SMS - GEMM_SMS @@ -165,7 +164,7 @@ def persistent_gemm_all_scatter_wg_specialization( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. - while tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) != 1: + while tl.atomic_xchg(locks + tile_id, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): diff --git a/examples/11_gemm_all_scatter_producer_consumer/benchmark.py b/examples/11_gemm_all_scatter_producer_consumer/benchmark.py index 6b92c728..561a37dc 100755 --- a/examples/11_gemm_all_scatter_producer_consumer/benchmark.py +++ b/examples/11_gemm_all_scatter_producer_consumer/benchmark.py @@ -136,7 +136,7 @@ def _worker(local_rank: int, world_size: int, init_url: str, args: dict): total_blocks_N = triton.cdiv(args["n"], args["BLK_N"]) total_tiles = total_blocks_M * total_blocks_N - locks = shmem.zeros((total_tiles,), device="cuda", dtype=torch.int8) + locks = shmem.zeros((total_tiles,), device="cuda", dtype=torch.int32) bias = None diff --git a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py index a8311943..87d7f675 100644 --- a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py +++ b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py @@ -133,8 +133,7 @@ def persistent_gemm( tl.atomic_max(mm_end_timestamp_ptr + tile_id, timestamp) tl.store(C + global_offset, c, mask=sub_mask, cache_modifier=".wt") - tl.debug_barrier() - tl.store(locks + tile_id, 1, cache_modifier=".wt") + tl.atomic_xchg(locks + tile_id, 1, sem="release", scope="gpu") @triton.jit() @@ -185,7 +184,7 @@ def persistent_all_scatter( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. 
- while tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) != 1: + while tl.atomic_xchg(locks + tile_id, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): From a5ec30c5edaa67c3013ccad2f632a1fdf20a6df9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Oct 2025 01:01:26 +0000 Subject: [PATCH 4/8] Fix atomic operations: use atomic_cas for non-destructive polling Co-authored-by: neoblizz <9790745+neoblizz@users.noreply.github.com> --- .../gemm_all_scatter_wg_specialization.py | 2 +- .../gemm_all_scatter_producer_consumer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py index cf6b1ca5..643768f7 100644 --- a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py +++ b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py @@ -164,7 +164,7 @@ def persistent_gemm_all_scatter_wg_specialization( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. - while tl.atomic_xchg(locks + tile_id, 0, sem="acquire", scope="gpu") != 1: + while tl.atomic_cas(locks + tile_id, 0, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): diff --git a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py index 87d7f675..51bd0bd8 100644 --- a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py +++ b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py @@ -184,7 +184,7 @@ def persistent_all_scatter( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. - while tl.atomic_xchg(locks + tile_id, 0, sem="acquire", scope="gpu") != 1: + while tl.atomic_cas(locks + tile_id, 0, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): From f32e69a5254ffd00bb4058a935b034fae8b6e05b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Oct 2025 03:32:55 +0000 Subject: [PATCH 5/8] Fix atomic_cas polling: use (1, 0) to atomically consume the flag Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .../gemm_all_scatter_wg_specialization.py | 3 ++- .../gemm_all_scatter_producer_consumer.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py index 643768f7..5652a4ec 100644 --- a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py +++ b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py @@ -164,7 +164,8 @@ def persistent_gemm_all_scatter_wg_specialization( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. 
- while tl.atomic_cas(locks + tile_id, 0, 0, sem="acquire", scope="gpu") != 1: + # Poll until lock is set to 1, then atomically reset it to 0 + while tl.atomic_cas(locks + tile_id, 1, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): diff --git a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py index 51bd0bd8..b101c85b 100644 --- a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py +++ b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py @@ -184,7 +184,8 @@ def persistent_all_scatter( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. - while tl.atomic_cas(locks + tile_id, 0, 0, sem="acquire", scope="gpu") != 1: + # Poll until lock is set to 1, then atomically reset it to 0 + while tl.atomic_cas(locks + tile_id, 1, 0, sem="acquire", scope="gpu") != 1: pass for remote_rank in range(world_size): From 7479b4be49b4e03dd4b1c8635d64de916b42f316 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Oct 2025 04:30:15 +0000 Subject: [PATCH 6/8] Use volatile load spin loop followed by atomic_cas with dependency Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .../gemm_all_scatter_wg_specialization.py | 9 ++++++--- .../gemm_all_scatter_producer_consumer.py | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py index 5652a4ec..aac520da 100644 --- a/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py +++ b/examples/10_gemm_all_scatter_wg_specialization/gemm_all_scatter_wg_specialization.py @@ -164,9 +164,12 @@ def persistent_gemm_all_scatter_wg_specialization( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. - # Poll until lock is set to 1, then atomically reset it to 0 - while tl.atomic_cas(locks + tile_id, 1, 0, sem="acquire", scope="gpu") != 1: - pass + # Spin on volatile load until flag is non-zero (cheap) + flag_val = tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) + while flag_val == 0: + flag_val = tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) + # Use atomic_cas with dependency on loaded value to prevent reordering + tl.atomic_cas(locks + tile_id, flag_val, 0, sem="acquire", scope="gpu") for remote_rank in range(world_size): if remote_rank != cur_rank: diff --git a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py index b101c85b..3620f061 100644 --- a/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py +++ b/examples/11_gemm_all_scatter_producer_consumer/gemm_all_scatter_producer_consumer.py @@ -184,9 +184,12 @@ def persistent_all_scatter( global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global # End: masks/offset calculations. 
- # Poll until lock is set to 1, then atomically reset it to 0 - while tl.atomic_cas(locks + tile_id, 1, 0, sem="acquire", scope="gpu") != 1: - pass + # Spin on volatile load until flag is non-zero (cheap) + flag_val = tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) + while flag_val == 0: + flag_val = tl.load(locks + tile_id, cache_modifier=".cv", volatile=True) + # Use atomic_cas with dependency on loaded value to prevent reordering + tl.atomic_cas(locks + tile_id, flag_val, 0, sem="acquire", scope="gpu") for remote_rank in range(world_size): if remote_rank != cur_rank: From c66903c3006d485020a5c7b51626b13e34554a67 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Oct 2025 05:29:44 +0000 Subject: [PATCH 7/8] Add port cleanup to CI tests to prevent lingering connections Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .github/scripts/run_perf_benchmark.sh | 15 ++++++++++++++- .github/scripts/run_tests.sh | 12 ++++++++++++ .../workflows/iris-external-validation-test.yml | 5 ++++- tests/run_tests_distributed.py | 5 +++++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_perf_benchmark.sh b/.github/scripts/run_perf_benchmark.sh index 7be18d84..ba3b6a26 100755 --- a/.github/scripts/run_perf_benchmark.sh +++ b/.github/scripts/run_perf_benchmark.sh @@ -10,6 +10,20 @@ BENCHMARK_ARGS="$@" # Create overlay image in workspace (will be auto-cleaned by GitHub Actions) OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img" +# Cleanup function +cleanup() { + echo "Cleaning up processes and ports..." + # Kill any lingering Python processes from this benchmark + pkill -9 -f "benchmark.py" 2>/dev/null || true + # Give the system time to release ports + sleep 1 + # Cleanup overlay image + rm -f "${OVERLAY}" 2>/dev/null || true +} + +# Set trap to ensure cleanup happens on exit (success or failure) +trap cleanup EXIT INT TERM + echo "::group::Creating overlay image" apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" echo "::endgroup::" @@ -60,4 +74,3 @@ fi echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >$TFLOPS_THRESHOLD)" echo "::endgroup::" - diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh index fd7b9388..041fcd26 100755 --- a/.github/scripts/run_tests.sh +++ b/.github/scripts/run_tests.sh @@ -13,6 +13,18 @@ if [ -z "$NUM_RANKS" ]; then exit 1 fi +# Function to cleanup ports and processes on exit +cleanup() { + echo "Cleaning up ports and processes..." + # Kill any lingering Python processes from this test session + pkill -9 -f "run_tests_distributed.py" 2>/dev/null || true + # Give the system time to release ports + sleep 1 +} + +# Set trap to ensure cleanup happens on exit (success or failure) +trap cleanup EXIT INT TERM + # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' 
for test_file in tests/examples/test_*.py; do diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 57c904d3..69a16303 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -67,7 +67,10 @@ jobs: set -e pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py + python test_iris_distributed.py || { echo 'Test failed, cleaning up...'; pkill -9 -f python 2>/dev/null || true; exit 1; } + # Ensure cleanup of any lingering processes + pkill -9 -f python 2>/dev/null || true + sleep 1 " echo "::endgroup::" diff --git a/tests/run_tests_distributed.py b/tests/run_tests_distributed.py index e3254556..bc267fb7 100755 --- a/tests/run_tests_distributed.py +++ b/tests/run_tests_distributed.py @@ -110,6 +110,11 @@ def main(): except Exception: # Any other unhandled exception = failure sys.exit(1) + finally: + # Ensure cleanup: wait briefly to allow port to be released + import time + + time.sleep(0.5) if __name__ == "__main__": From 7d7179b72b1970a0a2ac59f631e2bd7165836fd9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Oct 2025 05:34:14 +0000 Subject: [PATCH 8/8] Revert "Add port cleanup to CI tests to prevent lingering connections" Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .github/scripts/run_perf_benchmark.sh | 15 +-------------- .github/scripts/run_tests.sh | 12 ------------ .../workflows/iris-external-validation-test.yml | 5 +---- tests/run_tests_distributed.py | 5 ----- 4 files changed, 2 insertions(+), 35 deletions(-) diff --git a/.github/scripts/run_perf_benchmark.sh b/.github/scripts/run_perf_benchmark.sh index ba3b6a26..7be18d84 100755 --- a/.github/scripts/run_perf_benchmark.sh +++ b/.github/scripts/run_perf_benchmark.sh @@ -10,20 +10,6 @@ BENCHMARK_ARGS="$@" # Create overlay image in workspace (will be auto-cleaned by GitHub Actions) OVERLAY="iris_overlay_perf_${EXAMPLE_PATH//\//_}.img" -# Cleanup function -cleanup() { - echo "Cleaning up processes and ports..." - # Kill any lingering Python processes from this benchmark - pkill -9 -f "benchmark.py" 2>/dev/null || true - # Give the system time to release ports - sleep 1 - # Cleanup overlay image - rm -f "${OVERLAY}" 2>/dev/null || true -} - -# Set trap to ensure cleanup happens on exit (success or failure) -trap cleanup EXIT INT TERM - echo "::group::Creating overlay image" apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY}" echo "::endgroup::" @@ -74,3 +60,4 @@ fi echo "✅ Performance test passed! TFLOPs: $TFLOPS (threshold: >$TFLOPS_THRESHOLD)" echo "::endgroup::" + diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh index 041fcd26..fd7b9388 100755 --- a/.github/scripts/run_tests.sh +++ b/.github/scripts/run_tests.sh @@ -13,18 +13,6 @@ if [ -z "$NUM_RANKS" ]; then exit 1 fi -# Function to cleanup ports and processes on exit -cleanup() { - echo "Cleaning up ports and processes..." 
- # Kill any lingering Python processes from this test session - pkill -9 -f "run_tests_distributed.py" 2>/dev/null || true - # Give the system time to release ports - sleep 1 -} - -# Set trap to ensure cleanup happens on exit (success or failure) -trap cleanup EXIT INT TERM - # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' for test_file in tests/examples/test_*.py; do diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 69a16303..57c904d3 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -67,10 +67,7 @@ jobs: set -e pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py || { echo 'Test failed, cleaning up...'; pkill -9 -f python 2>/dev/null || true; exit 1; } - # Ensure cleanup of any lingering processes - pkill -9 -f python 2>/dev/null || true - sleep 1 + python test_iris_distributed.py " echo "::endgroup::" diff --git a/tests/run_tests_distributed.py b/tests/run_tests_distributed.py index bc267fb7..e3254556 100755 --- a/tests/run_tests_distributed.py +++ b/tests/run_tests_distributed.py @@ -110,11 +110,6 @@ def main(): except Exception: # Any other unhandled exception = failure sys.exit(1) - finally: - # Ensure cleanup: wait briefly to allow port to be released - import time - - time.sleep(0.5) if __name__ == "__main__":
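
For reference, below is a minimal standalone sketch (not part of any patch above) of the handshake that patches 3-6 converge on: the producer publishes its tile and then sets a per-tile int32 flag with a release-semantics atomic_xchg, while the consumer spins on a cheap volatile load and then issues an acquire-semantics atomic_cas that consumes the flag before touching the data. The kernel and tensor names (produce, consume, data, flag) are illustrative assumptions rather than Iris APIs, and the sem/scope keyword arguments assume a Triton release that supports them.

    # Minimal standalone sketch of the release/acquire flag handshake from
    # patches 3-6. Names (produce, consume, data, flag) are illustrative only
    # and are NOT part of the Iris API or of the diffs above.
    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def produce(data_ptr, flag_ptr, value):
        # Write the payload, then publish it by setting the flag with release
        # semantics so the payload store cannot be reordered after the flag.
        tl.store(data_ptr, value, cache_modifier=".wt")
        tl.atomic_xchg(flag_ptr, 1, sem="release", scope="gpu")


    @triton.jit
    def consume(data_ptr, flag_ptr, out_ptr):
        # Spin on a volatile load until the flag becomes non-zero (cheap poll),
        # mirroring the loop introduced in patch 6.
        flag_val = tl.load(flag_ptr, cache_modifier=".cv", volatile=True)
        while flag_val == 0:
            flag_val = tl.load(flag_ptr, cache_modifier=".cv", volatile=True)
        # Acquire atomic on the observed value resets the flag to 0 and orders
        # the payload load below after the synchronization.
        tl.atomic_cas(flag_ptr, flag_val, 0, sem="acquire", scope="gpu")
        tl.store(out_ptr, tl.load(data_ptr, cache_modifier=".cv", volatile=True))


    if __name__ == "__main__":
        # Single-GPU demo; in the examples the flag buffer lives in symmetric
        # (shmem) memory, is int32 as of patch 3, and is zeroed in a preamble
        # between iris.do_bench iterations as of patch 2.
        data = torch.zeros(1, device="cuda", dtype=torch.float32)
        flag = torch.zeros(1, device="cuda", dtype=torch.int32)
        out = torch.zeros(1, device="cuda", dtype=torch.float32)
        produce[(1,)](data, flag, 42.0)
        consume[(1,)](data, flag, out)
        print(out)  # expected: tensor([42.], device='cuda:0')

Per the comments introduced in patch 6, the volatile-load spin keeps the polling inexpensive, while the single acquire atomic_cas on the loaded value both consumes the flag and prevents the subsequent data loads from being hoisted above the wait.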