From 4316c63c34d2b282a6f11929e850efd8847b264b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:00:28 +0000 Subject: [PATCH 1/7] Add ROCm CI --- .../integration-test-rocm-backup.yml | 62 +++++++++++++++++++ .github/workflows/ut-rocm-backup.yml | 46 ++++++++++++++ docker/base-dev-x.dockerfile | 4 +- docker/build.sh | 4 +- docs/quickstart.md | 4 +- ...ments_cu11.txt => requirements_cuda11.txt} | 0 ...ments_cu12.txt => requirements_cuda12.txt} | 0 python/requirements_rocm6.txt | 6 ++ test/deploy/setup.sh | 4 +- 9 files changed, 123 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/integration-test-rocm-backup.yml create mode 100644 .github/workflows/ut-rocm-backup.yml rename python/{requirements_cu11.txt => requirements_cuda11.txt} (100%) rename python/{requirements_cu12.txt => requirements_cuda12.txt} (100%) create mode 100644 python/requirements_rocm6.txt diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml new file mode 100644 index 000000000..016432ab6 --- /dev/null +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -0,0 +1,62 @@ +name: IntegrationTest + +on: workflow_dispatch + +jobs: + IntegrationTest: + runs-on: [ self-hosted, AMD ] + defaults: + run: + shell: bash + strategy: + matrix: + rocm: [ rocm6.0 ] + + container: + image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" + options: --privileged --ipc=host --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --group-add video + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + make -j + + - name: Run mscclpp AllGather test + run: | + set -e + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + + - name: Run mscclpp SendRecv test + run: | + set -e + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + + - name: Run mscclpp AllReduce test + run: | + set -e + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + + - name: Run mscclpp AllToAll test + run: | + set -e + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + + - name: Check collective primitives performance + run: | + set -e + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml new file mode 100644 index 000000000..7656b65c1 --- /dev/null +++ b/.github/workflows/ut-rocm-backup.yml @@ -0,0 +1,46 @@ +name: "UnitTest (ROCm)" + +on: workflow_dispatch + +jobs: + UnitTest: + runs-on: [ self-hosted, AMD ] + defaults: + run: + shell: bash + timeout-minutes: 30 + strategy: + matrix: + rocm: [ rocm6.0 ] + + container: + image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" + options: --privileged --ipc=host --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --group-add video + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + make -j + working-directory: ${{ github.workspace }} + + - name: UnitTests + run: | + ./build/test/unit_tests + + - name: MpUnitTests + run: | + set -e + mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests + + # - name: PyTests + # run: | + # set -e + # cd build && make pylib-copy + # mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 87d3f5c0d..d7f2166f1 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" ADD . /tmp/mscclpp WORKDIR /tmp/mscclpp ARG TARGET="cuda12.1" -RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \ - python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt +RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt # Set PATH RUN echo PATH="${PATH}" > /etc/environment diff --git a/docker/build.sh b/docker/build.sh index 5b14bcc4c..98829c741 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -7,6 +7,7 @@ baseImageTable=( ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" + ["rocm6.0"]="rocm/dev-ubuntu-20.04:6.0-complete" ) declare -A extraLdPathTable @@ -14,13 +15,14 @@ extraLdPathTable=( ["cuda11.8"]="/usr/local/cuda-11.8/lib64" ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" + ["rocm6.0"]="/opt/rocm/lib" ) GHCR="ghcr.io/microsoft/mscclpp/mscclpp" TARGET=${1} print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]" + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|rocm6.0]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then diff --git a/docs/quickstart.md b/docs/quickstart.md index f2b12d187..82b6fcff0 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -87,8 +87,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10. [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. ```bash -# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version. -$ python3 -m pip install -r ./python/requirements_cu12.txt +# Choose `requirements_*.txt` according to your CUDA/ROCm version. +$ python3 -m pip install -r ./python/requirements_cuda12.txt $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py ``` diff --git a/python/requirements_cu11.txt b/python/requirements_cuda11.txt similarity index 100% rename from python/requirements_cu11.txt rename to python/requirements_cuda11.txt diff --git a/python/requirements_cu12.txt b/python/requirements_cuda12.txt similarity index 100% rename from python/requirements_cu12.txt rename to python/requirements_cuda12.txt diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt new file mode 100644 index 000000000..6536d3a26 --- /dev/null +++ b/python/requirements_rocm6.txt @@ -0,0 +1,6 @@ +mpi4py +prettytable +netifaces +pytest +numpy +matplotlib diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 1d0641773..12022d9a8 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do done if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cu11.txt + pip3 install -r /root/mscclpp/python/requirements_cuda11.txt else - pip3 install -r /root/mscclpp/python/requirements_cu12.txt + pip3 install -r /root/mscclpp/python/requirements_cuda12.txt fi cd /root/mscclpp && pip3 install . From 8edbaf732fe4f26e892ea258173c1c269130b6f2 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:03:29 +0000 Subject: [PATCH 2/7] change triggers --- .github/workflows/integration-test-rocm-backup.yml | 8 +++++++- .github/workflows/ut-rocm-backup.yml | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index 016432ab6..d16ee3ec1 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -1,6 +1,12 @@ name: IntegrationTest -on: workflow_dispatch +on: + push: + branches: + - main + pull_request: + branches: + - main jobs: IntegrationTest: diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml index 7656b65c1..95e0fd246 100644 --- a/.github/workflows/ut-rocm-backup.yml +++ b/.github/workflows/ut-rocm-backup.yml @@ -1,6 +1,12 @@ name: "UnitTest (ROCm)" -on: workflow_dispatch +on: + push: + branches: + - main + pull_request: + branches: + - main jobs: UnitTest: From efd2a5a9c2ba264b07547a3634502da181a1055a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:24:33 +0000 Subject: [PATCH 3/7] no container --- .github/workflows/integration-test-rocm-backup.yml | 4 ---- .github/workflows/ut-rocm-backup.yml | 4 ---- 2 files changed, 8 deletions(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index d16ee3ec1..81baa1454 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -18,10 +18,6 @@ jobs: matrix: rocm: [ rocm6.0 ] - container: - image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" - options: --privileged --ipc=host --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --group-add video - steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml index 95e0fd246..072195117 100644 --- a/.github/workflows/ut-rocm-backup.yml +++ b/.github/workflows/ut-rocm-backup.yml @@ -19,10 +19,6 @@ jobs: matrix: rocm: [ rocm6.0 ] - container: - image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" - options: --privileged --ipc=host --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --group-add video - steps: - name: Checkout uses: actions/checkout@v4 From 1c17268323f3af554ec983fd731576c96269cef9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:28:37 +0000 Subject: [PATCH 4/7] updates --- .../integration-test-rocm-backup.yml | 28 +++++++++---------- .github/workflows/ut-rocm-backup.yml | 8 +++--- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index 81baa1454..333e5db8e 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -31,32 +31,32 @@ jobs: - name: Run mscclpp AllGather test run: | set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - name: Run mscclpp SendRecv test run: | set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - name: Run mscclpp AllReduce test run: | set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - name: Run mscclpp AllToAll test run: | set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - name: Check collective primitives performance run: | diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml index 072195117..9b8f44a2b 100644 --- a/.github/workflows/ut-rocm-backup.yml +++ b/.github/workflows/ut-rocm-backup.yml @@ -37,12 +37,12 @@ jobs: - name: MpUnitTests run: | set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests + mpirun -np 2 ./build/test/mp_unit_tests + mpirun -np 4 ./build/test/mp_unit_tests + mpirun -np 8 ./build/test/mp_unit_tests # - name: PyTests # run: | # set -e # cd build && make pylib-copy - # mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x + # mpirun -np 8 $(which pytest) ../python/test/test_mscclpp.py -x From 1504bef57affb54c2ae1e6a329ae0105d39f993c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:43:54 +0000 Subject: [PATCH 5/7] updates --- .../integration-test-rocm-backup.yml | 28 +++++++++---------- .github/workflows/ut-rocm-backup.yml | 6 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index 333e5db8e..d2ed011f4 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -31,32 +31,32 @@ jobs: - name: Run mscclpp AllGather test run: | set -e - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - name: Run mscclpp SendRecv test run: | set -e - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - name: Run mscclpp AllReduce test run: | set -e - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - name: Run mscclpp AllToAll test run: | set -e - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - name: Check collective primitives performance run: | diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml index 9b8f44a2b..907de3b06 100644 --- a/.github/workflows/ut-rocm-backup.yml +++ b/.github/workflows/ut-rocm-backup.yml @@ -37,9 +37,9 @@ jobs: - name: MpUnitTests run: | set -e - mpirun -np 2 ./build/test/mp_unit_tests - mpirun -np 4 ./build/test/mp_unit_tests - mpirun -np 8 ./build/test/mp_unit_tests + mpirun -np 2 ./build/test/mp_unit_tests --gtest_filter=-*Ib* + mpirun -np 4 ./build/test/mp_unit_tests --gtest_filter=-*Ib* + mpirun -np 8 ./build/test/mp_unit_tests --gtest_filter=-*Ib* # - name: PyTests # run: | From 94881544418bce0fbcaeaaa513c78a117e6702d3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:44:15 +0000 Subject: [PATCH 6/7] updates --- .github/workflows/integration-test-rocm-backup.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index d2ed011f4..4f18bd028 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -58,7 +58,7 @@ jobs: mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl + # - name: Check collective primitives performance + # run: | + # set -e + # python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl From 7d64f33c559f005d69a4018aa48a5d90cfe7d4a9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 27 Dec 2023 04:56:15 +0000 Subject: [PATCH 7/7] updates --- .github/workflows/integration-test-rocm-backup.yml | 5 ++++- .github/workflows/ut-rocm-backup.yml | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml index 4f18bd028..d3a2c6247 100644 --- a/.github/workflows/integration-test-rocm-backup.yml +++ b/.github/workflows/integration-test-rocm-backup.yml @@ -1,4 +1,4 @@ -name: IntegrationTest +name: "IntegrationTest (ROCm)" on: push: @@ -17,6 +17,9 @@ jobs: strategy: matrix: rocm: [ rocm6.0 ] + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }} + cancel-in-progress: true steps: - name: Checkout diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml index 907de3b06..06ffdbf9c 100644 --- a/.github/workflows/ut-rocm-backup.yml +++ b/.github/workflows/ut-rocm-backup.yml @@ -18,6 +18,9 @@ jobs: strategy: matrix: rocm: [ rocm6.0 ] + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }} + cancel-in-progress: true steps: - name: Checkout