Add ROCm 6.0 targets to CI, the Docker build, and the Python requirements.

* Adds the GitHub Actions workflows "IntegrationTest (ROCm)" and
  "UnitTest (ROCm)", which run on [ self-hosted, AMD ] runners for pushes
  and pull requests against main.
* docker/build.sh accepts a rocm6.0 target, and docker/base-dev-x.dockerfile
  selects python/requirements_<TARGET without its minor version>.txt.
* python/requirements_cu11.txt and requirements_cu12.txt are renamed to
  requirements_cuda11.txt and requirements_cuda12.txt, and
  python/requirements_rocm6.txt is added; docs/quickstart.md and
  test/deploy/setup.sh are updated to the new file names.

diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml
new file mode 100644
index 000000000..d3a2c6247
--- /dev/null
+++ b/.github/workflows/integration-test-rocm-backup.yml
@@ -0,0 +1,67 @@
+name: "IntegrationTest (ROCm)"
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  IntegrationTest:
+    runs-on: [ self-hosted, AMD ]
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        rocm: [ rocm6.0 ]
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }}
+      cancel-in-progress: true
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build
+        run: |
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Release ..
+          make -j
+
+      - name: Run mscclpp AllGather test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+
+      - name: Run mscclpp SendRecv test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+
+      - name: Run mscclpp AllReduce test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
+
+      - name: Run mscclpp AllToAll test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+
+      # - name: Check collective primitives performance
+      #   run: |
+      #     set -e
+      #     python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml
new file mode 100644
index 000000000..06ffdbf9c
--- /dev/null
+++ b/.github/workflows/ut-rocm-backup.yml
@@ -0,0 +1,51 @@
+name: "UnitTest (ROCm)"
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  UnitTest:
+    runs-on: [ self-hosted, AMD ]
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm: [ rocm6.0 ]
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }}
+      cancel-in-progress: true
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build
+        run: |
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Release ..
+          make -j
+        working-directory: ${{ github.workspace }}
+
+      - name: UnitTests
+        run: |
+          ./build/test/unit_tests
+
+      - name: MpUnitTests
+        run: |
+          set -e
+          mpirun -np 2 ./build/test/mp_unit_tests --gtest_filter=-*Ib*
+          mpirun -np 4 ./build/test/mp_unit_tests --gtest_filter=-*Ib*
+          mpirun -np 8 ./build/test/mp_unit_tests --gtest_filter=-*Ib*
+
+      # - name: PyTests
+      #   run: |
+      #     set -e
+      #     cd build && make pylib-copy
+      #     mpirun -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
index 87d3f5c0d..d7f2166f1 100644
--- a/docker/base-dev-x.dockerfile
+++ b/docker/base-dev-x.dockerfile
@@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
 ADD . /tmp/mscclpp
 WORKDIR /tmp/mscclpp
 ARG TARGET="cuda12.1"
-RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
-    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
+RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt
 
 # Set PATH
 RUN echo PATH="${PATH}" > /etc/environment
diff --git a/docker/build.sh b/docker/build.sh
index 5b14bcc4c..98829c741 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -7,6 +7,7 @@ baseImageTable=(
     ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
     ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
+    ["rocm6.0"]="rocm/dev-ubuntu-20.04:6.0-complete"
 )
 
 declare -A extraLdPathTable
@@ -14,13 +15,14 @@ extraLdPathTable=(
     ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
     ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
     ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
+    ["rocm6.0"]="/opt/rocm/lib"
 )
 
 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 TARGET=${1}
 
 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|rocm6.0]"
 }
 
 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
diff --git a/docs/quickstart.md b/docs/quickstart.md
index f2b12d187..82b6fcff0 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -87,8 +87,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
 [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
 
 ```bash
-# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
-$ python3 -m pip install -r ./python/requirements_cu12.txt
+# Choose `requirements_*.txt` according to your CUDA/ROCm version.
+$ python3 -m pip install -r ./python/requirements_cuda12.txt
 $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
 ```
 
diff --git a/python/requirements_cu11.txt b/python/requirements_cuda11.txt
similarity index 100%
rename from python/requirements_cu11.txt
rename to python/requirements_cuda11.txt
diff --git a/python/requirements_cu12.txt b/python/requirements_cuda12.txt
similarity index 100%
rename from python/requirements_cu12.txt
rename to python/requirements_cuda12.txt
diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt
new file mode 100644
index 000000000..6536d3a26
--- /dev/null
+++ b/python/requirements_rocm6.txt
@@ -0,0 +1,6 @@
+mpi4py
+prettytable
+netifaces
+pytest
+numpy
+matplotlib
diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh
index 1d0641773..12022d9a8 100644
--- a/test/deploy/setup.sh
+++ b/test/deploy/setup.sh
@@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
 done
 
 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
-  pip3 install -r /root/mscclpp/python/requirements_cu11.txt
+  pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
 else
-  pip3 install -r /root/mscclpp/python/requirements_cu12.txt
+  pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
 fi
 
 cd /root/mscclpp && pip3 install .