Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ on:
env:
PRIMUS_TURBO_COMMIT: 5233748e9c5c5795a6484ab31ece47c442d29ec2 # feat(mxfp4): refactor gemm mxfp4 and mxfp8. fuse transpose, hadamard transform and quantization. (#195)
ROCSHMEM_COMMIT: 17ff985c026f9f97f85068647e863ab541dd5645 # Update version to 3.2.0 for 7.2.0 rocm release (#351) (#355)
UCCL_COMMIT: 5afb4117893c58cc0c8557d9286336141a301053 # [EP]: fix fp8 error of internode_ll on amd gfx950 arch. (#710)
BASE_IMAGE: docker.io/rocm/primus:v26.1
MAXTEXT_BASE_IMAGE: docker.io/rocm/jax-training:maxtext-v25.9

Expand Down Expand Up @@ -101,6 +102,7 @@ jobs:
--build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \
--build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} \
--build-arg PRIMUS_TURBO_FRAMEWORK=PYTORCH \
--build-arg UCCL_COMMIT=${UCCL_COMMIT} \
$GITHUB_WORKSPACE/.github/workflows/docker
end_time=$(date +%s)
elapsed=$((end_time - start_time))
Expand Down Expand Up @@ -139,6 +141,7 @@ jobs:
--build-arg BASE_IMAGE=${MAXTEXT_BASE_IMAGE} \
--build-arg PRIMUS_TURBO_COMMIT=${PRIMUS_TURBO_COMMIT} \
--build-arg PRIMUS_TURBO_FRAMEWORK=JAX \
--build-arg UCCL_COMMIT=${UCCL_COMMIT} \
--build-arg ROCSHMEM_COMMIT=${ROCSHMEM_COMMIT} .
end_time=$(date +%s)
elapsed=$((end_time - start_time))
Expand Down
19 changes: 17 additions & 2 deletions .github/workflows/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ FROM ${BASE_IMAGE}
ARG PRIMUS_TURBO_COMMIT
ARG PRIMUS_TURBO_FRAMEWORK
ARG ROCSHMEM_COMMIT

ARG UCCL_COMMIT
# Non-interactive APT
ENV DEBIAN_FRONTEND=noninteractive

# ---------------------------------------------------------------------------
# Install build dependencies
# ---------------------------------------------------------------------------
RUN apt-get update && \
apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl&& \
apt-get install -y rdma-core libibverbs-dev libnuma-dev numactl libgoogle-glog-dev && \
apt-get install -y --reinstall binutils

RUN rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -59,6 +59,21 @@ RUN cd /opt && \

RUN rm -rf /opt/Primus-Turbo

# ---------------------------------------------------------------------------
# Install UCCL-EP (skip for JAX framework)
#
# When PRIMUS_TURBO_FRAMEWORK is anything other than "JAX", this single RUN:
#   1. clones the uccl repository into /opt/uccl and checks out ${UCCL_COMMIT};
#   2. builds the `ep` extension with TORCH_CUDA_ARCH_LIST="gfx942,gfx950";
#   3. copies the shared objects found under ep/build into ./uccl;
#   4. pip-installs the repository root, then ep/deep_ep_wrapper, both with
#      --no-build-isolation;
#   5. deletes /opt/uccl within the same RUN instruction.
#
# NOTE(review): if UCCL_COMMIT is empty, `git checkout` receives no ref and the
#   build presumably proceeds on the default branch - confirm every caller
#   passes --build-arg UCCL_COMMIT.
# NOTE(review): `ep/build/**/*.so` only recurses when the shell has globstar
#   enabled; with a plain POSIX sh it presumably matches exactly one directory
#   level - confirm against the actual build output layout.
# ---------------------------------------------------------------------------
RUN if [ "$PRIMUS_TURBO_FRAMEWORK" != "JAX" ]; then \
cd /opt && \
git clone https://github.com/uccl-project/uccl.git && \
cd uccl && \
git checkout ${UCCL_COMMIT} && \
cd ep && TORCH_CUDA_ARCH_LIST="gfx942,gfx950" python3 setup.py build && cd .. && \
cp ep/build/**/*.so uccl && \
pip3 install --no-build-isolation . -v && \
cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v && \
rm -rf /opt/uccl; \
fi

# Set the default working directory
WORKDIR /opt

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10}
# 5 - Sync-free MoE (stage 1)
# 6 - CPU NUMA binding helper
# 7 - Manual GC helper
# 8 - Using UCCL-EP
# MoE_Features=(0 7)
# MoE_Features=(3 7)
# MoE_Features=(3 4 7)
# MoE_Features=(3 4 5 7)
MoE_Features=(3 4 5 6 7)
MoE_Features=(3 4 5 6 7 8)

FEATURE_ARGS=()
PRIMUS_TURBO_ENABLED="False"
Expand Down Expand Up @@ -133,6 +134,9 @@ for feature in "${MoE_Features[@]}"; do
FEATURE_ARGS+=("--manual_gc" "True")
FEATURE_ARGS+=("--manual_gc_interval" "1")
;;
8)
export USING_UEP=1
;;
*) ;;
esac
done
Expand Down
6 changes: 5 additions & 1 deletion examples/moe_package/run_deepseek_v2_pretrain_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,13 @@ export TRAIN_ITERS=${TRAIN_ITERS:-10}
# 5 - Sync-free MoE (stage 1/2)
# 6 - CPU NUMA binding helper
# 7 - Manual GC helper
# 8 - Using UCCL-EP
if [ -z "${MoE_Features}" ]; then
# MoE_Features=(0 7)
# MoE_Features=(3 7)
# MoE_Features=(3 4 7)
# MoE_Features=(3 4 6 7)
MoE_Features=(3 4 5 6 7)
MoE_Features=(3 4 5 6 7 8)
else
# Convert string to array
# shellcheck disable=SC2128
Expand Down Expand Up @@ -136,6 +137,9 @@ for feature in "${MoE_Features[@]}"; do
FEATURE_ARGS+=("--manual_gc" "True")
FEATURE_ARGS+=("--manual_gc_interval" "1")
;;
8)
export USING_UEP=1
;;
*) ;;
esac
done
Expand Down
5 changes: 5 additions & 0 deletions examples/run_local_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ done < <(env | grep "^HIPBLASLT_")
# Forward every environment variable whose name starts with one of these
# prefixes into the container: each match contributes a "--env NAME" pair to
# ENV_ARGS, so the value is inherited from the calling environment.
# Prefix order (PRIMUS_, UCCL_, NCCL_) is preserved.
for passthrough_prefix in PRIMUS_ UCCL_ NCCL_; do
    while IFS='=' read -r name _; do
        ENV_ARGS+=("--env" "$name")
    done < <(env | grep "^${passthrough_prefix}")
done
Expand Down Expand Up @@ -184,6 +187,8 @@ docker_podman_proxy run --rm \
--env MAXTEXT_PATH \
--env BACKEND_PATH \
--env REBUILD_PRIMUS_TURBO \
--env REBUILD_UCCL \
--env USING_UEP \
"${ENV_ARGS[@]}" \
--ipc=host --network=host \
--device=/dev/kfd --device=/dev/dri \
Expand Down
76 changes: 76 additions & 0 deletions examples/run_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,82 @@ else
LOG_INFO "Skip Primus Turbo rebuild. REBUILD_PRIMUS_TURBO=$REBUILD_PRIMUS_TURBO"
fi

# ----------------- Rebuild UCCL -----------------
export REBUILD_UCCL=${REBUILD_UCCL:-0}
if [ "$REBUILD_UCCL" == "1" ]; then
LOG_INFO "Rebuilding UCCL from source..."
apt update && apt install -y rdma-core libibverbs-dev libnuma-dev libgoogle-glog-dev
mkdir -p "/workspace/"
cd "/workspace" || exit

# Clean up old directory if exists to avoid git clone conflicts
if [ -d "uccl" ]; then
LOG_INFO "Removing existing uccl directory..."
rm -rf uccl
fi

git clone https://github.com/uccl-project/uccl.git
cd uccl || exit
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rebuild path clones UCCL from main without pinning to a commit/tag, which makes runs non-reproducible and can break unexpectedly over time. Consider supporting a UCCL_COMMIT/UCCL_REF env var (similar to CI) and checking out that ref when provided.

Suggested change
cd uccl || exit
cd uccl || exit
# Optionally pin UCCL to a specific ref/commit for reproducible rebuilds.
# If UCCL_REF is not set, UCCL_COMMIT can be used as a fallback.
UCCL_CHECKOUT_REF="${UCCL_REF:-$UCCL_COMMIT}"
if [ -n "${UCCL_CHECKOUT_REF}" ]; then
LOG_INFO "Checking out UCCL ref: ${UCCL_CHECKOUT_REF}"
git checkout "${UCCL_CHECKOUT_REF}" || exit
fi

Copilot uses AI. Check for mistakes.
cd ep && python3 setup.py build && cd ..
cp ep/build/**/*.so uccl
pip3 install --no-build-isolation .
cd ep/deep_ep_wrapper && pip3 install --no-build-isolation . -v
cd "${PRIMUS_PATH}" || exit
LOG_INFO "Rebuilding UCCL from source done."
else
LOG_INFO "Skip UCCL rebuild. REBUILD_UCCL=$REBUILD_UCCL"
fi

# ----------------- Using UCCL-EP -----------------
# Select the MoE dispatch/combine backend.
#
#   USING_UEP=1 -> require the `uccl` and `deep_ep` Python packages, select the
#                  DEEP_EP backend and derive UCCL network settings from the
#                  matching NCCL_* variables.
#   otherwise   -> select the TURBO backend.
#
# Env read:    USING_UEP, ENABLE_NUMA_BINDING, USING_AINIC, REBUILD_BNXT,
#              NCCL_IB_GID_INDEX, NCCL_IB_HCA, NCCL_SOCKET_IFNAME, UCCL_*
# Env written: PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND, UCCL_*
# Exits with status 1 when USING_UEP=1 and a required package is missing.
if [ "${USING_UEP:-}" == "1" ]; then
    LOG_INFO "USING_UEP is enabled, checking required packages..."

    # Probe each package separately so the error names what is actually missing.
    uep_missing=""
    for uep_pkg in uccl deep_ep; do
        if ! python3 -m pip show "${uep_pkg}" &>/dev/null; then
            uep_missing="${uep_missing:+${uep_missing}, }${uep_pkg}"
        fi
    done
    if [ -n "${uep_missing}" ]; then
        LOG_ERROR "Required package(s) not installed: ${uep_missing}. Please use pre-installed primus image or set REBUILD_UCCL=1."
        exit 1
    fi
    LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)"
    LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)"

    if [ "${ENABLE_NUMA_BINDING:-}" != "1" ]; then
        LOG_INFO "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly."
    fi

    export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP
    LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP"

    # network settings for UCCL: an explicit UCCL_* value wins, otherwise fall
    # back to the NCCL_* counterpart. When neither is provided nothing is
    # exported, so an empty string is never introduced into the environment.
    for uep_suffix in IB_GID_INDEX IB_HCA SOCKET_IFNAME; do
        uep_uccl_var="UCCL_${uep_suffix}"
        uep_nccl_var="NCCL_${uep_suffix}"
        uep_value="${!uep_uccl_var:-${!uep_nccl_var:-}}"
        if [ -n "${uep_value}" ]; then
            export "${uep_uccl_var}=${uep_value}"
        fi
    done

    # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2
    uep_inflight_bytes=""
    if [ "${USING_AINIC:-}" == "1" ]; then
        uep_inflight_bytes=4194304 # 4MB
    elif [ "${REBUILD_BNXT:-}" == "1" ]; then # Broadcom Thor-2
        # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT`
        uep_inflight_bytes=1572864 # 1.5MB
    fi
    if [ -n "${uep_inflight_bytes}" ]; then
        # Values already present in the environment take precedence.
        export UCCL_IB_MAX_INFLIGHT_NORMAL="${UCCL_IB_MAX_INFLIGHT_NORMAL:-1}"
        export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY="${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1}"
        export UCCL_IB_MAX_INFLIGHT_BYTES="${UCCL_IB_MAX_INFLIGHT_BYTES:-${uep_inflight_bytes}}"
    fi

    LOG_INFO "==========UCCL Network Settings=========="
    LOG_INFO "UCCL_IB_GID_INDEX: ${UCCL_IB_GID_INDEX:-}"
    LOG_INFO "UCCL_IB_HCA: ${UCCL_IB_HCA:-}"
    LOG_INFO "UCCL_SOCKET_IFNAME: ${UCCL_SOCKET_IFNAME:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: ${UCCL_IB_MAX_INFLIGHT_NORMAL:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: ${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: ${UCCL_IB_MAX_INFLIGHT_BYTES:-}"
    LOG_INFO ""

    # Do not leak scratch variables into the rest of the script.
    unset uep_missing uep_pkg uep_suffix uep_uccl_var uep_nccl_var uep_value uep_inflight_bytes
else
    export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO
    LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO"
fi

# nvte debug envs
export NVTE_DEBUG=0 # 0, 1
export NVTE_DEBUG_LEVEL=0 # 0, 1, 2
Expand Down
6 changes: 5 additions & 1 deletion primus/modules/trainer/megatron/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,5 +536,9 @@ def validate_args_on_rocm(args):
assert (
args.moe_router_dtype == "fp32"
), "DeepEP only supports float32 probs, please set `moe_router_dtype=fp32`"
if args.expert_model_parallel_size >= 16:
if (
args.expert_model_parallel_size >= 16
and os.getenv("PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND", "DEEP_EP") == "TURBO"
):
Comment on lines +539 to +542
Copy link

Copilot AI Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This backend-dependent validation is easy to regress because it depends on an environment variable and specific expert_model_parallel_size thresholds. Add a unit/integration test that exercises validate_args_on_rocm for both PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO and =DEEP_EP (and unset), verifying the assertion behavior around turbo_deepep_num_cu when expert_model_parallel_size >= 16.

Copilot uses AI. Check for mistakes.
# Turbo DeepEP is not supported for CUs > 32 when using internode dispatch/combine.
assert args.turbo_deepep_num_cu <= 32, "Set `turbo_deepep_num_cu<=32` when using ep_size >= 16."
4 changes: 1 addition & 3 deletions runner/helpers/hooks/04_rebuild_uccl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,9 @@ fi
UCCL_DIR="/tmp/uccl"
UCCL_BUILD_DIR="${UCCL_BUILD_DIR:-/tmp/uccl_${HOSTNAME:-$(hostname)}}"
UCCL_REF="${UCCL_REF:-}"
GPU_ARCHS="${GPU_ARCHS:-gfx942;gfx950}"

LOG_INFO_RANK0 "[hook system] REBUILD_UCCL=1 → Building uccl in /tmp "
LOG_INFO_RANK0 " Build directory : ${UCCL_BUILD_DIR}"
LOG_INFO_RANK0 " GPU_ARCHS : ${GPU_ARCHS}"

if [ -d "$UCCL_DIR" ]; then
LOG_INFO_RANK0 "[hook system] Found existed uccl in /tmp, remove it"
Expand All @@ -47,7 +45,7 @@ if [[ -n "$UCCL_REF" ]]; then
fi

LOG_INFO_RANK0 "[hook system] Building uccl ep"
Copy link

Copilot AI Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing PYTORCH_ROCM_ARCH/GPU arch injection can break building ROCm extensions in environments without a visible GPU (common in container builds/CI), or produce binaries targeting the wrong arch. Consider restoring a configurable arch env (e.g., PYTORCH_ROCM_ARCH="${GPU_ARCHS}") with a sane default, or otherwise ensuring the build is deterministic across nodes.

Suggested change
LOG_INFO_RANK0 "[hook system] Building uccl ep"
LOG_INFO_RANK0 "[hook system] Building uccl ep"
# Ensure deterministic ROCm arch selection for extension build
if [[ -z "${PYTORCH_ROCM_ARCH:-}" ]]; then
if [[ -n "${GPU_ARCHS:-}" ]]; then
export PYTORCH_ROCM_ARCH="${GPU_ARCHS}"
else
# Fallback to a sane default ROCm arch if none is provided
export PYTORCH_ROCM_ARCH="gfx90a"
fi
fi
LOG_INFO_RANK0 "[hook system] Using PYTORCH_ROCM_ARCH='${PYTORCH_ROCM_ARCH}' for uccl ep build"

Copilot uses AI. Check for mistakes.
cd ep && PYTORCH_ROCM_ARCH="${GPU_ARCHS}" python3 setup.py build && cd ..
cd ep && python3 setup.py build && cd ..

LOG_INFO_RANK0 "[hook system] Building uccl ep done"

Expand Down
62 changes: 62 additions & 0 deletions runner/helpers/hooks/05_using_uep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
#
# System hook: enable using UEP settings.
#
# Trigger:
# export USING_UEP=1
#
###############################################################################


# Select the MoE dispatch/combine backend.
#
#   USING_UEP=1 -> require the `uccl` and `deep_ep` Python packages, select the
#                  DEEP_EP backend and derive UCCL network settings from the
#                  matching NCCL_* variables.
#   otherwise   -> select the TURBO backend.
#
# Env read:    USING_UEP, ENABLE_NUMA_BINDING, USING_AINIC, REBUILD_BNXT,
#              NCCL_IB_GID_INDEX, NCCL_IB_HCA, NCCL_SOCKET_IFNAME, UCCL_*
# Env written: PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND, UCCL_*
# Exits with status 1 when USING_UEP=1 and a required package is missing.
if [ "${USING_UEP:-}" == "1" ]; then
    LOG_INFO "USING_UEP is enabled, checking required packages..."

    # Probe each package separately so the error names what is actually missing.
    uep_missing=""
    for uep_pkg in uccl deep_ep; do
        if ! python3 -m pip show "${uep_pkg}" &>/dev/null; then
            uep_missing="${uep_missing:+${uep_missing}, }${uep_pkg}"
        fi
    done
    if [ -n "${uep_missing}" ]; then
        LOG_ERROR "Required package(s) not installed: ${uep_missing}. Please use pre-installed primus image or set REBUILD_UCCL=1."
        exit 1
    fi
    LOG_INFO "uccl package is installed: $(python3 -m pip show uccl | grep Version)"
    LOG_INFO "deep_ep package is installed: $(python3 -m pip show deep_ep | grep Version)"

    if [ "${ENABLE_NUMA_BINDING:-}" != "1" ]; then
        LOG_WARN "ENABLE_NUMA_BINDING is not enabled! Please set ENABLE_NUMA_BINDING=1 to avoid dataloader worker exited unexpectedly."
    fi

    export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=DEEP_EP
    LOG_INFO "PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to DEEP_EP"

    # network settings for UCCL: an explicit UCCL_* value wins, otherwise fall
    # back to the NCCL_* counterpart. When neither is provided nothing is
    # exported, so an empty string is never introduced into the environment.
    for uep_suffix in IB_GID_INDEX IB_HCA SOCKET_IFNAME; do
        uep_uccl_var="UCCL_${uep_suffix}"
        uep_nccl_var="NCCL_${uep_suffix}"
        uep_value="${!uep_uccl_var:-${!uep_nccl_var:-}}"
        if [ -n "${uep_value}" ]; then
            export "${uep_uccl_var}=${uep_value}"
        fi
    done

    # set low latency and normal inflight and bytes to avoid hang on AMD Pollara AI NIC and Broadcom Thor-2
    uep_inflight_bytes=""
    if [ "${USING_AINIC:-}" == "1" ]; then
        uep_inflight_bytes=4194304 # 4MB
    elif [ "${REBUILD_BNXT:-}" == "1" ]; then # Broadcom Thor-2
        # FIXME(zhuang12): use `USING_BNXT` for Broadcom Thor-2 maybe better than `REBUILD_BNXT`
        uep_inflight_bytes=1572864 # 1.5MB
    fi
    if [ -n "${uep_inflight_bytes}" ]; then
        # Values already present in the environment take precedence.
        export UCCL_IB_MAX_INFLIGHT_NORMAL="${UCCL_IB_MAX_INFLIGHT_NORMAL:-1}"
        export UCCL_IB_MAX_INFLIGHT_LOW_LATENCY="${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-1}"
        export UCCL_IB_MAX_INFLIGHT_BYTES="${UCCL_IB_MAX_INFLIGHT_BYTES:-${uep_inflight_bytes}}"
    fi

    LOG_INFO "==========UCCL Network Settings=========="
    LOG_INFO "UCCL_IB_GID_INDEX: ${UCCL_IB_GID_INDEX:-}"
    LOG_INFO "UCCL_IB_HCA: ${UCCL_IB_HCA:-}"
    LOG_INFO "UCCL_SOCKET_IFNAME: ${UCCL_SOCKET_IFNAME:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_NORMAL: ${UCCL_IB_MAX_INFLIGHT_NORMAL:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_LOW_LATENCY: ${UCCL_IB_MAX_INFLIGHT_LOW_LATENCY:-}"
    LOG_INFO "UCCL_IB_MAX_INFLIGHT_BYTES: ${UCCL_IB_MAX_INFLIGHT_BYTES:-}"
    LOG_INFO ""

    # Do not leak scratch variables into whatever runs after this hook.
    unset uep_missing uep_pkg uep_suffix uep_uccl_var uep_nccl_var uep_value uep_inflight_bytes
else
    export PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND=TURBO
    LOG_INFO "USING_UEP is disabled. PRIMUS_TURBO_MOE_DISPATCH_COMBINE_BACKEND set to TURBO"
fi
34 changes: 34 additions & 0 deletions tests/trainer/test_megatron_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,40 @@ def test_turbo_deepep(self):
],
)

def test_deepseekv2_lite_uep(self):
    """Launch a 3-iteration, 4-layer DeepSeek-V2-Lite pretrain with USING_UEP=1.

    REBUILD_UCCL=1 is set alongside USING_UEP=1 in the environment override.
    """
    uep_env = {"USING_UEP": "1", "REBUILD_UCCL": "1"}

    # (flag, value) pairs, flattened below in exactly this order.
    overrides = (
        ("--num_layers", "4"),
        ("--train_iters", "3"),
        ("--micro_batch_size", "1"),
        ("--global_batch_size", "8"),
        ("--moe_layer_freq", "1"),
        ("--expert_model_parallel_size", "8"),
        ("--use_turbo_deepep", "1"),
        ("--enable_primus_turbo", "1"),
        ("--moe_router_dtype", "fp32"),
        ("--moe_shared_expert_overlap", "0"),
        ("--moe_use_legacy_grouped_gemm", "1"),
        ("--turbo_sync_free_moe_stage", "3"),
    )
    cli_args = [token for pair in overrides for token in pair]

    run_script(
        self.__class__.__name__,
        "deepseekv2_lite_uep",
        exp_path="examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml",
        env_override=uep_env,
        extra_args=cli_args,
    )


class TestMegatronTrainerDeterministic(PrimusUT):
def __init__(self, *args, **kwargs):
Expand Down