6 changes: 6 additions & 0 deletions docker/Dockerfile.rocm7.gfx950
@@ -245,6 +245,12 @@ RUN pip install click==8.2.1
########################################
########################################

########################################
# Install iproute2 (ip route command)
########################################
RUN apt update && apt install iproute2 -y
########################################
########################################

WORKDIR /app

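iproute2 is added for the `ip` tool that the launch scripts' network-interface detection relies on. A minimal sketch of that kind of detection, for illustration only (the repo's actual `get_ip_interface.sh` helper may differ):

```bash
# Print the NIC that carries the default route, e.g. "eth0"
# (requires iproute2; illustrative, not necessarily what get_ip_interface.sh does).
ip route show default | awk '/^default/ {print $5; exit}'
```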
7 changes: 7 additions & 0 deletions examples/train_infer_mismatch_helper/mi355-readme.md
@@ -43,6 +43,7 @@ Download the models and datasets:
huggingface-cli download Qwen/Qwen3-4B --local-dir models/Qwen/Qwen3-4B
huggingface-cli download Qwen/Qwen3-30B-A3B --local-dir models/Qwen/Qwen3-30B-A3B
huggingface-cli download moonshotai/Moonlight-16B-A3B --local-dir models/moonshotai/Moonlight-16B-A3B
huggingface-cli download Qwen/Qwen3-235B-A22B --local-dir models/Qwen/Qwen3-235B-A22B

# train/eval data download
# dapo
@@ -81,6 +82,9 @@ PYTHONPATH=${MEGATRON_LM_PATH} python tools/convert_hf_to_torch_dist.py ${MODEL_
--hf-checkpoint models/moonshotai/Moonlight-16B-A3B \
--save models/moonshotai/Moonlight-16B-A3B_torch_dist \
--trust-remote-code

# convert hf checkpoint to torch dist for qwen3-235b-a22b (slurm launcher)
./examples/train_infer_mismatch_helper/qwen3_235b-a22b/convert/run_slurm_convert_model.sh
```
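
The slurm conversion launcher reads its cluster settings from environment variables (`NNODES`, `MASTER_PORT`, `CPUS_PER_TASK`, `LOG_DIR`), so a multi-node run can be tuned without editing the script. An example with placeholder values for your cluster:

```bash
# Placeholder values; the launcher defaults are NNODES=1, MASTER_PORT=12345, CPUS_PER_TASK=96
NNODES=4 MASTER_PORT=23456 CPUS_PER_TASK=64 \
  ./examples/train_infer_mismatch_helper/qwen3_235b-a22b/convert/run_slurm_convert_model.sh
```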


@@ -102,6 +106,9 @@ bash examples/train_infer_mismatch_helper/mi355-run-moonlight-16b-gsm8k-mis.sh
# dapo17k train + aime2024 eval
bash examples/train_infer_mismatch_helper/mi355-run-moonlight-16b-a3b-mis.sh

# multi-node qwen3-235b-a22b
NNODES=4 ./examples/train_infer_mismatch_helper/qwen3_235b-a22b/train-gsm8k-mis/run_slurm_train.sh

```

^ Make sure to double-check the data/model paths and GPU memory settings before launching. Currently the scripts use
137 changes: 137 additions & 0 deletions examples/train_infer_mismatch_helper/qwen3_235b-a22b/convert/run_convert_model.sh
@@ -0,0 +1,137 @@
#!/bin/bash

SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
SLIME_PATH=$(realpath "${SCRIPT_DIR}/../../../..")

HOSTNAME=$(hostname)

LOG_INFO() {
if [ "$*" = "" ]; then
echo ""
else
echo "[NODE-$NODE_RANK($HOSTNAME)] [INFO] $*"
fi
}

LOG_INFO_RANK0() {
if [ "$NODE_RANK" -eq 0 ]; then
if [ "$*" = "" ]; then
echo ""
else
echo "[NODE-$NODE_RANK($HOSTNAME)] [INFO] $*"
fi
fi
}

LOG_ERROR() {
echo "[NODE-$NODE_RANK($HOSTNAME)] [ERROR] $*";
}

export MASTER_ADDR=${MASTER_ADDR:-localhost}
export MASTER_PORT=${MASTER_PORT:-1234}
export NNODES=${NNODES:-1}
export NODE_RANK=${NODE_RANK:-0}
export GPUS_PER_NODE=${GPUS_PER_NODE:-8}

LOG_INFO_RANK0 "==========Training cluster info=========="
LOG_INFO_RANK0 "MASTER_ADDR: $MASTER_ADDR"
LOG_INFO_RANK0 "MASTER_PORT: $MASTER_PORT"
LOG_INFO_RANK0 "NNODES: $NNODES"
LOG_INFO_RANK0 "NODE_RANK: $NODE_RANK"
LOG_INFO_RANK0 "GPUS_PER_NODE: $GPUS_PER_NODE"
LOG_INFO_RANK0 ""

# ----------------- NCCL and Network Settings -----------------
# VERSION, WARN, INFO, DEBUG, TRACE
export NCCL_DEBUG=${NCCL_DEBUG:-}

# Disable NCCL internal checks to reduce overhead
export NCCL_CHECKS_DISABLE=1

# Using tensor model parallelism or context parallelism requires
# setting the environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1
export CUDA_DEVICE_MAX_CONNECTIONS=1


export NCCL_IB_GID_INDEX=3

# Disable cross NIC communication for NCCL
export NCCL_CROSS_NIC=0

# Dynamically get InfiniBand Host Channel Adapter index for NCCL if not set
if [ -z "${NCCL_IB_HCA}" ]; then
NCCL_IB_HCA=$(bash "${SLIME_PATH}/examples/train_infer_mismatch_helper/tools/get_nccl_ib_hca.sh")
fi
export NCCL_IB_HCA

# Dynamically get network interface IP address for socket communication if not set
if [ -z "${IP_INTERFACE}" ]; then
IP_INTERFACE=$(bash "${SLIME_PATH}/examples/train_infer_mismatch_helper/tools/get_ip_interface.sh")
fi
export IP_INTERFACE

# Set network interfaces for NCCL and Gloo, fallback to detected IP_INTERFACE
export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$IP_INTERFACE}
export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$IP_INTERFACE}
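
# If auto-detection picks the wrong device or interface, override these before launching,
# e.g. (illustrative values only; use the HCA/NIC names present on your nodes):
#   export NCCL_IB_HCA=mlx5_0,mlx5_1
#   export NCCL_SOCKET_IFNAME=eth0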

LOG_INFO_RANK0 "==========NCCL and Network Settings=========="
LOG_INFO_RANK0 "NCCL_DEBUG: $NCCL_DEBUG"
LOG_INFO_RANK0 "NCCL_CHECKS_DISABLE: $NCCL_CHECKS_DISABLE"
LOG_INFO_RANK0 "CUDA_DEVICE_MAX_CONNECTIONS: $CUDA_DEVICE_MAX_CONNECTIONS"
LOG_INFO_RANK0 "NCCL_IB_GID_INDEX: $NCCL_IB_GID_INDEX"
LOG_INFO_RANK0 "NCCL_CROSS_NIC: $NCCL_CROSS_NIC"
LOG_INFO "NCCL_IB_HCA: $NCCL_IB_HCA"
LOG_INFO "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"
LOG_INFO "GLOO_SOCKET_IFNAME: $GLOO_SOCKET_IFNAME"
LOG_INFO ""

# install slime
echo "RANK-${NODE_RANK}, Installing slime..."
cd $SLIME_PATH && pip install -e . 2>&1 > /dev/null
echo "RANK-${NODE_RANK}, Installing slime done..."

# disable torch.dist patch in megatron
echo "RANK-${NODE_RANK}, Disabling torch.dist patch in megatron..."
# TODO(wenx)
# cd $SLIME_PATH/bak && ./patch.sh && cd ..
echo "RANK-${NODE_RANK}, Disabling torch.dist patch in megatron done..."


DISTRIBUTED_ARGS=(
--nproc_per_node "${GPUS_PER_NODE}"
--nnodes "${NNODES}"
--node_rank "${NODE_RANK}"
--master_addr "${MASTER_ADDR}"
--master_port "${MASTER_PORT}"
)

echo "RANK-${NODE_RANK}, Converting Qwen3-235B-A22B-FP8..."

cd $SLIME_PATH
source ${SLIME_PATH}/scripts/models/qwen3-235B-A22B.sh
PYTHONPATH=/app/Megatron-LM torchrun \
${DISTRIBUTED_ARGS[*]} \
${SLIME_PATH}/tools/convert_hf_to_torch_dist.py \
${MODEL_ARGS[@]} \
--no-gradient-accumulation-fusion \
--hf-checkpoint ${SLIME_PATH}/models/Qwen/Qwen3-235B-A22B \
--save ${SLIME_PATH}/models/Qwen/Qwen3-235B-A22B_torch_dist \
--trust-remote-code \
2>&1 | tee ${LOG_DIR}/log_convert_qwen3-235B-A22B-${NODE_RANK}.log
# --tensor-model-parallel-size 4 \

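# Alternative (kept commented out): convert the FP8 checkpoint with explicit
# tensor/pipeline/expert parallelism settings.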
# PYTHONPATH=/app/Megatron-LM torchrun \
# ${DISTRIBUTED_ARGS[*]} \
# ${SLIME_PATH}/tools/convert_hf_to_torch_dist.py \
# ${MODEL_ARGS[@]} \
# --tensor-model-parallel-size 1 \
# --pipeline-model-parallel-size 4 \
# --expert-model-parallel-size 8 \
# --expert-tensor-parallel-size 1 \
# --decoder-last-pipeline-num-layers 22 \
# --no-gradient-accumulation-fusion \
# --hf-checkpoint ${SLIME_PATH}/models/Qwen/Qwen3-235B-A22B-FP8 \
# --save ${SLIME_PATH}/models/Qwen/Qwen3-235B-A22B-FP8_torch_dist \
# --trust-remote-code \
# 2>&1 | tee ${LOG_DIR}/log_convert_qwen3-235B-A22B-FP8-${NODE_RANK}.log
# # --tensor-model-parallel-size 4 \
82 changes: 82 additions & 0 deletions examples/train_infer_mismatch_helper/qwen3_235b-a22b/convert/run_local_convert_model.sh
@@ -0,0 +1,82 @@
#!/bin/bash

SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
SLIME_PATH=$(realpath "${SCRIPT_DIR}/../../../..")

export DOCKER_IMAGE=${DOCKER_IMAGE:-"docker.io/grameshamd/miles-slime-rocm7-mi35x:mla-fix"}
export CLEAN_DOCKER_CONTAINER=${CLEAN_DOCKER_CONTAINER:-1}

# ------------------ Cluster Env Defaults ------------------
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-1234}
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}

if [ "$NODE_RANK" = "0" ]; then
echo "========== Cluster info =========="
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "NNODES: $NNODES"
echo "GPUS_PER_NODE: $GPUS_PER_NODE"
echo ""
fi

VOLUME_ARGS=(-v "$SLIME_PATH":"$SLIME_PATH")

# ------------------ Optional Container Cleanup ------------------
docker_podman_proxy() {
if command -v podman &>/dev/null; then
podman "$@"
elif command -v docker &>/dev/null; then
docker "$@"
else
echo "Neither Docker nor Podman found!" >&2
return 1
fi
}

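# Note: this removes ALL containers on the node (docker/podman ps -aq), not just the
# ones started by this launcher.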
if [[ "${CLEAN_DOCKER_CONTAINER:-0}" == "1" ]]; then
echo "Node-${NODE_RANK}: Cleaning up existing containers..."
CONTAINERS=$(docker_podman_proxy ps -aq)
if [[ -n "$CONTAINERS" ]]; then
for cid in $CONTAINERS; do
docker_podman_proxy rm -f "$cid"
done
echo "Node-${NODE_RANK}: Removed containers: $CONTAINERS"
else
echo "Node-${NODE_RANK}: No containers to remove."
fi
fi

if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then
echo "Node-${NODE_RANK}: Skipping training container launch."
exit 0
else
echo "Node-${NODE_RANK}: Launching training container."
fi

# ------------------ Launch Training Container ------------------
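# Inside the container: remove the stale rocm apt source, install iproute2, patch a
# missing `import warnings` into Megatron-LM's model_parallel_config.py, then run the
# conversion script with any pass-through arguments.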
docker_podman_proxy run --rm \
--env MASTER_ADDR \
--env MASTER_PORT \
--env NNODES \
--env NODE_RANK \
--env GPUS_PER_NODE \
--env LOG_DIR \
"${ENV_ARGS[@]}" \
--ipc=host --network=host \
--device=/dev/kfd --device=/dev/dri \
--cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN \
--security-opt seccomp=unconfined --group-add video \
--privileged --device=/dev/infiniband \
"${VOLUME_ARGS[@]}" \
"$DOCKER_IMAGE" /bin/bash -c "\
echo '[NODE-${NODE_RANK}(${HOSTNAME})]: begin, time=$(date +"%Y.%m.%d %H:%M:%S")' && \
rm /etc/apt/sources.list.d/rocm.list && sudo apt update 2>&1 > /dev/null && \
sudo apt install iproute2 -y 2>&1 > /dev/null && \
sed -i '/import torch/a import warnings' /app/Megatron-LM/megatron/core/model_parallel_config.py && \
cd $SLIME_PATH && \
bash ${SCRIPT_DIR}/run_convert_model.sh \"\$@\" 2>&1 && \
echo '[NODE-${NODE_RANK}(${HOSTNAME})]: end, time=$(date +"%Y.%m.%d %H:%M:%S")'
" bash "${ARGS[@]}"
35 changes: 35 additions & 0 deletions examples/train_infer_mismatch_helper/qwen3_235b-a22b/convert/run_slurm_convert_model.sh
@@ -0,0 +1,35 @@
#!/bin/bash

export NNODES=${NNODES:-1}
export MASTER_PORT=${MASTER_PORT:-12345}

SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
SLIME_PATH=$(realpath "${SCRIPT_DIR}/../../../..")
echo "SLIME_PATH: $SLIME_PATH"

export LOG_DIR=${LOG_DIR:-"${SLIME_PATH}/logs/qwen3_235b-a22b/convert"}
LOG_FILE="${LOG_DIR}/log_slurm_convert_model.txt"
mkdir -p "$LOG_DIR"

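# Launch one task per node; each task resolves MASTER_ADDR from the first host in the
# allocation and derives NODE_RANK from SLURM_PROCID before chaining into the local launcher.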
srun -N "${NNODES}" \
--exclusive \
--export ALL \
--ntasks-per-node=1 \
--cpus-per-task="${CPUS_PER_TASK:-96}" \
bash -c "
readarray -t node_array < <(scontrol show hostnames \"\$SLURM_JOB_NODELIST\")
if [ \"\$SLURM_NODEID\" = \"0\" ]; then
echo \"========== Slurm cluster info ==========\"
echo \"SLURM_NODELIST: \${node_array[*]}\"
echo \"SLURM_NNODES: \${SLURM_NNODES}\"
echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\"
echo \"\"
fi
export SLURM_NODELIST=\${node_array[*]}
export MASTER_ADDR=\${node_array[0]}
export MASTER_PORT=\${MASTER_PORT}
export NNODES=\${SLURM_NNODES}
export NODE_RANK=\${SLURM_PROCID}
export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE}
bash ${SCRIPT_DIR}/run_local_convert_model.sh \"\$@\" 2>&1 | tee ${LOG_FILE}
" bash "$@"