From 308ca073741e6cb3c655b0398fd7ac6acbfdbefd Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Thu, 5 Jun 2025 09:55:35 +0000
Subject: [PATCH 1/6] update Megatron DeepSpeed

---
 thirdparty/Megatron-DeepSpeed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/Megatron-DeepSpeed b/thirdparty/Megatron-DeepSpeed
index 0d6e379..3e1da1f 160000
--- a/thirdparty/Megatron-DeepSpeed
+++ b/thirdparty/Megatron-DeepSpeed
@@ -1 +1 @@
-Subproject commit 0d6e3793a1fc06eded9764ef15ad12bcc0281101
+Subproject commit 3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26

From 0222d1470e542ae1e047419db38a6f73348e7bd8 Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Thu, 5 Jun 2025 09:56:29 +0000
Subject: [PATCH 2/6] update submodule Megatron-LM

---
 thirdparty/Megatron-LM | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/Megatron-LM b/thirdparty/Megatron-LM
index 77b4bfe..957f348 160000
--- a/thirdparty/Megatron-LM
+++ b/thirdparty/Megatron-LM
@@ -1 +1 @@
-Subproject commit 77b4bfe00ab2634650345fd485be59a9d9c27272
+Subproject commit 957f3488efd505e5d22f5d5bc46eaa187eeb44cb

From 3067d264b060c79e76b0d63e329890d3b2b91e7b Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Thu, 5 Jun 2025 09:59:15 +0000
Subject: [PATCH 3/6] add
 launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml

---
 .../llm/meg-lm-llama2-70b-xlliu.yaml          | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml

diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
new file mode 100644
index 0000000..f9c8295
--- /dev/null
+++ b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
@@ -0,0 +1,101 @@
+apiVersion: kubeflow.org/v1
+kind: PyTorchJob  # 1 Master + 3 Worker replicas that share one pod spec (&job-spec)
+metadata:
+  name: meg-lm-llama2-70b-alan  # NOTE(review): suffix differs from the file name; confirm intended
+  namespace: default
+spec:
+  pytorchReplicaSpecs:
+    Master:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"  # no Istio sidecar in training pods
+          labels:
+            scitix.ai/topo-aware-in-node: "true"
+        spec: &job-spec  # anchored so the Worker template can reuse it unchanged
+          tolerations:  # allow scheduling onto nodes tainted scitix.ai/nodecheck:NoSchedule
+          - key: "scitix.ai/nodecheck"
+            operator: "Exists"
+            effect: "NoSchedule"
+          containers:
+          - args:
+            - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \
+               export SIFLOW_LOG_LEVEL=error && export SIFILE_FS_READ_TYPE=syncio && \
+               pip install /data/zawang/siflowai/siflowai-0.1.0-cp312-cp312-linux_x86_64.whl && \
+               DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \
+               BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \
+               RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \
+               ENABLE_CKPT=1 MOCK_DATA=true \
+               bash /workspace/deep_learning_examples/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh "
+            command:  # effective invocation: /usr/bin/env bash -c "<args[0]>"
+            - /usr/bin/env
+            - bash
+            - -c
+            env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: TZ
+              value: CST-8  # POSIX TZ syntax: zone "CST", 8 hours ahead of UTC
+            image: registry-ap-southeast.scitix.ai/hpc/nemo:25.04
+            imagePullPolicy: Always
+            name: pytorch
+            resources:  # requests are set equal to limits for every resource
+              limits:
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+              requests:
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+            securityContext:
+              capabilities:
+                add:
+                - IPC_LOCK  # NOTE(review): presumably for RDMA memory pinning; confirm
+            volumeMounts:
+            - mountPath: /dev/shm
+              name: dev-shm
+            - mountPath: /workspace/deep_learning_examples
+              name: deep-learning-examples
+            - mountPath: /data/zawang/siflowai  # NOTE(review): "zawang" here vs "wangza" in the hostPath; confirm
+              name: siflowai
+          volumes:
+          - name: deep-learning-examples
+            hostPath:
+              path: /data/wangza/deep_learning_examples
+          - name: siflowai
+            hostPath:
+              path: /data/wangza/siflowai
+          - name: dev-shm
+            hostPath:
+              path: /dev/shm  # node's own /dev/shm: the "rm -rf /dev/shm/*" in args empties it on the host
+          affinity:  # restrict placement to nodes labelled scitix.ai/gpu-type=h100nvlink80
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                - matchExpressions:
+                  - key: scitix.ai/gpu-type
+                    operator: In
+                    values:
+                    - h100nvlink80
+    Worker:
+      replicas: 3
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+          labels:
+            scitix.ai/topo-aware-in-node: "true"
+        spec:
+          <<: *job-spec  # merge key: pulls in every field of the Master pod spec

From 12af80984da6092a850c2b4b193dd84a5481097d Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Thu, 5 Jun 2025 11:45:42 +0000
Subject: [PATCH 4/6] official version: drop siflowai env exports and wheel install

---
 .../k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
index f9c8295..0ffc2fe 100644
--- a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
+++ b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
@@ -22,8 +22,6 @@ spec:
           containers:
           - args:
             - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \
-               export SIFLOW_LOG_LEVEL=error && export SIFILE_FS_READ_TYPE=syncio && \
-               pip install /data/zawang/siflowai/siflowai-0.1.0-cp312-cp312-linux_x86_64.whl && \
                DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \
                BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \
                RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \

From 84f96fa403f0864b29f6030a57d8f37e6c04fa84 Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Thu, 5 Jun 2025 12:22:58 +0000
Subject: [PATCH 5/6] Standardized naming

---
 .../{meg-lm-llama2-70b-xlliu.yaml => meg-lm-llama2-70b-ckpt.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename launcher_scripts/k8s/training/megatron-lm/llm/{meg-lm-llama2-70b-xlliu.yaml => meg-lm-llama2-70b-ckpt.yaml} (100%)

diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml
similarity index 100%
rename from launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml
rename to launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml

From e37c4a82d6161c5b36464eef1130e50e3640d05a Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Fri, 6 Jun 2025 02:08:12 +0000
Subject: [PATCH 6/6] enable async save by default

---
 training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh | 1 +
 training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
index 654ca6b..8cc87a5 100755
--- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
+++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
@@ -106,6 +106,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm 
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(
diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
index a626209..8015d01 100755
--- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
+++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
@@ -109,6 +109,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm 
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(