From 308ca073741e6cb3c655b0398fd7ac6acbfdbefd Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Thu, 5 Jun 2025 09:55:35 +0000 Subject: [PATCH 1/6] update Megatron DeepSpeed --- thirdparty/Megatron-DeepSpeed | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/Megatron-DeepSpeed b/thirdparty/Megatron-DeepSpeed index 0d6e379..3e1da1f 160000 --- a/thirdparty/Megatron-DeepSpeed +++ b/thirdparty/Megatron-DeepSpeed @@ -1 +1 @@ -Subproject commit 0d6e3793a1fc06eded9764ef15ad12bcc0281101 +Subproject commit 3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26 From 0222d1470e542ae1e047419db38a6f73348e7bd8 Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Thu, 5 Jun 2025 09:56:29 +0000 Subject: [PATCH 2/6] update submodule Megatron-LM --- thirdparty/Megatron-LM | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/Megatron-LM b/thirdparty/Megatron-LM index 77b4bfe..957f348 160000 --- a/thirdparty/Megatron-LM +++ b/thirdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit 77b4bfe00ab2634650345fd485be59a9d9c27272 +Subproject commit 957f3488efd505e5d22f5d5bc46eaa187eeb44cb From 3067d264b060c79e76b0d63e329890d3b2b91e7b Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Thu, 5 Jun 2025 09:59:15 +0000 Subject: [PATCH 3/6] update launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml --- .../llm/meg-lm-llama2-70b-xlliu.yaml | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml new file mode 100644 index 0000000..f9c8295 --- /dev/null +++ b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml @@ -0,0 +1,101 @@ +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: meg-lm-llama2-70b-alan + namespace: default +spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: Never + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + labels: + scitix.ai/topo-aware-in-node: "true" + spec: &job-spec + tolerations: + - key: "scitix.ai/nodecheck" + operator: "Exists" + effect: "NoSchedule" + containers: + - args: + - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \ + export SIFLOW_LOG_LEVEL=error && export SIFILE_FS_READ_TYPE=syncio && \ + pip install /data/zawang/siflowai/siflowai-0.1.0-cp312-cp312-linux_x86_64.whl && \ + DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \ + BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \ + RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \ + ENABLE_CKPT=1 MOCK_DATA=true \ + bash /workspace/deep_learning_examples/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh " + command: + - /usr/bin/env + - bash + - -c + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: TZ + value: CST-8 + image: registry-ap-southeast.scitix.ai/hpc/nemo:25.04 + imagePullPolicy: Always + name: pytorch + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + rdma/hca_shared_devices_all: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + rdma/hca_shared_devices_all: "1" + securityContext: + capabilities: + add: + - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dev-shm + - mountPath: /workspace/deep_learning_examples + name: deep-learning-examples + - mountPath: /data/zawang/siflowai + name: siflowai + volumes: + - name: deep-learning-examples + hostPath: + path: /data/wangza/deep_learning_examples + - name: siflowai + hostPath: + path: /data/wangza/siflowai + - name: dev-shm + hostPath: + path: /dev/shm + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: scitix.ai/gpu-type + operator: In + values: + - h100nvlink80 + Worker: + replicas: 3 + restartPolicy: Never + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + labels: + scitix.ai/topo-aware-in-node: "true" + spec: + <<: *job-spec From 12af80984da6092a850c2b4b193dd84a5481097d Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Thu, 5 Jun 2025 11:45:42 +0000 Subject: [PATCH 4/6] official version --- .../k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml index f9c8295..0ffc2fe 100644 --- a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml +++ b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml @@ -22,8 +22,6 @@ spec: containers: - args: - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \ - export SIFLOW_LOG_LEVEL=error && export SIFILE_FS_READ_TYPE=syncio && \ - pip install /data/zawang/siflowai/siflowai-0.1.0-cp312-cp312-linux_x86_64.whl && \ DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \ BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \ RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \ From 84f96fa403f0864b29f6030a57d8f37e6c04fa84 Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Thu, 5 Jun 2025 12:22:58 +0000 Subject: [PATCH 5/6] Standardized naming --- .../{meg-lm-llama2-70b-xlliu.yaml => meg-lm-llama2-70b-ckpt.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename launcher_scripts/k8s/training/megatron-lm/llm/{meg-lm-llama2-70b-xlliu.yaml => meg-lm-llama2-70b-ckpt.yaml} (100%) diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml similarity index 100% rename from launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-xlliu.yaml rename to launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml From e37c4a82d6161c5b36464eef1130e50e3640d05a Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Fri, 6 Jun 2025 02:08:12 +0000 Subject: [PATCH 6/6] default enable async save --- training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh | 1 + training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh index 654ca6b..8cc87a5 100755 --- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh +++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh @@ -106,6 +106,7 @@ MODEL_ARGS=( --swiglu --normalization RMSNorm --disable-bias-linear + --async-save ) TRAINING_ARGS=( diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh index a626209..8015d01 100755 --- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh +++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh @@ -109,6 +109,7 @@ MODEL_ARGS=( --swiglu --normalization RMSNorm --disable-bias-linear + --async-save ) TRAINING_ARGS=(