From a2a7195a155f9cd0cdb810362f3add2ecef487f9 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 09:23:53 -0500 Subject: [PATCH 1/6] fix typo --- .../python/clusterloader2/job_controller/config/ray/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/job_controller/config/ray/config.yaml b/modules/python/clusterloader2/job_controller/config/ray/config.yaml index 6feeb53258..5622bbd038 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/config.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/config.yaml @@ -47,7 +47,7 @@ steps: Params: action: start labelSelector: app.kubernetes.io/created-by = kuberay-operator - measurmentInterval: 1s + measurementInterval: 1s - name: Creating RayJobs for PyTorch MNIST fine-tuning phases: From 19472ab7a70fa4a2c855be5960cbbb7511b72070 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 10:04:48 -0500 Subject: [PATCH 2/6] fix job template --- .../job_controller/config/ray/job_template.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml b/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml index da4315c1e2..ce55918ad2 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml @@ -23,7 +23,7 @@ spec: effect: "NoSchedule" containers: - name: ray-submitter - image: rayproject/ray:2.41.0 + image: {{.Image}} rayClusterSpec: rayVersion: '2.9.3' headGroupSpec: @@ -38,7 +38,7 @@ spec: effect: "NoSchedule" containers: - name: ray-head - image: rayproject/ray:2.41.0 + image: {{.Image}} ports: - containerPort: 6379 name: gcs-server @@ -65,10 +65,10 @@ spec: effect: "NoSchedule" containers: - name: ray-worker - image: rayproject/ray:2.41.0 + image: {{.Image}} resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu: {{.JobGPU}} requests: - nvidia.com/gpu: 1 + nvidia.com/gpu: {{.JobGPU}} From 4fb03a30916730c8b71eb96468226e81900b3475 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 11:42:09 -0500 Subject: [PATCH 3/6] fix job template path --- .../python/clusterloader2/job_controller/config/ray/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/job_controller/config/ray/config.yaml b/modules/python/clusterloader2/job_controller/config/ray/config.yaml index 5622bbd038..0114121fe2 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/config.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/config.yaml @@ -58,7 +58,7 @@ steps: tuningSet: Uniform{{$qps}}qps objectBundle: - basename: test - objectTemplatePath: job_template.yaml + objectTemplatePath: {{$job_template_path}} templateFillMap: Group: ray-scheduling Image: "rayproject/ray:2.46.0" # replace with image built from images/ray-pytorch From 775026f144b8ac81579fa865c26c09044b27b9c6 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 18:46:50 +0000 Subject: [PATCH 4/6] fix chart version --- modules/python/clusterloader2/job_controller/job_controller.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/python/clusterloader2/job_controller/job_controller.py b/modules/python/clusterloader2/job_controller/job_controller.py index 40b4b77de9..1d8d01b649 100644 --- a/modules/python/clusterloader2/job_controller/job_controller.py +++ b/modules/python/clusterloader2/job_controller/job_controller.py @@ -180,6 +180,8 @@ def install_ray_dependencies(self): "--install", "kuberay-operator", "kuberay/kuberay-operator", + "--version", + "1.4.2", "--namespace", "kuberay-system", "--create-namespace", From 22c8425c3fdceaf13d6ac18863aebc2c5bf852c3 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 18:50:29 +0000 Subject: [PATCH 5/6] fix cl2 config dir --- .../python/clusterloader2/job_controller/job_controller.py | 6 +++++- steps/engine/clusterloader2/job_controller/execute.yml | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/job_controller/job_controller.py b/modules/python/clusterloader2/job_controller/job_controller.py index 1d8d01b649..de84200a81 100644 --- a/modules/python/clusterloader2/job_controller/job_controller.py +++ b/modules/python/clusterloader2/job_controller/job_controller.py @@ -152,7 +152,8 @@ def install_ray_dependencies(self): from "/ray/". - Waits for operator and mock-head pods to be ready. """ - config_dir = os.path.join("./clusterloader2/job_controller/config", "ray") + logger.info("cl2 config dir: %s", self.cl2_config_dir) + config_dir = os.path.join(self.cl2_config_dir, "ray") values_file = os.path.join(config_dir, "values.yaml") # Install KubeRay operator via Helm @@ -285,6 +286,9 @@ def add_configure_subparser_arguments(parser): type=str, help="Timeout before failing the scale up test", ) + parser.add_argument( + "--cl2_config_dir", type=str, help="Path to the CL2 config directory" + ) parser.add_argument( "--cl2_override_file", type=str, diff --git a/steps/engine/clusterloader2/job_controller/execute.yml b/steps/engine/clusterloader2/job_controller/execute.yml index f8d9355197..49355d7249 100644 --- a/steps/engine/clusterloader2/job_controller/execute.yml +++ b/steps/engine/clusterloader2/job_controller/execute.yml @@ -21,6 +21,7 @@ steps: --dra_enabled ${ENABLE_DRA:-False} \ --ray_enabled ${ENABLE_RAY:-False} \ --job_gpu ${JOB_GPU:-0} \ + --cl2_config_dir ${CL2_CONFIG_DIR} \ --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ From 80f8f6cf20e4e7be50f80b80261c1bab6127977b Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 14 Jan 2026 20:17:57 +0000 Subject: [PATCH 6/6] update job template path --- .../python/clusterloader2/job_controller/config/ray/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/job_controller/config/ray/config.yaml b/modules/python/clusterloader2/job_controller/config/ray/config.yaml index 0114121fe2..5622bbd038 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/config.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/config.yaml @@ -58,7 +58,7 @@ steps: tuningSet: Uniform{{$qps}}qps objectBundle: - basename: test - objectTemplatePath: {{$job_template_path}} + objectTemplatePath: job_template.yaml templateFillMap: Group: ray-scheduling Image: "rayproject/ray:2.46.0" # replace with image built from images/ray-pytorch