diff --git a/modules/python/clusterloader2/job_controller/config/ray/config.yaml b/modules/python/clusterloader2/job_controller/config/ray/config.yaml index 6feeb53258..5622bbd038 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/config.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/config.yaml @@ -47,7 +47,7 @@ steps: Params: action: start labelSelector: app.kubernetes.io/created-by = kuberay-operator - measurmentInterval: 1s + measurementInterval: 1s - name: Creating RayJobs for PyTorch MNIST fine-tuning phases: diff --git a/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml b/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml index da4315c1e2..ce55918ad2 100644 --- a/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml +++ b/modules/python/clusterloader2/job_controller/config/ray/job_template.yaml @@ -23,7 +23,7 @@ spec: effect: "NoSchedule" containers: - name: ray-submitter - image: rayproject/ray:2.41.0 + image: {{.Image}} rayClusterSpec: rayVersion: '2.9.3' headGroupSpec: @@ -38,7 +38,7 @@ spec: effect: "NoSchedule" containers: - name: ray-head - image: rayproject/ray:2.41.0 + image: {{.Image}} ports: - containerPort: 6379 name: gcs-server @@ -65,10 +65,10 @@ spec: effect: "NoSchedule" containers: - name: ray-worker - image: rayproject/ray:2.41.0 + image: {{.Image}} resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu: {{.JobGPU}} requests: - nvidia.com/gpu: 1 + nvidia.com/gpu: {{.JobGPU}} diff --git a/modules/python/clusterloader2/job_controller/job_controller.py b/modules/python/clusterloader2/job_controller/job_controller.py index 40b4b77de9..de84200a81 100644 --- a/modules/python/clusterloader2/job_controller/job_controller.py +++ b/modules/python/clusterloader2/job_controller/job_controller.py @@ -152,7 +152,8 @@ def install_ray_dependencies(self): from "/ray/". - Waits for operator and mock-head pods to be ready. """ - config_dir = os.path.join("./clusterloader2/job_controller/config", "ray") + logger.info("cl2 config dir: %s", self.cl2_config_dir) + config_dir = os.path.join(self.cl2_config_dir, "ray") values_file = os.path.join(config_dir, "values.yaml") # Install KubeRay operator via Helm @@ -180,6 +181,8 @@ def install_ray_dependencies(self): "--install", "kuberay-operator", "kuberay/kuberay-operator", + "--version", + "1.4.2", "--namespace", "kuberay-system", "--create-namespace", @@ -283,6 +286,9 @@ def add_configure_subparser_arguments(parser): type=str, help="Timeout before failing the scale up test", ) + parser.add_argument( + "--cl2_config_dir", type=str, help="Path to the CL2 config directory" + ) parser.add_argument( "--cl2_override_file", type=str, diff --git a/steps/engine/clusterloader2/job_controller/execute.yml b/steps/engine/clusterloader2/job_controller/execute.yml index f8d9355197..49355d7249 100644 --- a/steps/engine/clusterloader2/job_controller/execute.yml +++ b/steps/engine/clusterloader2/job_controller/execute.yml @@ -21,6 +21,7 @@ steps: --dra_enabled ${ENABLE_DRA:-False} \ --ray_enabled ${ENABLE_RAY:-False} \ --job_gpu ${JOB_GPU:-0} \ + --cl2_config_dir ${CL2_CONFIG_DIR} \ --cl2_override_file ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \