diff --git a/.github/workflows/cluster-tests.yml b/.github/workflows/cluster-tests.yml
new file mode 100644
index 0000000..544cd5b
--- /dev/null
+++ b/.github/workflows/cluster-tests.yml
@@ -0,0 +1,45 @@
+# Workflow to test latest climate-aware-task-scheduler release against ephemeral Slurm cluster
+name: cluster-tests
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - '.github/workflows/cluster-tests.yml'
+      - 'cluster/**'
+      - 'cats/**.py'
+      - 'pyproject.toml'
+      - '!cluster/README.md'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - '.github/workflows/cluster-tests.yml'
+      - 'cluster/**'
+      - 'cats/**.py'
+      - 'pyproject.toml'
+      - '!cluster/README.md'
+  workflow_dispatch:
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
+    steps:
+      - uses: actions/checkout@v4
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Start slurm cluster and install CATS
+        run: |
+          ./cluster/start.sh
+          ./cluster/install_cats.sh
+      - name: Run tests
+        run: |
+          sleep 30 # wait for cluster to come up
+          ./cluster/tests.sh
diff --git a/cluster/README.md b/cluster/README.md
new file mode 100644
index 0000000..d1d119d
--- /dev/null
+++ b/cluster/README.md
@@ -0,0 +1,46 @@
+# Cluster tests
+
+This folder contains scripts to set up an ephemeral SLURM cluster to test
+cats in a more realistic setting than the current integration tests that
+use mocking. The setup builds upon work from upstream
+https://github.com/giovtorres/slurm-docker-cluster with a patched
+Dockerfile that installs jq and uv to make CATS installation easier. Our
+patches are maintained at
+https://github.com/GreenScheduler/slurm-docker-cluster.
+
+## Pre-requisites
+
+Currently slurm-docker-cluster is only built against linux/amd64 so you
+will need to be on a 64-bit machine if you want to test this locally. You
+will also need docker installed.
+
+## Setup
+
+Clone this repository (GreenScheduler/cats) and then run
+
+```shell
+./cluster/start.sh
+```
+to fetch the `ghcr.io/greenscheduler/slurm-docker-cluster:latest` image
+and start the cluster. You can now install cats locally from the current checkout:
+```shell
+./cluster/install_cats.sh
+```
+
+Once the cluster is built and running, then you can run the following to get
+access to the control node:
+
+```shell
+docker exec -it slurmctld bash
+```
+
+## Tests
+
+An automated testing script is supplied which shows programmatic interaction
+with the slurm cluster. Currently cats schedules a short job, and the slurm
+`scontrol` output is checked to see that the job was scheduled correctly. To
+run the test:
+
+```shell
+./cluster/tests.sh
+```
diff --git a/cluster/cleanup.sh b/cluster/cleanup.sh
new file mode 100755
index 0000000..7f36631
--- /dev/null
+++ b/cluster/cleanup.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Cleans up resources and shuts down containers, useful for local development of slurm-docker-cluster
+set -euo pipefail
+
+pushd cluster
+docker compose down
+popd
diff --git a/cluster/docker-compose.yml b/cluster/docker-compose.yml
new file mode 100644
index 0000000..5074f17
--- /dev/null
+++ b/cluster/docker-compose.yml
@@ -0,0 +1,92 @@
+services:
+  mysql:
+    image: mariadb:10.11
+    hostname: mysql
+    container_name: mysql
+    environment:
+      MYSQL_RANDOM_ROOT_PASSWORD: "yes"
+      MYSQL_DATABASE: slurm_acct_db
+      MYSQL_USER: slurm
+      MYSQL_PASSWORD: password
+    volumes:
+      - var_lib_mysql:/var/lib/mysql
+    networks:
+      - slurm-network
+
+  slurmdbd:
+    image: ghcr.io/greenscheduler/slurm-docker-cluster:latest
+    command: ["slurmdbd"]
+    container_name: slurmdbd
+    hostname: slurmdbd
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6819"
+    depends_on:
+      - mysql
+    networks:
+      - slurm-network
+
+  slurmctld:
+    image: ghcr.io/greenscheduler/slurm-docker-cluster:latest
+    command: ["slurmctld"]
+    container_name: slurmctld
+    hostname: slurmctld
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6817"
+    depends_on:
+      - "slurmdbd"
+    networks:
+      - slurm-network
+
+  c1:
+    image: ghcr.io/greenscheduler/slurm-docker-cluster:latest
+    command: ["slurmd"]
+    hostname: c1
+    container_name: c1
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    networks:
+      - slurm-network
+
+  c2:
+    image: ghcr.io/greenscheduler/slurm-docker-cluster:latest
+    command: ["slurmd"]
+    hostname: c2
+    container_name: c2
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - slurm_jobdir:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+    networks:
+      - slurm-network
+
+volumes:
+  etc_munge:
+  etc_slurm:
+  slurm_jobdir:
+  var_lib_mysql:
+  var_log_slurm:
+
+networks:
+  slurm-network:
+    driver: bridge
diff --git a/cluster/install_cats.sh b/cluster/install_cats.sh
new file mode 100755
index 0000000..c094ee6
--- /dev/null
+++ b/cluster/install_cats.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Install cats on the slurm cluster
+# This relies on a cluster already setup and running, if not run
+# ./cluster/start.sh
+set -euo pipefail
+
+docker exec slurmctld mkdir /tmp/cats
+for file in pyproject.toml ./cats; do
+  docker cp "$file" slurmctld:/tmp/cats
+done
+docker exec slurmctld uv tool install /tmp/cats
+docker exec slurmctld cp /root/.local/bin/cats /usr/local/bin/cats
diff --git a/cluster/start.sh b/cluster/start.sh
new file mode 100755
index 0000000..978ff95
--- /dev/null
+++ b/cluster/start.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Starts cluster
+set -euo pipefail
+pushd cluster
+docker compose pull
+docker compose up -d
+popd
diff --git a/cluster/tests.sh b/cluster/tests.sh
new file mode 100755
index 0000000..8019cef
--- /dev/null
+++ b/cluster/tests.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Run tests to check if slurm picks up begin time set by CATS
+# This relies on a cluster already setup and running, if not run
+# ./cluster/start.sh
+set -euo pipefail
+
+# Step a) Run cats inside the slurmctld container and extract start time
+raw_output=$(docker exec -i slurmctld \
+  cats -d 5 --loc RG1 --scheduler=sbatch --command='ls' --format=json | \
+  awk 'BEGIN{found=0} {
+    if(!found){
+      i=index($0,"{");
+      if(i){ print substr($0,i); found=1 }
+    } else { print }
+  }')
+job_id=$(echo "$raw_output" | grep ^Submitted | awk '{print $4}')
+echo "Detected job submission ID: $job_id"
+raw_json=$(echo "$raw_output" | grep -v ^Submitted)
+raw_start=$(printf '%s\n' "$raw_json" | jq -r '.carbonIntensityOptimal.start')
+
+# Truncate to whole minutes, dropping optional fractional seconds and UTC offset/Z
+# Example: 2025-08-28T12:43:30.156434+00:00 -> 2025-08-28T12:43:00
+scheduled_start=$(echo "$raw_start" | sed -E 's/:[0-9]{2}(\.[0-9]+)?(Z|[+-][0-9:]+)?$/:00/')
+
+echo "Expected scheduled start time: $scheduled_start"
+
+# Step b) Fetch job details
+job_output=$(docker exec -i slurmctld scontrol show job "$job_id")
+
+# Check condition 1: job is pending for BeginTime
+if (! echo "$job_output" | grep -q "JobState=PENDING Reason=BeginTime Dependency=(null)") && \
+   (! echo "$job_output" | grep -q "JobState=RUNNING Reason=None"); then
+  echo "❌ Job state/Reason is not correct, expected one of PENDING/BeginTime or RUNNING/None"
+  echo "$job_output"
+  exit 1
+fi
+
+# Check condition 2: start time matches
+if ! echo "$job_output" | grep -q "StartTime=$scheduled_start"; then
+  echo "❌ Start time does not match expected!"
+  echo "Expected: StartTime=$scheduled_start"
+  echo "Actual output:"
+  echo "$job_output"
+  exit 1
+fi
+
+echo "✅ Job is correctly delayed until $scheduled_start"