Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions .github/workflows/generate_reference_plots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,11 @@ jobs:
echo $SACCT_LOG
exit 1
fi

.github/workflows/make_sbatch_job.sh run_testpackage_generate_verf_set $DATAPATH
rm $DATAPATH/.lockfile
- name: Comparing plotted data
run: |
export TMPDIR=$RUNNER_TEMP
module load Python/3.10.4-GCCcore-11.3.0
. CI_env/bin/activate
sbatch -W -o "testpackage_compare.txt" ./testpackage/run_compare.sh verf_set > jobid1.txt
cat testpackage_compare.txt

export JOBID=$(grep -Po '\d+' jobid1.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi
.github/workflows/make_sbatch_job.sh run_compare verf_set
33 changes: 33 additions & 0 deletions .github/workflows/make_sbatch_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Submit ./testpackage/<name>.sh as a Slurm array job, wait for it to finish,
# print its log and fail if Slurm reports a problem.
#
# Usage: make_sbatch_job.sh <name> [job script arguments...]
#   <name>  base name shared by the job script (./testpackage/<name>.sh),
#           the log file (<name>) and the job id file (jobid_<name>)
#   args    every further argument is forwarded unchanged to the job script
# Environment:
#   ARRAY_SIZE  upper bound of the --array range (default: 14)
# Exit status:
#   0  job finished and sacct lists no FAILED entry
#   1  log unreadable/empty, job id not found, or sacct lists a FAILED entry
#   2  called without <name>
#   otherwise the non-zero exit status of sbatch itself

if [[ $# -lt 1 || -z $1 ]]; then
  echo "::error::Usage: ${0##*/} <script name> [job script arguments...]"
  exit 2
fi

name=$1
shift # "$@" now holds only the arguments meant for the job script

ARRAY_SIZE=${ARRAY_SIZE:-14}
log_file=$name
jobid_file="jobid_${name}"

# With -W sbatch blocks until the job ends and returns the job's exit code
# (the highest one for an array). Do not abort here on failure: the log is
# still printed below and sacct is consulted before the status is propagated.
sbatch -W --array="1-${ARRAY_SIZE}" -o "$log_file" "./testpackage/${name}.sh" "$@" > "$jobid_file"
sbatch_status=$?

# Read the log first through srun (the file may not yet be up to date on the
# front end); if srun fails (e.g. communication failure) fall back to a plain
# cat on the front end, which may see an empty or missing file.
log=$(srun cat -- "$log_file" || cat -- "$log_file")
log_status=$?
if [[ -z $log ]]; then
  echo "::warning::Log file could not be read: srun failed and cat returned an empty logfile. Exit code $log_status"
  exit 1
fi
printf '%s\n' "$log"

# sbatch can return 0 even though e.g. one task of the array failed, and a
# node can fail silently without ever running the job; both are visible in
# sacct, which needs the job id that sbatch wrote to the job id file.
jobid_matches=$(srun grep -Po '\d+' -- "$jobid_file" || grep -Po '\d+' -- "$jobid_file")
grep_status=$?
jobid=${jobid_matches%%$'\n'*} # keep only the first number found
if [[ $grep_status -ne 0 || -z $jobid ]]; then
  echo "::error::Jobid could not be found, exiting. Exit code $grep_status"
  exit 1
fi

sacct_log=$(sacct -j "$jobid" -o job,state,node | grep FAILED)
if [[ -n $sacct_log ]]; then
  echo "::error::Some job failed on a node, try to take a look at the slurm log."
  echo "$sacct_log"
  exit 1
fi

# sacct shows nothing as FAILED, but sbatch itself reported a failure
# (submission error, or a job ending in a state other than FAILED):
# do not let the workflow step pass.
if [[ $sbatch_status -ne 0 ]]; then
  echo "::error::sbatch exited with code $sbatch_status although sacct lists no FAILED job."
  exit "$sbatch_status"
fi
22 changes: 4 additions & 18 deletions .github/workflows/test_compare_image_oldest_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,9 @@ jobs:
echo -e ".lockfile found in $verf_loc/$verfset, not running test, as the verification set generation is likely still ongoing\n Check ongoing actions and/or re-run verification set generation."
exit 1
fi
sbatch -W -o testpackage_run.txt --job-name gen_plots ./testpackage/run_testpackage_workflow.sh old_python ${{ steps.pyversion.outputs.PYTHON }} > jobid.txt
cat testpackage_run.txt
export JOBID=$(grep -Po '\d+' jobid.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi

.github/workflows/make_sbatch_job.sh run_testpackage_workflow old_python ${{ steps.pyversion.outputs.PYTHON }}


- name: Comparing plotted data
run: |
Expand All @@ -92,15 +86,7 @@ jobs:
module load ${{ steps.pyversion.outputs.PYTHON }}
module list
. CI_env/bin/activate
sbatch -W -o "testpackage_compare.txt" ./testpackage/run_compare.sh old_python ${{ steps.pyversion.outputs.PYTHON }} > jobid1.txt
cat testpackage_compare.txt
export JOBID=$(grep -Po '\d+' jobid1.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi
.github/workflows/make_sbatch_job.sh run_compare old_python ${{ steps.pyversion.outputs.PYTHON }}

- name: scancel dangling job upon cancellation
if: cancelled()
Expand Down
25 changes: 5 additions & 20 deletions .github/workflows/test_compare_images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ jobs:
cat diff_log.txt
fi
echo "DIFFRESULT=$DIFFRESULT" >> $GITHUB_OUTPUT
echo "Running $DIFFRESULT"
MAX_ARRAY=14
TOTAL_TESTS=$(ls ./testpackage/testpackage_definitions/ | wc -l)
if [[ $DIFFRESULT && $DIFFRESULT != "pass" ]]; then
Expand All @@ -67,32 +66,18 @@ jobs:
else
ARRAY_SIZE=$MAX_ARRAY
fi
sbatch -W -o testpackage_run.txt --array=1-$ARRAY_SIZE --job-name gen_plots ./testpackage/run_testpackage_workflow.sh $DIFFRESULT > jobid.txt
cat testpackage_run.txt
export JOBID=$(grep -Po '\d+' jobid.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi
export ARRAY_SIZE=$ARRAY_SIZE
echo "Running $DIFFRESULT, ARRAY_SIZE $ARRAY_SIZE"
.github/workflows/make_sbatch_job.sh run_testpackage_workflow $DIFFRESULT


- name: Comparing plotted data
run: |
export TMPDIR=$RUNNER_TEMP
module load Python/3.10.4-GCCcore-11.3.0
. CI_env/bin/activate
sbatch -W -o "testpackage_compare.txt" ./testpackage/run_compare.sh ${{ steps.run_cl.outputs.DIFFRESULT }} > jobid1.txt
cat testpackage_compare.txt
export JOBID=$(grep -Po '\d+' jobid1.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi
.github/workflows/make_sbatch_job.sh run_compare ${{ steps.run_cl.outputs.DIFFRESULT }}


- name: scancel dangling job upon cancellation
if: cancelled()
run: |
Expand Down
24 changes: 6 additions & 18 deletions .github/workflows/test_compare_images_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,30 +49,18 @@ jobs:
echo -e ".lockfile found in $verf_loc/$verfset, not running test, as the verification set generation is likely still ongoing\n Check ongoing actions and/or re-run verification set generation."
exit 1
fi
sbatch -W -o testpackage_run.txt --job-name gen_plots ./testpackage/run_testpackage_workflow.sh > jobid.txt
cat testpackage_run.txt
export JOBID=$(grep -Po '\d+' jobid.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi

.github/workflows/make_sbatch_job.sh run_testpackage_workflow


- name: Comparing plotted data
run: |
export TMPDIR=$RUNNER_TEMP
module load Python/3.10.4-GCCcore-11.3.0
. CI_env/bin/activate
sbatch -W -o "testpackage_compare.txt" ./testpackage/run_compare.sh > jobid1.txt
cat testpackage_compare.txt
export JOBID=$(grep -Po '\d+' jobid1.txt)
export SACCT_LOG=$(sacct -j $JOBID -o job,state,node | grep FAILED)
if [[ $SACCT_LOG ]]; then
echo "Some job failed on a node, try to take a look at the slurm log."
echo "$SACCT_LOG"
exit 1
fi

.github/workflows/make_sbatch_job.sh run_compare



- name: scancel dangling job upon cancellation
Expand Down
2 changes: 1 addition & 1 deletion testpackage/run_compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ if [[ $@ == 'verf_set' ]]; then
elif [[ $1 == 'old_python' ]]; then
check=false
#Do selective compare if other arguments
elif [ $@ ]; then
elif [[ $@ ]]; then
check=true
fi

Expand Down