diff --git a/.flake8 b/.flake8
index 6ad40a51..e84c7792 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-extend-ignore = E203,E501
+extend-ignore = E203,E501,W605
 max-complexity = 99
 max-line-length = 88
 exclude = .git,__pycache__,build,dist
diff --git a/snakemake/Snakefile_model_eval b/snakemake/Snakefile_model_eval
index 2dec109e..881f80b7 100644
--- a/snakemake/Snakefile_model_eval
+++ b/snakemake/Snakefile_model_eval
@@ -26,7 +26,7 @@ rule all:
         calculate_outcomes_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_calculate_outcomes.csv",
         fold=range(FOLDS),repr=REPRESENTATIONS,dataset=DATASET,enum_factor=ENUM_FACTORS),
         nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
-            repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
+        repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         train_discriminator=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_train_discriminator_.csv",
         repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         freq_distribution=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_freq_distribution.csv",
diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
new file mode 100644
index 00000000..f7b911b7
--- /dev/null
+++ b/snakemake/profiles/config.yaml
@@ -0,0 +1,25 @@
+---
+# basic configuration
+use-conda: true
+conda-frontend: conda
+printshellcmds: true
+
+# cluster specific settings
+cluster:
+  sbatch
+  --cpus-per-task={threads}
+  --mem={resources.mem_mb}M
+  --time={resources.runtime} --output=slurm_out/%x-%A
+  --job-name={rule} --parsable
+  --partition={resources.partition}
+cluster-status: "slurm-status.py"
+cluster-cancel: scancel
+cluster-cancel-nargs: 50
+latency-wait: 120 # wait 2 minutes for missing files before raising exception
+# important for NFS 
+jobs: 250 # maximum jobs to run at once
+max-jobs-per-second: 1
+max-status-checks-per-second: 10
+local-cores: 4 # maximum local jobs to run
+default-resources:
+  partition=main,hoppertest,skinniderlab
diff --git a/snakemake/profiles/slurm-status.py b/snakemake/profiles/slurm-status.py
new file mode 100755
index 00000000..8eb0cbc8
--- /dev/null
+++ b/snakemake/profiles/slurm-status.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import re
+import subprocess as sp
+import shlex
+import sys
+import time
+import logging
+
+logger = logging.getLogger(__name__)
+
+STATUS_ATTEMPTS = 20
+
+jobid = sys.argv[1]
+
+for i in range(STATUS_ATTEMPTS):
+    try:
+        sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
+        res = {
+            x.split("|")[0]: x.split("|")[1]
+            for x in sacct_res.decode().strip().split("\n")
+        }
+        break
+    except sp.CalledProcessError as e:
+        logger.error("sacct process error")
+        logger.error(e)
+    except IndexError:
+        pass
+    # Try getting job with scontrol instead in case sacct is misconfigured
+    try:
+        sctrl_res = sp.check_output(
+            shlex.split("scontrol -o show job {}".format(jobid))
+        )
+        m = re.search(r"JobState=(\w+)", sctrl_res.decode())
+        res = {jobid: m.group(1)}
+        break
+    except sp.CalledProcessError as e:
+        logger.error("scontrol process error")
+        logger.error(e)
+    if i >= STATUS_ATTEMPTS - 1:
+        print("failed")
+        exit(0)
+    else:
+        time.sleep(1)
+
+status = res[jobid]
+
+if status == "BOOT_FAIL":
+    print("failed")
+elif status == "OUT_OF_MEMORY":
+    print("failed")
+elif status.startswith("CANCELLED"):
+    print("failed")
+elif status == "COMPLETED":
+    print("success")
+elif status == "DEADLINE":
+    print("failed")
+elif status == "FAILED":
+    print("failed")
+elif status == "NODE_FAIL":
+    print("failed")
+elif status == "PREEMPTED":
+    print("failed")
+elif status == "TIMEOUT":
+    print("failed")
+# Unclear whether SUSPENDED should be treated as running or failed
+elif status == "SUSPENDED":
+    print("failed")
+else:
+    print("running")