Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
extend-ignore = E203,E501
extend-ignore = E203,E501,W605
max-complexity = 99
max-line-length = 88
exclude = .git,__pycache__,build,dist
2 changes: 1 addition & 1 deletion snakemake/Snakefile_model_eval
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ rule all:
calculate_outcomes_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_calculate_outcomes.csv",
fold=range(FOLDS),repr=REPRESENTATIONS,dataset=DATASET,enum_factor=ENUM_FACTORS),
nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
train_discriminator=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_train_discriminator_.csv",
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
freq_distribution=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_freq_distribution.csv",
Expand Down
25 changes: 25 additions & 0 deletions snakemake/profiles/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# basic configuration
use-conda: true
conda-frontend: conda
printshellcmds: true

# cluster specific settings
# NOTE(review): the indented continuation lines below fold into one sbatch
# command string (YAML plain multi-line scalar).
cluster:
  sbatch
  --cpus-per-task={threads}
  --mem={resources.mem_mb}M
  --time={resources.runtime} --output=slurm_out/%x-%A
  --job-name={rule} --parsable
  --partition={resources.partition}
cluster-status: "slurm-status.py"
cluster-cancel: scancel
cluster-cancel-nargs: 50
latency-wait: 120 # wait 2 minutes for missing files before raising exception
                  # important for NFS
jobs: 250 # maximum jobs to run at once
max-jobs-per-second: 1
max-status-checks-per-second: 10
local-cores: 4 # maximum local jobs to run
# default-resources entries are name=value pairs applied to every rule
default-resources:
  - partition=main,hoppertest,skinniderlab
69 changes: 69 additions & 0 deletions snakemake/profiles/slurm-status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Snakemake cluster-status script for SLURM.

Invoked by Snakemake as ``slurm-status.py <jobid>`` (see ``cluster-status``
in the profile config).  Prints exactly one of ``success``, ``failed`` or
``running`` to stdout so Snakemake can track the job.  The job state is
queried with ``sacct`` first, falling back to ``scontrol`` in case ``sacct``
is unavailable or misconfigured on the cluster.
"""
import re
import subprocess as sp
import shlex
import sys
import time
import logging

# BUG FIX: was logging.getLogger("__name__") -- the quoted literal created a
# logger literally named "__name__" instead of using this module's name.
logger = logging.getLogger(__name__)

# How many times to poll sacct/scontrol (1 s apart) before giving up.
STATUS_ATTEMPTS = 20

# SLURM states that mean the job will never finish successfully.
_FAILED_STATES = {
    "BOOT_FAIL",
    "OUT_OF_MEMORY",
    "DEADLINE",
    "FAILED",
    "NODE_FAIL",
    "PREEMPTED",
    "TIMEOUT",
    # Unclear whether SUSPENDED should be treated as running or failed;
    # the original choice (failed) is kept.
    "SUSPENDED",
}


def _query_status(jobid):
    """Return the raw SLURM state string for *jobid*, or None if unknown.

    Asks ``sacct`` first; on error or unparsable output, falls back to
    ``scontrol`` in case sacct is misconfigured.
    """
    try:
        sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
        res = {
            x.split("|")[0]: x.split("|")[1]
            for x in sacct_res.decode().strip().split("\n")
        }
        # BUG FIX: was res[jobid] at module level, which raised KeyError when
        # sacct produced output that does not mention this job id.
        status = res.get(jobid)
        if status is not None:
            return status
    except sp.CalledProcessError as e:
        logger.error("sacct process error")
        logger.error(e)
    except IndexError:
        pass
    # Try getting job with scontrol instead in case sacct is misconfigured
    try:
        sctrl_res = sp.check_output(
            shlex.split("scontrol -o show job {}".format(jobid))
        )
        # BUG FIX: raw string for the regex (was "JobState=(\w+)", an invalid
        # escape sequence -- the W605 that .flake8 had to suppress).
        m = re.search(r"JobState=(\w+)", sctrl_res.decode())
        # BUG FIX: the original called m.group(1) unconditionally and raised
        # AttributeError when the pattern did not match.
        if m is not None:
            return m.group(1)
    except sp.CalledProcessError as e:
        logger.error("scontrol process error")
        logger.error(e)
    return None


def classify(status):
    """Map a raw SLURM state string to Snakemake's success/failed/running."""
    if status == "COMPLETED":
        return "success"
    # CANCELLED may carry a suffix such as "CANCELLED by <uid>".
    if status.startswith("CANCELLED") or status in _FAILED_STATES:
        return "failed"
    # Anything else (PENDING, RUNNING, COMPLETING, ...) counts as running.
    return "running"


def main():
    jobid = sys.argv[1]
    status = None
    for i in range(STATUS_ATTEMPTS):
        status = _query_status(jobid)
        if status is not None:
            break
        if i >= STATUS_ATTEMPTS - 1:
            # State could not be determined at all; report failure so
            # Snakemake does not wait forever on this job.
            print("failed")
            sys.exit(0)
        time.sleep(1)
    print(classify(status))


if __name__ == "__main__":
    main()