Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
extend-ignore = E203,E501
extend-ignore = E203,E501,W605
max-complexity = 99
max-line-length = 88
exclude = .git,__pycache__,build,dist
2 changes: 1 addition & 1 deletion snakemake/Snakefile_model_eval
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ rule all:
calculate_outcomes_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_calculate_outcomes.csv",
fold=range(FOLDS),repr=REPRESENTATIONS,dataset=DATASET,enum_factor=ENUM_FACTORS),
nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
train_discriminator=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_train_discriminator_.csv",
repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
freq_distribution=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_freq_distribution.csv",
Expand Down
25 changes: 25 additions & 0 deletions snakemake/profiles/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# basic configuration
use-conda: true
conda-frontend: conda
printshellcmds: true

# cluster specific settings
# NOTE(review): the indented continuation lines below fold into one sbatch
# command string (YAML plain multi-line scalar).
cluster:
  sbatch
  --cpus-per-task={threads}
  --mem={resources.mem_mb}M
  --time={resources.runtime} --output=slurm_out/%x-%A
  --job-name={rule} --parsable
  --partition={resources.partition}
cluster-status: "slurm-status.py"
cluster-cancel: scancel
cluster-cancel-nargs: 50
latency-wait: 120 # wait 2 minutes for missing files before raising exception
                  # important for NFS
jobs: 250 # maximum jobs to run at once
max-jobs-per-second: 1
max-status-checks-per-second: 10
local-cores: 4 # maximum local jobs to run
# default-resources entries are name=value pairs applied to every rule
default-resources:
  - partition=main,hoppertest,skinniderlab
69 changes: 69 additions & 0 deletions snakemake/profiles/slurm-status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Snakemake cluster-status script for SLURM.

Invoked by Snakemake as ``slurm-status.py <jobid>`` (see ``cluster-status``
in the profile config).  Prints exactly one of ``success``, ``failed`` or
``running`` to stdout so Snakemake can track the job.  The job state is
queried with ``sacct`` first, falling back to ``scontrol`` in case ``sacct``
is unavailable or misconfigured on the cluster.
"""
import re
import subprocess as sp
import shlex
import sys
import time
import logging

# BUG FIX: was logging.getLogger("__name__") -- the quoted literal created a
# logger literally named "__name__" instead of using this module's name.
logger = logging.getLogger(__name__)

# How many times to poll sacct/scontrol (1 s apart) before giving up.
STATUS_ATTEMPTS = 20

# SLURM states that mean the job will never finish successfully.
_FAILED_STATES = {
    "BOOT_FAIL",
    "OUT_OF_MEMORY",
    "DEADLINE",
    "FAILED",
    "NODE_FAIL",
    "PREEMPTED",
    "TIMEOUT",
    # Unclear whether SUSPENDED should be treated as running or failed;
    # the original choice (failed) is kept.
    "SUSPENDED",
}


def _query_status(jobid):
    """Return the raw SLURM state string for *jobid*, or None if unknown.

    Asks ``sacct`` first; on error or unparsable output, falls back to
    ``scontrol`` in case sacct is misconfigured.
    """
    try:
        sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
        res = {
            x.split("|")[0]: x.split("|")[1]
            for x in sacct_res.decode().strip().split("\n")
        }
        # BUG FIX: was res[jobid] at module level, which raised KeyError when
        # sacct produced output that does not mention this job id.
        status = res.get(jobid)
        if status is not None:
            return status
    except sp.CalledProcessError as e:
        logger.error("sacct process error")
        logger.error(e)
    except IndexError:
        pass
    # Try getting job with scontrol instead in case sacct is misconfigured
    try:
        sctrl_res = sp.check_output(
            shlex.split("scontrol -o show job {}".format(jobid))
        )
        # BUG FIX: raw string for the regex (was "JobState=(\w+)", an invalid
        # escape sequence -- the W605 that .flake8 had to suppress).
        m = re.search(r"JobState=(\w+)", sctrl_res.decode())
        # BUG FIX: the original called m.group(1) unconditionally and raised
        # AttributeError when the pattern did not match.
        if m is not None:
            return m.group(1)
    except sp.CalledProcessError as e:
        logger.error("scontrol process error")
        logger.error(e)
    return None


def classify(status):
    """Map a raw SLURM state string to Snakemake's success/failed/running."""
    if status == "COMPLETED":
        return "success"
    # CANCELLED may carry a suffix such as "CANCELLED by <uid>".
    if status.startswith("CANCELLED") or status in _FAILED_STATES:
        return "failed"
    # Anything else (PENDING, RUNNING, COMPLETING, ...) counts as running.
    return "running"


def main():
    jobid = sys.argv[1]
    status = None
    for i in range(STATUS_ATTEMPTS):
        status = _query_status(jobid)
        if status is not None:
            break
        if i >= STATUS_ATTEMPTS - 1:
            # State could not be determined at all; report failure so
            # Snakemake does not wait forever on this job.
            print("failed")
            sys.exit(0)
        time.sleep(1)
    print(classify(status))


if __name__ == "__main__":
    main()