From c0a966318da036028088798ff58608327f55e110 Mon Sep 17 00:00:00 2001
From: anushka255 <aa9078@princeton.edu>
Date: Wed, 24 Apr 2024 12:19:10 -0400
Subject: [PATCH 1/5] added custom snakemake profile

---
 snakemake/profiles/config.yaml     | 26 ++++++++++++
 snakemake/profiles/slurm-status.py | 63 ++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 snakemake/profiles/config.yaml
 create mode 100644 snakemake/profiles/slurm-status.py

diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
new file mode 100644
index 00000000..7239fbd8
--- /dev/null
+++ b/snakemake/profiles/config.yaml
@@ -0,0 +1,26 @@
+---
+# basic configuration
+use-conda: true
+conda-frontend: conda
+printshellcmds: true
+
+# cluster specific settings
+cluster:
+  makedir -p logs/{rules} &&
+  sbatch 
+    --cpus-per-task={threads} 
+    --mem={resources.mem}M 
+    --time={resources.time} --output=slurm_out/%x-%A 
+    --job-name={rule} --parsable
+    --partition={resources.partition}
+cluster-status: "slurm-status.py"
+cluster-cancel: scancel
+cluster-cancel-nargs: 50
+latency-wait: 120  # wait 2 minutes for missing files before raising exception
+# important for NFS
+jobs: 250  # maximum jobs to run at once
+max-jobs-per-second: 1
+max-status-checks-per-second: 10
+local-cores: 4  # maximum local jobs to run
+default-resources:
+  partition=main,hoppertest,skinniderlab
\ No newline at end of file
diff --git a/snakemake/profiles/slurm-status.py b/snakemake/profiles/slurm-status.py
new file mode 100644
index 00000000..831d4149
--- /dev/null
+++ b/snakemake/profiles/slurm-status.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+import re
+import subprocess as sp
+import shlex
+import sys
+import time
+import logging
+logger = logging.getLogger("__name__")
+
+STATUS_ATTEMPTS = 20
+
+jobid = sys.argv[1]
+
+for i in range(STATUS_ATTEMPTS):
+    try:
+        sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
+        res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")}
+        break
+    except sp.CalledProcessError as e:
+        logger.error("sacct process error")
+        logger.error(e)
+    except IndexError as e:
+        pass
+    # Try getting job with scontrol instead in case sacct is misconfigured
+    try:
+        sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid)))
+        m = re.search("JobState=(\w+)", sctrl_res.decode())
+        res = {jobid: m.group(1)}
+        break
+    except sp.CalledProcessError as e:
+        logger.error("scontrol process error")
+        logger.error(e)
+        if i >= STATUS_ATTEMPTS - 1:
+            print("failed")
+            exit(0)
+        else:
+            time.sleep(1)
+
+status = res[jobid]
+
+if (status == "BOOT_FAIL"):
+    print("failed")
+elif (status == "OUT_OF_MEMORY"):
+    print("failed")
+elif (status.startswith("CANCELLED")):
+    print("failed")
+elif (status == "COMPLETED"):
+    print("success")
+elif (status == "DEADLINE"):
+    print("failed")
+elif (status == "FAILED"):
+    print("failed")
+elif (status == "NODE_FAIL"):
+    print("failed")
+elif (status == "PREEMPTED"):
+    print("failed")
+elif (status == "TIMEOUT"):
+    print("failed")
+# Unclear whether SUSPENDED should be treated as running or failed
+elif (status == "SUSPENDED"):
+    print("failed")
+else:
+    print("running")
\ No newline at end of file

From 75e1c3ea4a6d03f10f316712a412695dbd0c8919 Mon Sep 17 00:00:00 2001
From: Anushka Acharya <aa9078@princeton.edu>
Date: Wed, 24 Apr 2024 12:56:44 -0400
Subject: [PATCH 2/5] Switch SLURM profile to mem_mb/runtime and apply user-specific settings

---
 snakemake/Snakefile                 |  2 +-
 snakemake/Snakefile_model_eval      |  4 ++--
 snakemake/config.json               | 12 ++++++------
 snakemake/profiles/config.yaml      |  7 +++----
 snakemake/profiles/slurm-status.py  |  0
 src/clm/plot/train_discriminator.py |  2 +-
 6 files changed, 13 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 snakemake/profiles/slurm-status.py

diff --git a/snakemake/Snakefile b/snakemake/Snakefile
index e452a249..466105d0 100644
--- a/snakemake/Snakefile
+++ b/snakemake/Snakefile
@@ -195,7 +195,7 @@ rule sample_molecules_RNN:
     output:
         output_file=f"{OUTPUT_DIR}/{{enum_factor}}/prior/samples/{{dataset}}_{{repr}}_{{fold}}_{{train_seed}}_{{sample_seed}}_samples.csv"
     resources:
-        mem_mb=1000,
+        mem_mb=2000,
         runtime=15+MODEL_PARAMS["sample_mols"]//10000,
         slurm_extra="--gres=gpu:1"
     shell:
diff --git a/snakemake/Snakefile_model_eval b/snakemake/Snakefile_model_eval
index 97393fdd..10b4a1c8 100644
--- a/snakemake/Snakefile_model_eval
+++ b/snakemake/Snakefile_model_eval
@@ -25,8 +25,8 @@ rule all:
     input:
         calculate_outcomes_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_calculate_outcomes.csv",
             fold=range(FOLDS),repr=REPRESENTATIONS,dataset=DATASET,enum_factor=ENUM_FACTORS),
-        nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
-            repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
+        # nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
+          #  repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         train_discriminator=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_train_discriminator_.csv",
             repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         freq_distribution=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_freq_distribution.csv",
diff --git a/snakemake/config.json b/snakemake/config.json
index aa9f806c..7f13d609 100644
--- a/snakemake/config.json
+++ b/snakemake/config.json
@@ -1,13 +1,13 @@
 {
-  "output_dir": "data",
-  "dataset": "/path/to/<dataset>.txt",
-  "pubchem_tsv_file": "/path/to/PubChem.tsv",
+  "output_dir": "/Genomics/argo/users/aa9078/PED_1",
+  "dataset": "/Genomics/argo/users/aa9078/data/prior/raw/PED.csv",
+  "pubchem_tsv_file": "/Genomics/singhlab/vineetb/CLM/snakemake/data/PubChem_with_fps.tsv",
 
   "representations": ["SMILES"],
   "folds": 10,
-  "train_seeds": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+  "train_seeds": [0],
   "sample_seeds": [0],
-  "enum_factors": [0, 10, 30, 50, 100],
+  "enum_factors": [0, 1],
   "max_input_smiles": 0,
 
   "model_params": {
@@ -30,5 +30,5 @@
   "min_tc": 0,
   "top_k": 30,
   "err_ppm": 10,
-  "random_seed": null
+  "random_seed": 12345
 }
diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
index 7239fbd8..c85509a7 100644
--- a/snakemake/profiles/config.yaml
+++ b/snakemake/profiles/config.yaml
@@ -6,11 +6,10 @@ printshellcmds: true
 
 # cluster specific settings
 cluster:
-  makedir -p logs/{rules} &&
   sbatch 
     --cpus-per-task={threads} 
-    --mem={resources.mem}M 
-    --time={resources.time} --output=slurm_out/%x-%A 
+    --mem={resources.mem_mb}
+    --time={resources.runtime} --output=slurm_out/%x-%A 
     --job-name={rule} --parsable
     --partition={resources.partition}
 cluster-status: "slurm-status.py"
@@ -23,4 +22,4 @@ max-jobs-per-second: 1
 max-status-checks-per-second: 10
 local-cores: 4  # maximum local jobs to run
 default-resources:
-  partition=main,hoppertest,skinniderlab
\ No newline at end of file
+  partition=main,hoppertest,skinniderlab
diff --git a/snakemake/profiles/slurm-status.py b/snakemake/profiles/slurm-status.py
old mode 100644
new mode 100755
diff --git a/src/clm/plot/train_discriminator.py b/src/clm/plot/train_discriminator.py
index 32729308..2f99a954 100644
--- a/src/clm/plot/train_discriminator.py
+++ b/src/clm/plot/train_discriminator.py
@@ -86,7 +86,7 @@ def plot(outcome_dir, output_dir):
     # Make output directory if it doesn't exist yet
     os.makedirs(output_dir, exist_ok=True)
 
-    outcome_files = glob.glob(f"{outcome_dir}/*train_discriminator.csv")
+    outcome_files = glob.glob(f"{outcome_dir}/*train_discriminator_.csv")
     outcome = pd.concat(
         [pd.read_csv(outcome_file, delimiter=",") for outcome_file in outcome_files]
     )

From bdfc36dec3a1ec724df8598ce2b57ea87dc9c4c1 Mon Sep 17 00:00:00 2001
From: anushka255 <aa9078@princeton.edu>
Date: Wed, 24 Apr 2024 13:15:36 -0400
Subject: [PATCH 3/5] Apply pre-commit formatting fixes; ignore W605 in flake8

---
 .flake8                            |  2 +-
 snakemake/profiles/config.yaml     | 10 ++++-----
 snakemake/profiles/slurm-status.py | 34 ++++++++++++++++++------------
 3 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/.flake8 b/.flake8
index 6ad40a51..e84c7792 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-extend-ignore = E203,E501
+extend-ignore = E203,E501,W605
 max-complexity = 99
 max-line-length = 88
 exclude = .git,__pycache__,build,dist
diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
index 7239fbd8..c7a60e51 100644
--- a/snakemake/profiles/config.yaml
+++ b/snakemake/profiles/config.yaml
@@ -7,10 +7,10 @@ printshellcmds: true
 # cluster specific settings
 cluster:
   makedir -p logs/{rules} &&
-  sbatch 
-    --cpus-per-task={threads} 
-    --mem={resources.mem}M 
-    --time={resources.time} --output=slurm_out/%x-%A 
+  sbatch
+    --cpus-per-task={threads}
+    --mem={resources.mem}M
+    --time={resources.time} --output=slurm_out/%x-%A
     --job-name={rule} --parsable
     --partition={resources.partition}
 cluster-status: "slurm-status.py"
@@ -23,4 +23,4 @@ max-jobs-per-second: 1
 max-status-checks-per-second: 10
 local-cores: 4  # maximum local jobs to run
 default-resources:
-  partition=main,hoppertest,skinniderlab
\ No newline at end of file
+  partition=main,hoppertest,skinniderlab
diff --git a/snakemake/profiles/slurm-status.py b/snakemake/profiles/slurm-status.py
index 831d4149..8eb0cbc8 100644
--- a/snakemake/profiles/slurm-status.py
+++ b/snakemake/profiles/slurm-status.py
@@ -5,6 +5,7 @@
 import sys
 import time
 import logging
+
 logger = logging.getLogger("__name__")
 
 STATUS_ATTEMPTS = 20
@@ -14,16 +15,21 @@
 for i in range(STATUS_ATTEMPTS):
     try:
         sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
-        res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")}
+        res = {
+            x.split("|")[0]: x.split("|")[1]
+            for x in sacct_res.decode().strip().split("\n")
+        }
         break
     except sp.CalledProcessError as e:
         logger.error("sacct process error")
         logger.error(e)
-    except IndexError as e:
+    except IndexError:
         pass
     # Try getting job with scontrol instead in case sacct is misconfigured
     try:
-        sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid)))
+        sctrl_res = sp.check_output(
+            shlex.split("scontrol -o show job {}".format(jobid))
+        )
         m = re.search("JobState=(\w+)", sctrl_res.decode())
         res = {jobid: m.group(1)}
         break
@@ -38,26 +44,26 @@
 
 status = res[jobid]
 
-if (status == "BOOT_FAIL"):
+if status == "BOOT_FAIL":
     print("failed")
-elif (status == "OUT_OF_MEMORY"):
+elif status == "OUT_OF_MEMORY":
     print("failed")
-elif (status.startswith("CANCELLED")):
+elif status.startswith("CANCELLED"):
     print("failed")
-elif (status == "COMPLETED"):
+elif status == "COMPLETED":
     print("success")
-elif (status == "DEADLINE"):
+elif status == "DEADLINE":
     print("failed")
-elif (status == "FAILED"):
+elif status == "FAILED":
     print("failed")
-elif (status == "NODE_FAIL"):
+elif status == "NODE_FAIL":
     print("failed")
-elif (status == "PREEMPTED"):
+elif status == "PREEMPTED":
     print("failed")
-elif (status == "TIMEOUT"):
+elif status == "TIMEOUT":
     print("failed")
 # Unclear whether SUSPENDED should be treated as running or failed
-elif (status == "SUSPENDED"):
+elif status == "SUSPENDED":
     print("failed")
 else:
-    print("running")
\ No newline at end of file
+    print("running")

From 51b7bba289def970f7aba2724a5b987ea640b5ed Mon Sep 17 00:00:00 2001
From: Anushka Acharya <aa9078@princeton.edu>
Date: Tue, 30 Apr 2024 09:48:52 -0400
Subject: [PATCH 4/5] Raise sample_molecules_RNN mem_mb to 16000; restore M suffix on --mem

---
 snakemake/Snakefile            | 2 +-
 snakemake/profiles/config.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/snakemake/Snakefile b/snakemake/Snakefile
index 466105d0..2c46a3dc 100644
--- a/snakemake/Snakefile
+++ b/snakemake/Snakefile
@@ -195,7 +195,7 @@ rule sample_molecules_RNN:
     output:
         output_file=f"{OUTPUT_DIR}/{{enum_factor}}/prior/samples/{{dataset}}_{{repr}}_{{fold}}_{{train_seed}}_{{sample_seed}}_samples.csv"
     resources:
-        mem_mb=2000,
+        mem_mb=16000,
         runtime=15+MODEL_PARAMS["sample_mols"]//10000,
         slurm_extra="--gres=gpu:1"
     shell:
diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
index c85509a7..7efb55c1 100644
--- a/snakemake/profiles/config.yaml
+++ b/snakemake/profiles/config.yaml
@@ -8,7 +8,7 @@ printshellcmds: true
 cluster:
   sbatch 
     --cpus-per-task={threads} 
-    --mem={resources.mem_mb}
+    --mem={resources.mem_mb}M 
     --time={resources.runtime} --output=slurm_out/%x-%A 
     --job-name={rule} --parsable
     --partition={resources.partition}

From 0cecbd73e182b96cf5cd98f235b10ff1743a1f44 Mon Sep 17 00:00:00 2001
From: anushka255 <aa9078@princeton.edu>
Date: Tue, 30 Apr 2024 11:21:37 -0400
Subject: [PATCH 5/5] Revert user-specific settings; strip trailing whitespace in profile

---
 snakemake/Snakefile                 |  2 +-
 snakemake/Snakefile_model_eval      |  4 ++--
 snakemake/config.json               | 12 ++++++------
 snakemake/profiles/config.yaml      |  8 ++++----
 src/clm/plot/train_discriminator.py |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/snakemake/Snakefile b/snakemake/Snakefile
index 2c46a3dc..e452a249 100644
--- a/snakemake/Snakefile
+++ b/snakemake/Snakefile
@@ -195,7 +195,7 @@ rule sample_molecules_RNN:
     output:
         output_file=f"{OUTPUT_DIR}/{{enum_factor}}/prior/samples/{{dataset}}_{{repr}}_{{fold}}_{{train_seed}}_{{sample_seed}}_samples.csv"
     resources:
-        mem_mb=16000,
+        mem_mb=1000,
         runtime=15+MODEL_PARAMS["sample_mols"]//10000,
         slurm_extra="--gres=gpu:1"
     shell:
diff --git a/snakemake/Snakefile_model_eval b/snakemake/Snakefile_model_eval
index 10b4a1c8..6a3ea0ac 100644
--- a/snakemake/Snakefile_model_eval
+++ b/snakemake/Snakefile_model_eval
@@ -25,8 +25,8 @@ rule all:
     input:
         calculate_outcomes_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_calculate_outcomes.csv",
             fold=range(FOLDS),repr=REPRESENTATIONS,dataset=DATASET,enum_factor=ENUM_FACTORS),
-        # nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
-          #  repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
+        nn_tc_file=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_write_nn_tc.csv",
+           repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         train_discriminator=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_train_discriminator_.csv",
             repr=REPRESENTATIONS,fold=range(FOLDS),dataset=DATASET,enum_factor=ENUM_FACTORS),
         freq_distribution=expand(f"{OUTPUT_DIR}/model_evaluation/{{enum_factor}}/{{dataset}}_{{repr}}_{{fold}}_freq_distribution.csv",
diff --git a/snakemake/config.json b/snakemake/config.json
index 7f13d609..aa9f806c 100644
--- a/snakemake/config.json
+++ b/snakemake/config.json
@@ -1,13 +1,13 @@
 {
-  "output_dir": "/Genomics/argo/users/aa9078/PED_1",
-  "dataset": "/Genomics/argo/users/aa9078/data/prior/raw/PED.csv",
-  "pubchem_tsv_file": "/Genomics/singhlab/vineetb/CLM/snakemake/data/PubChem_with_fps.tsv",
+  "output_dir": "data",
+  "dataset": "/path/to/<dataset>.txt",
+  "pubchem_tsv_file": "/path/to/PubChem.tsv",
 
   "representations": ["SMILES"],
   "folds": 10,
-  "train_seeds": [0],
+  "train_seeds": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
   "sample_seeds": [0],
-  "enum_factors": [0, 1],
+  "enum_factors": [0, 10, 30, 50, 100],
   "max_input_smiles": 0,
 
   "model_params": {
@@ -30,5 +30,5 @@
   "min_tc": 0,
   "top_k": 30,
   "err_ppm": 10,
-  "random_seed": 12345
+  "random_seed": null
 }
diff --git a/snakemake/profiles/config.yaml b/snakemake/profiles/config.yaml
index 7efb55c1..f7b911b7 100644
--- a/snakemake/profiles/config.yaml
+++ b/snakemake/profiles/config.yaml
@@ -6,10 +6,10 @@ printshellcmds: true
 
 # cluster specific settings
 cluster:
-  sbatch 
-    --cpus-per-task={threads} 
-    --mem={resources.mem_mb}M 
-    --time={resources.runtime} --output=slurm_out/%x-%A 
+  sbatch
+    --cpus-per-task={threads}
+    --mem={resources.mem_mb}M
+    --time={resources.runtime} --output=slurm_out/%x-%A
     --job-name={rule} --parsable
     --partition={resources.partition}
 cluster-status: "slurm-status.py"
diff --git a/src/clm/plot/train_discriminator.py b/src/clm/plot/train_discriminator.py
index 2f99a954..32729308 100644
--- a/src/clm/plot/train_discriminator.py
+++ b/src/clm/plot/train_discriminator.py
@@ -86,7 +86,7 @@ def plot(outcome_dir, output_dir):
     # Make output directory if it doesn't exist yet
     os.makedirs(output_dir, exist_ok=True)
 
-    outcome_files = glob.glob(f"{outcome_dir}/*train_discriminator_.csv")
+    outcome_files = glob.glob(f"{outcome_dir}/*train_discriminator.csv")
     outcome = pd.concat(
         [pd.read_csv(outcome_file, delimiter=",") for outcome_file in outcome_files]
     )