PennChopMicrobiomeProgram · zhaoc1 · Jan 17, 2018 · Jan 17, 2018 · Jan 17, 2018 · Jan 17, 2018
diff --git a/README.md b/README.md
@@ -4,6 +4,9 @@
 removing human and other contaminating sequence from high-throughput
 DNA sequence data.
 
+* python3.5 version
+* works for both paired-end and single-end reads
+
 Alignment tools supported:
 
 * bwa
@@ -19,4 +22,3 @@ Tools to be added later:
 
 * BMTagger
 * Blat
-
diff --git a/decontamlib/main.py b/decontamlib/main.py
@@ -19,15 +19,15 @@
 #         "bwa_fp":"bwa",
 #         "num_threads":8
 #     }
-
+#
 #     if user_config_file is None:
 #         if organism == "human":
 #             default_user_config_fp = os.path.expanduser("~/.decontam_human.json")
 #         elif organism == "phix":
 #             default_user_config_fp = os.path.expanduser("~/.decontam_phix.json")
 #         if os.path.exists(default_user_config_fp):
 #             user_config_file = open(default_user_config_fp)
-
+#
 #     if user_config_file is not None:
 #         user_config = json.load(user_config_file)
 #         config.update(user_config)
@@ -53,7 +53,7 @@ def human_filter_main(argv=None):
         type=argparse.FileType("r"),
         help="FASTQ file of forward reads")
     p.add_argument(
-        "--reverse-reads", required=True,
+        "--reverse-reads", required=False,
         type=argparse.FileType("r"),
         help="FASTQ file of reverse reads")
     p.add_argument(
@@ -105,9 +105,13 @@ def human_filter_main(argv=None):
     config = get_config(args, args.organism)
 
     fwd_fp = args.forward_reads.name
-    rev_fp = args.reverse_reads.name
     args.forward_reads.close()
-    args.reverse_reads.close()
+
+    if args.reverse_reads is None:
+        rev_fp = None
+    else:
+        rev_fp = args.reverse_reads.name
+        args.reverse_reads.close()
 
     if args.sam_file is not None:
         config["method"] = "samfile"

diff --git a/decontamlib/tools.py b/decontamlib/tools.py
@@ -7,9 +7,8 @@
 import shutil
 import subprocess
 import tempfile
-import decontamlib.utils as utils
-
 
+import decontamlib as utils
 from decontamlib.fastq import FastqSplitter
 from decontamlib.sam import get_mapped_reads
 
@@ -36,8 +35,9 @@ def decontaminate(self, fwd_fp, rev_fp, output_dir, organism, pct, frac):
         annotations = self.annotate(fwd_fp, rev_fp, pct, frac, output_dir)
         with FastqSplitter(fwd_fp, output_dir) as s:
             s.partition(annotations, organism)
-        with FastqSplitter(rev_fp, output_dir) as s:
-            s.partition(annotations, organism)
+        if rev_fp is not None:
+            with FastqSplitter(rev_fp, output_dir) as s:
+                s.partition(annotations, organism)
         summary_data = summarize_annotations(annotations)
         return summary_data
 
@@ -83,7 +83,10 @@ def annotate(self, R1, R2, pct, frac, output_dir):
         return [(id, True if id in mapped else False) for id in ids]
 
     def _command(self, fwd_fp, rev_fp):
-        return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp]
+        if rev_fp is None:
+            return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp]
+        else:
+            return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp]
 
     def _run(self, R1, R2, output_dir):
         if self.keep_sam_file:

diff --git a/scripts/decontaminate.py b/scripts/decontaminate.py
@@ -1,3 +1,3 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 from decontamlib.main import human_filter_main
 human_filter_main()
diff --git a/scripts/make_index.py b/scripts/make_index.py
@@ -1,3 +1,3 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 from decontamlib.main import make_index_main
 make_index_main()
diff --git a/tests/benchmark.py b/tests/benchmark.py
@@ -0,0 +1,46 @@
+import json
+from subprocess import call
+import sys, os
+
+#define abs path
+path = os.path.abspath(os.path.dirname(sys.argv[0]))
+data_path = path + "/data"
+
+# create directories for summary files and output files if they don't exist
+log_path = data_path + "/log"
+if not os.path.exists(log_path):
+    os.mkdir(log_path)
+
+output_path = data_path + "/output"
+if not os.path.exists(output_path):
+    os.mkdir(output_path)
+
+# for pct between 0.0 and 1.0
+for p in map(lambda x: x/10.0,range(0,11)):
+    #for frac between 0.0 and 1.0
+    for f in map(lambda x: x/10.0,range(0,11)):
+        # print pct and frac
+        print("pct: " + str(p) + " frac: " + str(f))
+        # name summary files
+        hum_sum_file = "{}/humanseqs_{}_{}.json".format(log_path, p, f)
+        nonhum_sum_file = "{}/nonhumanseqs_{}_{}.json".format(log_path, p, f)
+        # run decontam over human seqs
+        call(["decontaminate.py", "--forward-reads", data_path + "/humanseqs.fastq", "--organism", "human", "--pct", str(p), "--frac", str(f), "--output-dir", output_path, "--summary-file", hum_sum_file])
+        # run decontam over nonhuman seqs
+        call(["decontaminate.py", "--forward-reads", data_path + "/nonhumanseqs.fastq", "--organism", "human", "--pct", str(p), "--frac", str(f), "--output-dir", output_path, "--summary-file", nonhum_sum_file])
+        # from human seqs summary file, read and print number of true positive ("true" in data) and false negatives ("false" in data)  
+        # from nonhuman seqs summary file, read and print number of false positives ("true" in data) and true negatives ("false" in data)
+        message = ["tru", "fal", "tru"]
+        for i, file in enumerate([hum_sum_file, nonhum_sum_file]):
+            with open(file) as f:
+                data = json.loads(f.read())["data"]
+                mess = "{} pos: ".format(message[i])
+                if "true" in data:
+                    print("{}{}".format(mess, data["true"]))
+                else:
+                    print("{}0".format(mess))    
+                mess = "{} neg: ".format(message[i+1])
+                if "false" in data:
+                    print("{}{}".format(mess, data["false"]))
+                else:
+                    print("{}0".format(mess))
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -62,14 +62,17 @@ def tearDown(self):
 
     def test_with_sam_file(self):
         sam_file = tempfile.NamedTemporaryFile(suffix=".sam", mode="w")
-        sam_file.write(b5_sam)
+        sam_file.write(b5_sam.encode())
         sam_file.seek(0)
         self.args.extend(["--sam-file", sam_file.name])
 
         # Set executable for BWA to "false"
         # This program always returns non-zero exit status
-        self.args.extend(["--method", "bwa"])
-        self.args.extend(["--bwa_fp", "false"])
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config = {"method":"bwa", "bwa_fp":"false"}
+        config_file.write(json.dumps(config).encode())
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
 
         human_filter_main(self.args)
 
@@ -82,8 +85,10 @@ def test_with_sam_file(self):
 
     def test_keep_sam_file(self):
         index_fp = os.path.join(data_dir, "fakehuman")
-        self.args.extend(["--method", "bwa"])
-        self.args.extend(["--index", index_fp])
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config_file.write(json.dumps({'method':'bwa', 'index':index_fp}).encode())
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
 
         self.args.extend(["--keep-sam-file"])
         human_filter_main(self.args)
@@ -92,7 +97,11 @@ def test_keep_sam_file(self):
 
 
     def test_all_human(self):
-        self.args.extend(["--method", "all_human"])
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config_file.write(json.dumps({"method": "all_human"}).encode())
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
+
         human_filter_main(self.args)
 
         with open(self.summary_fp) as f:
@@ -109,7 +118,11 @@ def test_all_human(self):
 
 
     def test_no_human(self):
-        self.args.extend(["--method", "no_human"])
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config_file.write(str.encode(json.dumps({"method": "no_human"})))
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
+
         human_filter_main(self.args)
 
         with open(self.summary_fp) as f:
@@ -127,8 +140,11 @@ def test_no_human(self):
 
     def test_bowtie(self):
         index_fp = os.path.join(data_dir, "fakehuman")
-        self.args.extend(["--method", "bowtie2"])
-        self.args.extend(["--index", index_fp])
+        index_fp = os.path.join(data_dir, "fakehuman")
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config_file.write(json.dumps({"method": "bowtie2", "index": index_fp}).encode())
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
 
         human_filter_main(self.args)
 
@@ -141,8 +157,11 @@ def test_bowtie(self):
 
     def test_bwa(self):
         index_fp = os.path.join(data_dir, "fakehuman")
-        self.args.extend(["--method", "bwa"])
-        self.args.extend(["--index", index_fp])
+        index_fp = os.path.join(data_dir, "fakehuman")
+        config_file = tempfile.NamedTemporaryFile(suffix=".json")
+        config_file.write(json.dumps({"method": "bwa", "index": index_fp}).encode())
+        config_file.seek(0)
+        self.args.extend(["--config-file", config_file.name])
 
         human_filter_main(self.args)
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -11,7 +11,7 @@
 class TestIdExtract(unittest.TestCase):
     def setUp(self):
         self.fastq = tempfile.NamedTemporaryFile(mode="w")
-        self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n") 
+        self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n".encode()) 
         self.fastq.seek(0)
 
     def test_ids(self):
@@ -40,10 +40,10 @@ def setUp(self):
         self.wide_annotation = utils.add_tool_sample("tool", "sample", self.human_annotation)
 
     def test_number_or_rows(self):
-        self.assertEqual(len(self.human_annotation), len(list(self.wide_annotation)))
+        self.assertEqual(len(list(self.human_annotation)), len(list(self.wide_annotation)))
 
     def test_number_or_columns(self):
-        self.assertEqual(len(self.human_annotation[0]) + 2, len(list(self.wide_annotation)[0]))
+        self.assertEqual(len(list(self.human_annotation)[0]) + 2, len(list(self.wide_annotation)[0]))
 
 
 if __name__ == "__main__":