diff --git a/README.md b/README.md index 95b3a02..fdf2204 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ removing human and other contaminating sequence from high-throughput DNA sequence data. +* python3.5 version +* works for both paired-end and single-end reads + Alignment tools supported: * bwa @@ -19,4 +22,3 @@ Tools to be added later: * BMTagger * Blat - diff --git a/decontamlib/main.py b/decontamlib/main.py index 6070ea9..98b27b9 100644 --- a/decontamlib/main.py +++ b/decontamlib/main.py @@ -19,7 +19,7 @@ # "bwa_fp":"bwa", # "num_threads":8 # } - +# # if user_config_file is None: # if organism == "human": # default_user_config_fp = os.path.expanduser("~/.decontam_human.json") @@ -27,7 +27,7 @@ # default_user_config_fp = os.path.expanduser("~/.decontam_phix.json") # if os.path.exists(default_user_config_fp): # user_config_file = open(default_user_config_fp) - +# # if user_config_file is not None: # user_config = json.load(user_config_file) # config.update(user_config) @@ -53,7 +53,7 @@ def human_filter_main(argv=None): type=argparse.FileType("r"), help="FASTQ file of forward reads") p.add_argument( - "--reverse-reads", required=True, + "--reverse-reads", required=False, type=argparse.FileType("r"), help="FASTQ file of reverse reads") p.add_argument( @@ -105,9 +105,13 @@ def human_filter_main(argv=None): config = get_config(args, args.organism) fwd_fp = args.forward_reads.name - rev_fp = args.reverse_reads.name args.forward_reads.close() - args.reverse_reads.close() + + if args.reverse_reads is None: + rev_fp = None + else: + rev_fp = args.reverse_reads.name + args.reverse_reads.close() if args.sam_file is not None: config["method"] = "samfile" diff --git a/decontamlib/tools.py b/decontamlib/tools.py index a0190bf..fc6a42b 100644 --- a/decontamlib/tools.py +++ b/decontamlib/tools.py @@ -7,9 +7,8 @@ import shutil import subprocess import tempfile -import decontamlib.utils as utils - +import decontamlib as utils from decontamlib.fastq import FastqSplitter from decontamlib.sam import get_mapped_reads @@ -36,8 +35,9 @@ def decontaminate(self, fwd_fp, rev_fp, output_dir, organism, pct, frac): annotations = self.annotate(fwd_fp, rev_fp, pct, frac, output_dir) with FastqSplitter(fwd_fp, output_dir) as s: s.partition(annotations, organism) - with FastqSplitter(rev_fp, output_dir) as s: - s.partition(annotations, organism) + if rev_fp is not None: + with FastqSplitter(rev_fp, output_dir) as s: + s.partition(annotations, organism) summary_data = summarize_annotations(annotations) return summary_data @@ -83,7 +83,10 @@ def annotate(self, R1, R2, pct, frac, output_dir): return [(id, True if id in mapped else False) for id in ids] def _command(self, fwd_fp, rev_fp): - return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp] + if rev_fp is None: + return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp] + else: + return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp] def _run(self, R1, R2, output_dir): if self.keep_sam_file: diff --git a/scripts/decontaminate.py b/scripts/decontaminate.py index e43b968..e38a393 100755 --- a/scripts/decontaminate.py +++ b/scripts/decontaminate.py @@ -1,3 +1,3 @@ -#!/usr/bin/python +#!/usr/bin/env python from decontamlib.main import human_filter_main human_filter_main() diff --git a/scripts/make_index.py b/scripts/make_index.py index a5c183d..7d54363 100644 --- a/scripts/make_index.py +++ b/scripts/make_index.py @@ -1,3 +1,3 @@ -#!/usr/bin/python +#!/usr/bin/env python from decontamlib.main import make_index_main make_index_main() diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..6d35d7e --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,46 @@ +import json +from subprocess import call +import sys, os + +#define abs path +path = os.path.abspath(os.path.dirname(sys.argv[0])) +data_path = path + "/data" + +# create directories for summary files and output files if they don't exist +log_path = data_path + "/log" +if not os.path.exists(log_path): + os.mkdir(log_path) + +output_path = data_path + "/output" +if not os.path.exists(output_path): + os.mkdir(output_path) + +# for pct between 0.0 and 1.0 +for p in map(lambda x: x/10.0,range(0,11)): + #for frac between 0.0 and 1.0 + for f in map(lambda x: x/10.0,range(0,11)): + # print pct and frac + print("pct: " + str(p) + " frac: " + str(f)) + # name summary files + hum_sum_file = "{}/humanseqs_{}_{}.json".format(log_path, p, f) + nonhum_sum_file = "{}/nonhumanseqs_{}_{}.json".format(log_path, p, f) + # run decontam over human seqs + call(["decontaminate.py", "--forward-reads", data_path + "/humanseqs.fastq", "--organism", "human", "--pct", str(p), "--frac", str(f), "--output-dir", output_path, "--summary-file", hum_sum_file]) + # run decontam over nonhuman seqs + call(["decontaminate.py", "--forward-reads", data_path + "/nonhumanseqs.fastq", "--organism", "human", "--pct", str(p), "--frac", str(f), "--output-dir", output_path, "--summary-file", nonhum_sum_file]) + # from human seqs summary file, read and print number of true positive ("true" in data) and false negatives ("false" in data) + # from nonhuman seqs summary file, read and print number of false positives ("true" in data) and true negatives ("false" in data) + message = ["tru", "fal", "tru"] + for i, file in enumerate([hum_sum_file, nonhum_sum_file]): + with open(file) as f: + data = json.loads(f.read())["data"] + mess = "{} pos: ".format(message[i]) + if "true" in data: + print("{}{}".format(mess, data["true"])) + else: + print("{}0".format(mess)) + mess = "{} neg: ".format(message[i+1]) + if "false" in data: + print("{}{}".format(mess, data["false"])) + else: + print("{}0".format(mess)) diff --git a/tests/test_main.py b/tests/test_main.py index 67c8039..d9150c3 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -62,14 +62,17 @@ def tearDown(self): def test_with_sam_file(self): sam_file = tempfile.NamedTemporaryFile(suffix=".sam", mode="w") - sam_file.write(b5_sam) + sam_file.write(b5_sam.encode()) sam_file.seek(0) self.args.extend(["--sam-file", sam_file.name]) # Set executable for BWA to "false" # This program always returns non-zero exit status - self.args.extend(["--method", "bwa"]) - self.args.extend(["--bwa_fp", "false"]) + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config = {"method":"bwa", "bwa_fp":"false"} + config_file.write(json.dumps(config).encode()) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) human_filter_main(self.args) @@ -82,8 +85,10 @@ def test_with_sam_file(self): def test_keep_sam_file(self): index_fp = os.path.join(data_dir, "fakehuman") - self.args.extend(["--method", "bwa"]) - self.args.extend(["--index", index_fp]) + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config_file.write(json.dumps({'method':'bwa', 'index':index_fp}).encode()) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) self.args.extend(["--keep-sam-file"]) human_filter_main(self.args) @@ -92,7 +97,11 @@ def test_keep_sam_file(self): def test_all_human(self): - self.args.extend(["--method", "all_human"]) + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config_file.write(json.dumps({"method": "all_human"}).encode()) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) + human_filter_main(self.args) with open(self.summary_fp) as f: @@ -109,7 +118,11 @@ def test_all_human(self): def test_no_human(self): - self.args.extend(["--method", "no_human"]) + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config_file.write(str.encode(json.dumps({"method": "no_human"}))) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) + human_filter_main(self.args) with open(self.summary_fp) as f: @@ -127,8 +140,11 @@ def test_no_human(self): def test_bowtie(self): index_fp = os.path.join(data_dir, "fakehuman") - self.args.extend(["--method", "bowtie2"]) - self.args.extend(["--index", index_fp]) + index_fp = os.path.join(data_dir, "fakehuman") + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config_file.write(json.dumps({"method": "bowtie2", "index": index_fp}).encode()) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) human_filter_main(self.args) @@ -141,8 +157,11 @@ def test_bowtie(self): def test_bwa(self): index_fp = os.path.join(data_dir, "fakehuman") - self.args.extend(["--method", "bwa"]) - self.args.extend(["--index", index_fp]) + index_fp = os.path.join(data_dir, "fakehuman") + config_file = tempfile.NamedTemporaryFile(suffix=".json") + config_file.write(json.dumps({"method": "bwa", "index": index_fp}).encode()) + config_file.seek(0) + self.args.extend(["--config-file", config_file.name]) human_filter_main(self.args) diff --git a/tests/test_utils.py b/tests/test_utils.py index a09d480..cf9b565 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,7 @@ class TestIdExtract(unittest.TestCase): def setUp(self): self.fastq = tempfile.NamedTemporaryFile(mode="w") - self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n") + self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n".encode()) self.fastq.seek(0) def test_ids(self): @@ -40,10 +40,10 @@ def setUp(self): self.wide_annotation = utils.add_tool_sample("tool", "sample", self.human_annotation) def test_number_or_rows(self): - self.assertEqual(len(self.human_annotation), len(list(self.wide_annotation))) + self.assertEqual(len(list(self.human_annotation)), len(list(self.wide_annotation))) def test_number_or_columns(self): - self.assertEqual(len(self.human_annotation[0]) + 2, len(list(self.wide_annotation)[0])) + self.assertEqual(len(list(self.human_annotation)[0]) + 2, len(list(self.wide_annotation)[0])) if __name__ == "__main__":