Skip to content
This repository was archived by the owner on May 2, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
removing human and other contaminating sequence from high-throughput
DNA sequence data.

* Python 3.5 version
* works for both paired-end and single-end reads

Alignment tools supported:

* bwa
Expand All @@ -19,4 +22,3 @@ Tools to be added later:

* BMTagger
* Blat

14 changes: 9 additions & 5 deletions decontamlib/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
# "bwa_fp":"bwa",
# "num_threads":8
# }

#
# if user_config_file is None:
# if organism == "human":
# default_user_config_fp = os.path.expanduser("~/.decontam_human.json")
# elif organism == "phix":
# default_user_config_fp = os.path.expanduser("~/.decontam_phix.json")
# if os.path.exists(default_user_config_fp):
# user_config_file = open(default_user_config_fp)

#
# if user_config_file is not None:
# user_config = json.load(user_config_file)
# config.update(user_config)
Expand All @@ -53,7 +53,7 @@ def human_filter_main(argv=None):
type=argparse.FileType("r"),
help="FASTQ file of forward reads")
p.add_argument(
"--reverse-reads", required=True,
"--reverse-reads", required=False,
type=argparse.FileType("r"),
help="FASTQ file of reverse reads")
p.add_argument(
Expand Down Expand Up @@ -105,9 +105,13 @@ def human_filter_main(argv=None):
config = get_config(args, args.organism)

fwd_fp = args.forward_reads.name
rev_fp = args.reverse_reads.name
args.forward_reads.close()
args.reverse_reads.close()

if args.reverse_reads is None:
rev_fp = None
else:
rev_fp = args.reverse_reads.name
args.reverse_reads.close()

if args.sam_file is not None:
config["method"] = "samfile"
Expand Down
13 changes: 8 additions & 5 deletions decontamlib/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
import shutil
import subprocess
import tempfile
import decontamlib.utils as utils


import decontamlib as utils
from decontamlib.fastq import FastqSplitter
from decontamlib.sam import get_mapped_reads

Expand All @@ -36,8 +35,9 @@ def decontaminate(self, fwd_fp, rev_fp, output_dir, organism, pct, frac):
annotations = self.annotate(fwd_fp, rev_fp, pct, frac, output_dir)
with FastqSplitter(fwd_fp, output_dir) as s:
s.partition(annotations, organism)
with FastqSplitter(rev_fp, output_dir) as s:
s.partition(annotations, organism)
if rev_fp is not None:
with FastqSplitter(rev_fp, output_dir) as s:
s.partition(annotations, organism)
summary_data = summarize_annotations(annotations)
return summary_data

Expand Down Expand Up @@ -83,7 +83,10 @@ def annotate(self, R1, R2, pct, frac, output_dir):
return [(id, True if id in mapped else False) for id in ids]

def _command(self, fwd_fp, rev_fp):
return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp]
if rev_fp is None:
return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp]
else:
return [self.bwa_fp, "mem", "-M", "-t", str(self.num_threads), self.index, fwd_fp, rev_fp]

def _run(self, R1, R2, output_dir):
if self.keep_sam_file:
Expand Down
2 changes: 1 addition & 1 deletion scripts/decontaminate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/usr/bin/python
#!/usr/bin/env python
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should be able to remove the scripts directory, and instead have setup.py make the scripts for us. If we make this switch, Python will wire up the correct interpreter for the scripts it creates.

For an example, see the console_scripts argument here:
https://github.com/kylebittinger/unassigner/blob/master/setup.py

from decontamlib.main import human_filter_main
human_filter_main()
2 changes: 1 addition & 1 deletion scripts/make_index.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/usr/bin/python
#!/usr/bin/env python
from decontamlib.main import make_index_main
make_index_main()
46 changes: 46 additions & 0 deletions tests/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
from subprocess import call
import sys, os

# Benchmark sweep: run decontaminate.py over a known-human and a known-nonhuman
# FASTQ file for every (pct, frac) combination and print the resulting
# true/false positive/negative counts read back from the summary files.

# Values swept for both --pct and --frac: 0.0, 0.1, ..., 1.0.
THRESHOLDS = [step / 10.0 for step in range(11)]

# One entry per input file:
#   (FASTQ file name, summary-file prefix, label for "true", label for "false")
# Every read in humanseqs.fastq is human, so a "true" call is a true positive
# and a "false" call is a false negative. For nonhumanseqs.fastq it is the
# other way round: "true" is a false positive, "false" is a true negative.
DATASETS = [
    ("humanseqs.fastq", "humanseqs", "tru", "fal"),
    ("nonhumanseqs.fastq", "nonhumanseqs", "fal", "tru"),
]


def ensure_dir(path):
    """Create directory *path* if nothing exists at that path yet."""
    if not os.path.exists(path):
        os.mkdir(path)


def run_decontam(fastq_fp, pct, frac, output_path, summary_fp):
    """Run decontaminate.py on one single-end FASTQ file.

    Raises RuntimeError when the command exits with a non-zero status.
    Without this check a failed run would go unnoticed and the report step
    would read a summary file left over from an earlier sweep (or fail with
    an unrelated "file not found" error).
    """
    command = [
        "decontaminate.py",
        "--forward-reads", fastq_fp,
        "--organism", "human",
        "--pct", str(pct),
        "--frac", str(frac),
        "--output-dir", output_path,
        "--summary-file", summary_fp,
    ]
    status = call(command)
    if status != 0:
        raise RuntimeError(
            "decontaminate.py exited with status {} for {} "
            "(pct={}, frac={})".format(status, fastq_fp, pct, frac))


def read_counts(summary_fp):
    """Return the "data" entry of the JSON summary file at *summary_fp*."""
    with open(summary_fp) as handle:
        return json.load(handle)["data"]


def summary_lines(data, pos_label, neg_label):
    """Build the two report lines for one summary's "data" mapping.

    A missing "true" or "false" key is reported as a count of 0.
    """
    return [
        "{} pos: {}".format(pos_label, data.get("true", 0)),
        "{} neg: {}".format(neg_label, data.get("false", 0)),
    ]


def main():
    """Run the full pct/frac sweep and print the counts for each setting."""
    # Input data lives in a "data" directory next to this script.
    base_path = os.path.abspath(os.path.dirname(sys.argv[0]))
    data_path = os.path.join(base_path, "data")

    # Create directories for summary files and output files if needed.
    log_path = os.path.join(data_path, "log")
    output_path = os.path.join(data_path, "output")
    ensure_dir(log_path)
    ensure_dir(output_path)

    for pct in THRESHOLDS:
        for frac in THRESHOLDS:
            print("pct: " + str(pct) + " frac: " + str(frac))

            # Run both datasets first, then report, so the printed counts for
            # one setting stay together after the tool's own output.
            finished = []
            for fastq_name, prefix, pos_label, neg_label in DATASETS:
                summary_fp = os.path.join(
                    log_path, "{}_{}_{}.json".format(prefix, pct, frac))
                run_decontam(
                    os.path.join(data_path, fastq_name),
                    pct, frac, output_path, summary_fp)
                finished.append((summary_fp, pos_label, neg_label))

            for summary_fp, pos_label, neg_label in finished:
                counts = read_counts(summary_fp)
                for line in summary_lines(counts, pos_label, neg_label):
                    print(line)


if __name__ == "__main__":
    main()
41 changes: 30 additions & 11 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,17 @@ def tearDown(self):

def test_with_sam_file(self):
sam_file = tempfile.NamedTemporaryFile(suffix=".sam", mode="w")
sam_file.write(b5_sam)
sam_file.write(b5_sam.encode())
sam_file.seek(0)
self.args.extend(["--sam-file", sam_file.name])

# Set executable for BWA to "false"
# This program always returns non-zero exit status
self.args.extend(["--method", "bwa"])
self.args.extend(["--bwa_fp", "false"])
config_file = tempfile.NamedTemporaryFile(suffix=".json")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Config files are probably more trouble than they're worth. Let's not remove the config files now, but just regard this as a sign that things will be much easier once we do.

config = {"method":"bwa", "bwa_fp":"false"}
config_file.write(json.dumps(config).encode())
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

human_filter_main(self.args)

Expand All @@ -82,8 +85,10 @@ def test_with_sam_file(self):

def test_keep_sam_file(self):
index_fp = os.path.join(data_dir, "fakehuman")
self.args.extend(["--method", "bwa"])
self.args.extend(["--index", index_fp])
config_file = tempfile.NamedTemporaryFile(suffix=".json")
config_file.write(json.dumps({'method':'bwa', 'index':index_fp}).encode())
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

self.args.extend(["--keep-sam-file"])
human_filter_main(self.args)
Expand All @@ -92,7 +97,11 @@ def test_keep_sam_file(self):


def test_all_human(self):
self.args.extend(["--method", "all_human"])
config_file = tempfile.NamedTemporaryFile(suffix=".json")
config_file.write(json.dumps({"method": "all_human"}).encode())
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

human_filter_main(self.args)

with open(self.summary_fp) as f:
Expand All @@ -109,7 +118,11 @@ def test_all_human(self):


def test_no_human(self):
self.args.extend(["--method", "no_human"])
config_file = tempfile.NamedTemporaryFile(suffix=".json")
config_file.write(str.encode(json.dumps({"method": "no_human"})))
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

human_filter_main(self.args)

with open(self.summary_fp) as f:
Expand All @@ -127,8 +140,11 @@ def test_no_human(self):

def test_bowtie(self):
index_fp = os.path.join(data_dir, "fakehuman")
self.args.extend(["--method", "bowtie2"])
self.args.extend(["--index", index_fp])
index_fp = os.path.join(data_dir, "fakehuman")
config_file = tempfile.NamedTemporaryFile(suffix=".json")
config_file.write(json.dumps({"method": "bowtie2", "index": index_fp}).encode())
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

human_filter_main(self.args)

Expand All @@ -141,8 +157,11 @@ def test_bowtie(self):

def test_bwa(self):
index_fp = os.path.join(data_dir, "fakehuman")
self.args.extend(["--method", "bwa"])
self.args.extend(["--index", index_fp])
index_fp = os.path.join(data_dir, "fakehuman")
config_file = tempfile.NamedTemporaryFile(suffix=".json")
config_file.write(json.dumps({"method": "bwa", "index": index_fp}).encode())
config_file.seek(0)
self.args.extend(["--config-file", config_file.name])

human_filter_main(self.args)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class TestIdExtract(unittest.TestCase):
def setUp(self):
self.fastq = tempfile.NamedTemporaryFile(mode="w")
self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n")
self.fastq.write("@1\nACG\n+\nEEE\n@2\nTTTT\n+\nEEEE\n@3\nANNN\n+\nEEEE\n".encode())
self.fastq.seek(0)

def test_ids(self):
Expand Down Expand Up @@ -40,10 +40,10 @@ def setUp(self):
self.wide_annotation = utils.add_tool_sample("tool", "sample", self.human_annotation)

def test_number_or_rows(self):
self.assertEqual(len(self.human_annotation), len(list(self.wide_annotation)))
self.assertEqual(len(list(self.human_annotation)), len(list(self.wide_annotation)))

def test_number_or_columns(self):
self.assertEqual(len(self.human_annotation[0]) + 2, len(list(self.wide_annotation)[0]))
self.assertEqual(len(list(self.human_annotation)[0]) + 2, len(list(self.wide_annotation)[0]))


if __name__ == "__main__":
Expand Down