From 2ff7a83e90b4f0cfb6f2f24ad2ee2774594c1847 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Thu, 21 Nov 2019 13:29:05 -0500 Subject: [PATCH 1/7] Parametrize read simulation --- treetoreads.py | 119 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 35 deletions(-) diff --git a/treetoreads.py b/treetoreads.py index 96f9ca9..9c61fb4 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -43,7 +43,7 @@ def __init__(self, configfi, run=1, main=None): else: self.prefix = '' if self.run: - if (self.no_art == 0) and (self.config.get('coverage')): + if (self.no_art == 0) and all([self.config.get(x) for x in ['coverage_low', 'coverage_high', 'coverage_step']]): self.run_art() else: sys.stdout.write("simulating genomes but not reads\n") @@ -98,7 +98,9 @@ def read_args(self): 'base_genome_path', 'rate_matrix', 'freq_matrix', - 'coverage', + 'coverage_low', + 'coverage_high', + 'coverage_step', 'prefix', 'output_dir', 'mutation_clustering', @@ -106,10 +108,16 @@ def read_args(self): 'exponential_mean', 'gamma_shape', 'read_length', - 'fragment_size', - 'stdev_frag_size', - 'error_model1', - 'error_model2', + 'frag_low', + 'frag_high', + 'frag_step', + 'stdev_frag_low', + 'stdev_frag_high', + 'stdev_frag_step', + 'qs2_low', + 'qs2_high', + 'qs2_step', + 'MiSeqVersions', 'indel_model', 'indel_rate'] for lin in config: @@ -154,9 +162,21 @@ def _check_args(self): 'base_name':'base_genome_name', 'genome':'base_genome_path', 'ratemat':'rate_matrix', - 'cov':'coverage', - 'outd':'output_dir' - } + 'outd':'output_dir', + 'coverage_low' : 'coverage_low', + 'coverage_high' : 'coverage_high', + 'coverage_step' : 'coverage_step', + 'read_length' : 'read_length', + 'frag_low' : 'frag_low', + 'frag_high' : 'frag_high', + 'frag_step' : 'frag_step', + 'stdev_frag_low' : 'stdev_frag_low', + 'stdev_frag_high' : 'stdev_frag_high', + 'stdev_frag_step' : 'stdev_frag_step', + 'qs2_low' : 'qs2_low', + 'qs2_high' : 'qs2_high', + 'qs2_step' : 'qs2_step', + } for arg in self.argdict: if self.argdict[arg] not in self.config: sys.stderr.write("{} is missing from the config file".format(self.argdict[arg])) @@ -688,11 +708,26 @@ def run_art(self, coverage=None): else: self.mut_genomes_no_indels() if coverage is None: - coverarg = self.get_arg('cov') + cov_low = int(self.get_arg('coverage_low')) + cov_hi = int(self.get_arg('coverage_high')) + cov_step = int(self.get_arg('coverage_step')) + frag_low = int(self.get_arg('frag_low')) + frag_hi = int(self.get_arg('frag_high')) + frag_step = int(self.get_arg('frag_step')) + stdev_low = int(self.get_arg('stdev_frag_low')) + stdev_hi = int(self.get_arg('stdev_frag_high')) + stdev_step = int(self.get_arg('stdev_frag_step')) + qs2_low = int(self.get_arg('qs2_low')) + qs2_hi = int(self.get_arg('qs2_high')) + qs2_step = int(self.get_arg('qs2_step')) + miseq_v = list() + [miseq_v.append(int(x.strip())) for x in self.get_arg('MiSeqVersions').split(",")] + coverarg = None + else: coverarg = coverage - cov = {} - if os.path.isfile(coverarg): + read_spec = {} + if coverarg is not None and os.path.isfile(coverarg): with open(coverarg) as infile: for lin in infile: seqnam = lin.split(',')[0] @@ -701,34 +736,44 @@ def run_art(self, coverage=None): except: sys.stderr.write("name {} in coverage file not found in tree\n".format(seqnam)) self._exit_handler() - cov[seqnam] = int(lin.split(',')[1]) + read_spec[seqnam] = int(lin.split(',')[1]) try: assert set(cov.keys()) == set(self.seqnames) except: - sys.stderr.write("some tips missing from coverage file: {}\n".format(set(self.seqnames) - cov.keys())) + sys.stderr.write("some tips missing from coverage file: {}\n".format(set(self.seqnames) - read_spec.keys())) self._exit_handler() else: for seqnam in self.seqnames: - cov[seqnam] = int(coverarg) + real_cov = random.sample(range(cov_low, cov_hi, cov_step), 1)[0] + real_frag = random.sample(range(frag_low, frag_hi, frag_step), 1)[0] + real_stdev = random.sample(range(stdev_low, stdev_hi, stdev_step), 1)[0] + real_qs2 = random.sample(range(qs2_low, qs2_hi, qs2_step), 1)[0] + real_miseq = random.sample(miseq_v, 1)[0] + read_spec[seqnam] = {'cov' : real_cov, 'frag' : real_frag, "stdev" : real_stdev, 'qs2' : real_qs2, 'ver' : real_miseq} if not os.path.isdir("{}/fastq".format(self.outd)): os.mkdir("{}/fastq".format(self.outd)) - sys.stdout.write("coverage is {}\n".format(self.config['coverage'])) if 'read_length' in self.config: read_length = self.config['read_length'] + if ',' in read_length: + r_l = list() + [r_l.append(x.strip()) for x in read_length.split(',')] + for seqnam in self.seqnames: + read_spec[seqnam]['rl'] = random.sample(r_l, 1)[0] + # read_length = random.sample(r_l, 1) + else: + for seqnam in self.seqnames: + read_spec[seqnam]['rl'] = read_length else: read_length = 150 - sys.stdout.write("read length is {}\n".format(read_length)) - if 'fragment_size' in self.config: - fragment_size = self.config['fragment_size'] - else: - fragment_size = 350 - sys.stdout.write("fragment size is {}\n".format(fragment_size)) - if 'stdev_frag_size' in self.config: - stdev_frag_size = self.config['stdev_frag_size'] - else: - stdev_frag_size = 130 - sys.stdout.write("stdev of frag size is {}\n".format(stdev_frag_size)) + for seqname in self.seqnames: + read_spec[seqnam]['rl'] = read_length for seq in self.seqnames: + sys.stdout.write("Coverage is {}\n".format(read_spec[seq]['cov'])) + sys.stdout.write("read length is {}\n".format(read_spec[seq]['rl'])) + sys.stdout.write("fragment size is {}\n".format(read_spec[seq]['frag'])) + sys.stdout.write("stdev of frag size is {}\n".format(read_spec[seq]['stdev'])) + sys.stdout.write("MiSeq Version is {}\n".format(read_spec[seq]['ver'])) + sys.stdout.write("QS2 shift {}\n".format(read_spec[seq]['qs2'])) sys.stdout.write("Generating reads for {}\n".format(seq)) if not os.path.isdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)): os.mkdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)) @@ -741,10 +786,12 @@ def run_art(self, coverage=None): '-na', #Don't output alignment file '-p', #for paired end reads '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), - '-l', '{}'.format(read_length), - '-f', str(cov[seq]), - '-m', '{}'.format(fragment_size), - '-s', '{}'.format(stdev_frag_size), + '-l', '{}'.format(read_spec[seq]['rl']), + '-f', str(read_spec[seq]['cov']), + '-m', '{}'.format(read_spec[seq]['frag']), + '-s', '{}'.format(read_spec[seq]['stdev']), + '-qs2', '-{}'.format(read_spec[seq]['qs2']), + '-ss', 'MSv{}'.format(read_spec[seq]['ver']), '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, self.prefix, seq, @@ -755,10 +802,12 @@ def run_art(self, coverage=None): '-p', #for paired end reads '-na', #Don't output alignment file '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), - '-l', '{}'.format(read_length), - '-f', str(cov[seq]), - '-m', '{}'.format(fragment_size), - '-s', '{}'.format(stdev_frag_size), + '-l', '{}'.format(read_spec[seq]['rl']), + '-f', str(read_spec[seq]['cov']), + '-m', '{}'.format(read_spec[seq]['frag']), + '-s', '{}'.format(read_spec[seq]['stdev']), + '-qs2', '-{}'.format(read_spec[seq]['qs2']), + '-ss', 'MSv{}'.format(read_spec[seq]['ver']), '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, self.prefix, seq, From f07cefb62d0abcaef61ab52d702dd9509bb51ac4 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Thu, 21 Nov 2019 15:33:43 -0500 Subject: [PATCH 2/7] Support more parameters for ART simulation, keep compat with old config --- treetoreads.py | 139 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 92 insertions(+), 47 deletions(-) diff --git a/treetoreads.py b/treetoreads.py index 9c61fb4..3e27406 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -9,7 +9,7 @@ import dendropy -VERSION = "0.0.5" +VERSION = "0.0.6-atc" class TreeToReads(object): """A tree to reads object that holds the input tree and base genome, @@ -43,7 +43,7 @@ def __init__(self, configfi, run=1, main=None): else: self.prefix = '' if self.run: - if (self.no_art == 0) and all([self.config.get(x) for x in ['coverage_low', 'coverage_high', 'coverage_step']]): + if (self.no_art == 0) and (all([self.config.get(x) for x in ['coverage_low', 'coverage_high', 'coverage_step']]) or (self.argdict['coverage'] is not None and int(self.argdict['coverage']) > 0 )): self.run_art() else: sys.stdout.write("simulating genomes but not reads\n") @@ -98,6 +98,7 @@ def read_args(self): 'base_genome_path', 'rate_matrix', 'freq_matrix', + 'coverage', 'coverage_low', 'coverage_high', 'coverage_step', @@ -111,9 +112,11 @@ def read_args(self): 'frag_low', 'frag_high', 'frag_step', + 'fragment_size', 'stdev_frag_low', 'stdev_frag_high', 'stdev_frag_step', + 'stdev_frag_size', 'qs2_low', 'qs2_high', 'qs2_step', @@ -121,6 +124,8 @@ def read_args(self): 'indel_model', 'indel_rate'] for lin in config: + if lin.startswith("#"): + continue lii = lin.split('=') self.config[lii[0].strip()] = lii[-1].split('#')[0].strip() if (lii[0].strip() != '') and (not lii[0].strip().startswith("#")) and (lii[0].strip() not in poss_args): @@ -163,19 +168,7 @@ def _check_args(self): 'genome':'base_genome_path', 'ratemat':'rate_matrix', 'outd':'output_dir', - 'coverage_low' : 'coverage_low', - 'coverage_high' : 'coverage_high', - 'coverage_step' : 'coverage_step', - 'read_length' : 'read_length', - 'frag_low' : 'frag_low', - 'frag_high' : 'frag_high', - 'frag_step' : 'frag_step', - 'stdev_frag_low' : 'stdev_frag_low', - 'stdev_frag_high' : 'stdev_frag_high', - 'stdev_frag_step' : 'stdev_frag_step', - 'qs2_low' : 'qs2_low', - 'qs2_high' : 'qs2_high', - 'qs2_step' : 'qs2_step', + 'read_length' : 'read_length' } for arg in self.argdict: if self.argdict[arg] not in self.config: @@ -249,7 +242,58 @@ def _check_args(self): else: sys.stdout.write('Mutation clustering is OFF\n') self.clustering = 0 - + read_args_list = ['coverage_low', 'coverage_high', 'coverage_step', 'read_length', + 'frag_low', 'frag_high', 'frag_step', 'stdev_frag_low', 'stdev_frag_high', + 'stdev_frag_step', 'qs2_low', 'qs2_high', 'qs2_step' + ] + for read_arg in read_args_list: + try: + self.argdict[read_arg] = self.get_arg(read_arg) + except: + pass + if all([self.argdict[x] is not None for x in ['coverage_low', 'coverage_high', 'coverage_step']]): + self.argdict['coverage'] = None + sys.stderr.write("Coverage ranges provided, using ({}, {}, {})\n".format( + self.argdict['coverage_low'], + self.argdict['coverage_high'], + self.argdict['coverage_step'])) + else: + self.argdict['coverage'] = self.get_arg('coverage') + sys.stderr.write("Using basic coverage mode, coverage = {}\n".format(self.argdict['coverage'])) + if all([self.argdict[x] is not None for x in ['frag_low', 'frag_high', 'frag_step']]): + self.argdict['fragment_size'] = None + sys.stderr.write("Fragment size ranges provided, using ({}, {}, {})\n".format( + self.argdict['frag_low'], + self.argdict['frag_high'], + self.argdict['frag_step'])) + else: + self.argdict['fragment_size'] = self.get_arg('fragment_size') + sys.stderr.write("Using basic fragment size mode\n") + if all([self.argdict[x] is not None for x in ['stdev_frag_low', 'stdev_frag_high', 'stdev_frag_step']]): + self.argdict['stdev_frag_size'] = None + sys.stderr.write("Fragment size stdev ranges provided, using ({}, {}, {})\n".format( + self.argdict['stdev_frag_low'], + self.argdict['stdev_frag_high'], + self.argdict['stdev_frag_step'])) + else: + self.argdict['stdev_frag_size'] = self.get_arg('stdev_frag_size') + sys.stderr.write("Using basic fragment size stdev mode\n") + if all([self.argdict[x] is not None for x in ['qs2_low', 'qs2_high', 'qs2_step']]): + sys.stderr.write("QS2 ranges provided, using ({}, {}, {})\n".format( + self.argdict['qs2_low'], + self.argdict['qs2_high'], + self.argdict['qs2_step'])) + else: + self.argdict['qs2'] = None + sys.stderr.write("No QS2 bias applied\n") + if self.get_arg('MiSeqVersions') is None: + self.argdict['MiSeqVersions'] = [3] + elif "," in self.argdict['MiSeqVersions']: + miseq_v = list() + [miseq_v.append(int(x.strip())) for x in self.argdict['MiSeqVersions'].split(",")] + self.argdict['MiSeqVersions'] = miseq_v + else: + self.argdict['MiSeqVersions'] = [self.argdict['MiSeqVersions']] def read_tree(self): """Reads in a tree from a file, arbitrarily resolves poltomies if present, strips leading [&U] and writes out to outputdir/simtree.tre""" @@ -707,49 +751,50 @@ def run_art(self, coverage=None): self.mut_genomes_indels() else: self.mut_genomes_no_indels() - if coverage is None: + read_spec = {} + if self.argdict['coverage'] is None: cov_low = int(self.get_arg('coverage_low')) cov_hi = int(self.get_arg('coverage_high')) cov_step = int(self.get_arg('coverage_step')) + for seqnam in self.seqnames: + real_cov = random.sample(range(cov_low, cov_hi, cov_step), 1)[0] + read_spec[seqnam] = {'cov' : real_cov} + else: + for seqnam in self.seqnames: + read_spec[seqnam] = {'cov' : self.argdict['coverage']} + if self.argdict['fragment_size'] is None: frag_low = int(self.get_arg('frag_low')) frag_hi = int(self.get_arg('frag_high')) frag_step = int(self.get_arg('frag_step')) + for seqnam in self.seqnames: + real_frag = random.sample(range(frag_low, frag_hi, frag_step), 1)[0] + read_spec[seqnam].update({'frag' : real_frag}) + else: + for seqnam in self.seqnames: + read_spec[seqnam].update({'frag' : self.argdict['fragment_size']}) + if self.argdict['stdev_frag_size'] is None: stdev_low = int(self.get_arg('stdev_frag_low')) stdev_hi = int(self.get_arg('stdev_frag_high')) stdev_step = int(self.get_arg('stdev_frag_step')) - qs2_low = int(self.get_arg('qs2_low')) - qs2_hi = int(self.get_arg('qs2_high')) - qs2_step = int(self.get_arg('qs2_step')) - miseq_v = list() - [miseq_v.append(int(x.strip())) for x in self.get_arg('MiSeqVersions').split(",")] - coverarg = None - - else: - coverarg = coverage - read_spec = {} - if coverarg is not None and os.path.isfile(coverarg): - with open(coverarg) as infile: - for lin in infile: - seqnam = lin.split(',')[0] - try: - assert seqnam in self.seqnames - except: - sys.stderr.write("name {} in coverage file not found in tree\n".format(seqnam)) - self._exit_handler() - read_spec[seqnam] = int(lin.split(',')[1]) - try: - assert set(cov.keys()) == set(self.seqnames) - except: - sys.stderr.write("some tips missing from coverage file: {}\n".format(set(self.seqnames) - read_spec.keys())) - self._exit_handler() - else: for seqnam in self.seqnames: - real_cov = random.sample(range(cov_low, cov_hi, cov_step), 1)[0] - real_frag = random.sample(range(frag_low, frag_hi, frag_step), 1)[0] real_stdev = random.sample(range(stdev_low, stdev_hi, stdev_step), 1)[0] + read_spec[seqnam].update({"stdev" : real_stdev}) + else: + for seqnam in self.seqnames: + read_spec[seqnam].update({'stdev' : self.argdict['stdev_frag_size']}) + if self.argdict['qs2'] is not None: + qs2_low = int(self.argdict['qs2_low']) + qs2_hi = int(self.argdict['qs2_high']) + qs2_step = int(self.argdict['qs2_step']) + for seqnam in self.seqnames: real_qs2 = random.sample(range(qs2_low, qs2_hi, qs2_step), 1)[0] - real_miseq = random.sample(miseq_v, 1)[0] - read_spec[seqnam] = {'cov' : real_cov, 'frag' : real_frag, "stdev" : real_stdev, 'qs2' : real_qs2, 'ver' : real_miseq} + read_spec[seqnam].update({'qs2' : real_qs2,}) + else: + for seqnam in self.seqnames: + read_spec[seqnam].update({'qs2' : 0}) + for seqnam in self.seqnames: + real_miseq = random.sample(self.argdict['MiSeqVersions'], 1)[0] + read_spec[seqnam].update({'ver' : real_miseq}) if not os.path.isdir("{}/fastq".format(self.outd)): os.mkdir("{}/fastq".format(self.outd)) if 'read_length' in self.config: From b587670b0729e7376233e859350f054cd256f9f9 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Fri, 22 Nov 2019 09:20:11 -0500 Subject: [PATCH 3/7] Better error handling --- example.config | 47 +++++++++++++++++-------- treetoreads.py | 94 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 99 insertions(+), 42 deletions(-) diff --git a/example.config b/example.config index d222ce5..7c75a8e 100644 --- a/example.config +++ b/example.config @@ -1,19 +1,36 @@ #REQUIRED PARAMETERS -treefile_path = example/example.tre #Must be newick or Nexus format, and include branch lengths -number_of_variable_sites = 20 -base_genome_name = gi #Should be the label of a tip in your tree -base_genome_path = example/mini_ref.fasta -output_dir = example_out - - +treefile_path = /storage/CompGenomics/CompGenomics2019/datasets/listeria/outbreak/listeria.nwk +number_of_variable_sites = 17 +base_genome_name = CGT3538 +base_genome_path = /storage/CompGenomics/CompGenomics2019/datasets/listeria/outbreak/ref.fa +output_dir = /storage/CompGenomics/Compgenomics2020/datasets/listeria/outbreak +#coverage_low = 23 # Start of random coverage range +#coverage_high = 87 # End of random coverage range +#coverage_step = 1 # Step size for binning random coverage +#read_length = 150,250 # Read length, can be one value or list of comma separated values +#frag_low = 600 # Start of random frag length range +#frag_high = 1000 # End of random frag length range +#frag_step = 7 # Step size for binning frag length +#stdev_frag_low = 10 # Start of random frag length stdev range +#stdev_frag_high = 100 # End of random frag length stdev range +#stdev_frag_step = 3 # Step size for binning frag length stdev +#qs2_low = -3 # Start of QS2 bias range +#qs2_high = 0 # End of QS2 bias range +#qs2_step = 1 # Step size for binning QS2 bias +readProfile = MSv3 # Read platform +# Supports the following (between read length) +# Defaults to MiSeq v3 to allow reads up to 250bp +# GA1 - GenomeAnalyzer I (1-44bp), GA2 - GenomeAnalyzer II (1-75bp) +# HS10 - HiSeq 1000 (1-100bp), HS20 - HiSeq 2000 1-100bp), HS25 - HiSeq 2500 (1-150bp) +# HSXn - HiSeqX PCR free (1-150bp), HSXt - HiSeqX TruSeq (1-150bp), MinS - MiniSeq TruSeq (1-50bp) +# MSv1 - MiSeq v1 (1-250bp), MSv3 - MiSeq v3 (1-250bp), NS50 - NextSeq500 v2 (1-75bp) #parameters of evolutionary model (comma seperated), in order ac, ag, at, cg, ct, gc (gc = 1) rate_matrix = 1,1,1,1,1,1 -#parameters for read simulation -coverage = 20 #either an integer or a file name of a comma delimited file with tip names and coverage + #OPTIONAL PARAMETERS -prefix = sim_ #optional prefix prepended to sequence names, default is using orginal sequence names +#prefix = sim_ #optional prefix prepended to sequence names, default is using orginal sequence names #Optional evolutionary model parameters gamma_shape = 5 #dafault is no rate variation across sites @@ -24,9 +41,9 @@ percent_clustered = 0.25 #The percentage of variable sites whose distance to ano exponential_mean = 125 #Minimum allowed value = 2 #ART Optional parameters (for more fine grained control ART can be run seperately on the mutated genomes found in outdir/fasta_files) -error_model1 = example/ErrprofR1.txt # If you haven't generated have one of your own using ART, you can use one supplied by ART. -error_model2 = example/ErrprofR2.txt # Un-comment these lines (delete the first #) to set a non-default error profile -read_length = 150 #maximum value with example error profile is 150, use a default or generate adifferent error profile for longer reads. -fragment_size = 380 +#error_model1 = example/ErrprofR1.txt # If you haven't generated have one of your own using ART, you can use one supplied by ART. +#error_model2 = example/ErrprofR2.txt # Un-comment these lines (delete the first #) to set a non-default error profile +read_length = 250,0 #maximum value with example error profile is 150, use a default or generate adifferent error profile for longer reads. +fragment_size = 750 stdev_frag_size = 120 - +coverage = 10 diff --git a/treetoreads.py b/treetoreads.py index 3e27406..59ef797 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -28,6 +28,10 @@ def __init__(self, configfi, run=1, main=None): self.seed = random.randint(0, sys.maxsize) sys.stdout.write("Random seed is {}\n".format(self.seed)) random.seed(self.seed) + self.validProfiles = {"GA1" : 44, "GA2" : 75, "HS10" : 100, + "HS20" : 100, "HS25" : 150, "HSXn" : 150, + "HSXt" : 150, "MinS" : 50, "MSv1" : 250, + "MSv3" : 250, "NS50" : 75} self.configfi = configfi self.run = run self.main = main @@ -43,7 +47,7 @@ def __init__(self, configfi, run=1, main=None): else: self.prefix = '' if self.run: - if (self.no_art == 0) and (all([self.config.get(x) for x in ['coverage_low', 'coverage_high', 'coverage_step']]) or (self.argdict['coverage'] is not None and int(self.argdict['coverage']) > 0 )): + if (not self.no_art and self.generate_reads) and (all([self.config.get(x) for x in ['coverage_low', 'coverage_high', 'coverage_step']]) or (self.argdict['coverage'] is not None and int(self.argdict['coverage']) > 0 )): self.run_art() else: sys.stdout.write("simulating genomes but not reads\n") @@ -74,9 +78,9 @@ def test_deps(self): and in your path for TreeToReads to generate reads. Art not found. TTR will only generate mutated genomes. \n''') - self.no_art = 1 + self.no_art = True else: - self.no_art = 0 + self.no_art = False if not dendropy.__version__.startswith('4'): sys.stderr.write('''ERROR: Please upgrade the python package dendropy to version 4, using 'pip install dendropy --upgrade'. @@ -120,7 +124,7 @@ def read_args(self): 'qs2_low', 'qs2_high', 'qs2_step', - 'MiSeqVersions', + 'readProfile', 'indel_model', 'indel_rate'] for lin in config: @@ -244,13 +248,15 @@ def _check_args(self): self.clustering = 0 read_args_list = ['coverage_low', 'coverage_high', 'coverage_step', 'read_length', 'frag_low', 'frag_high', 'frag_step', 'stdev_frag_low', 'stdev_frag_high', - 'stdev_frag_step', 'qs2_low', 'qs2_high', 'qs2_step' + 'stdev_frag_step', 'qs2_low', 'qs2_high', 'qs2_step', 'readProfile' ] + for read_arg in read_args_list: try: self.argdict[read_arg] = self.get_arg(read_arg) except: pass + if all([self.argdict[x] is not None for x in ['coverage_low', 'coverage_high', 'coverage_step']]): self.argdict['coverage'] = None sys.stderr.write("Coverage ranges provided, using ({}, {}, {})\n".format( @@ -260,6 +266,7 @@ def _check_args(self): else: self.argdict['coverage'] = self.get_arg('coverage') sys.stderr.write("Using basic coverage mode, coverage = {}\n".format(self.argdict['coverage'])) + if all([self.argdict[x] is not None for x in ['frag_low', 'frag_high', 'frag_step']]): self.argdict['fragment_size'] = None sys.stderr.write("Fragment size ranges provided, using ({}, {}, {})\n".format( @@ -269,6 +276,7 @@ def _check_args(self): else: self.argdict['fragment_size'] = self.get_arg('fragment_size') sys.stderr.write("Using basic fragment size mode\n") + if all([self.argdict[x] is not None for x in ['stdev_frag_low', 'stdev_frag_high', 'stdev_frag_step']]): self.argdict['stdev_frag_size'] = None sys.stderr.write("Fragment size stdev ranges provided, using ({}, {}, {})\n".format( @@ -278,6 +286,7 @@ def _check_args(self): else: self.argdict['stdev_frag_size'] = self.get_arg('stdev_frag_size') sys.stderr.write("Using basic fragment size stdev mode\n") + if all([self.argdict[x] is not None for x in ['qs2_low', 'qs2_high', 'qs2_step']]): sys.stderr.write("QS2 ranges provided, using ({}, {}, {})\n".format( self.argdict['qs2_low'], @@ -286,14 +295,49 @@ def _check_args(self): else: self.argdict['qs2'] = None sys.stderr.write("No QS2 bias applied\n") - if self.get_arg('MiSeqVersions') is None: - self.argdict['MiSeqVersions'] = [3] - elif "," in self.argdict['MiSeqVersions']: - miseq_v = list() - [miseq_v.append(int(x.strip())) for x in self.argdict['MiSeqVersions'].split(",")] - self.argdict['MiSeqVersions'] = miseq_v + + if self.argdict['readProfile'] is None: + self.argdict['readProfile'] = 'MSv3' + elif "," in self.argdict['readProfile']: + profList = list() + [profList.append(x.strip()) for x in self.argdict['readProfile'].split(",")] + self.argdict['readProfile'] = profList else: - self.argdict['MiSeqVersions'] = [self.argdict['MiSeqVersions']] + self.argdict['readProfile'] = [self.argdict['readProfile']] + + for profile in self.argdict['readProfile']: + if profile is not None and profile not in self.validProfiles: + sys.stderr.write("Invalid read profile provided, {}, setting to MSv3\n".format(profile)) + self.argdict['readProfile'][self.argdict['readProfile'].index(profile)] = 'MSv3' + + read_length_flag = True + self.generate_reads = True + if self.argdict['read_length'] is not None: + read_length = self.config['read_length'] + if ',' in read_length: + r_l = list() + [r_l.append(int(x.strip())) for x in read_length.split(',')] + read_length = r_l + else: + read_length = [int(read_length)] + for rl in read_length: + if rl == 0: + self.generate_reads = False + sys.stderr.write("Read length of 0 provided, will not generate reads\n".format(profile)) + break + if rl < 0: + sys.stderr.write("Read length <0 specified, cannot generate negative reads\n") + self._exit_handler() + for profile in self.argdict['readProfile']: + if rl > self.validProfiles[profile]: + read_length[read_length.index(rl)] = validProfiles[profile] + read_length_flag = True + self.argdict['read_length'] = read_length + + if read_length_flag: + sys.stderr.write("Read length greater than allowed by profile, setting to profile max\n".format(profile)) + + def read_tree(self): """Reads in a tree from a file, arbitrarily resolves poltomies if present, strips leading [&U] and writes out to outputdir/simtree.tre""" @@ -758,7 +802,8 @@ def run_art(self, coverage=None): cov_step = int(self.get_arg('coverage_step')) for seqnam in self.seqnames: real_cov = random.sample(range(cov_low, cov_hi, cov_step), 1)[0] - read_spec[seqnam] = {'cov' : real_cov} + real_len = random.sample(self.argdict['read_length'], 1)[0] + read_spec[seqnam] = {'cov' : real_cov, 'rl' : real_len} else: for seqnam in self.seqnames: read_spec[seqnam] = {'cov' : self.argdict['coverage']} @@ -793,21 +838,16 @@ def run_art(self, coverage=None): for seqnam in self.seqnames: read_spec[seqnam].update({'qs2' : 0}) for seqnam in self.seqnames: - real_miseq = random.sample(self.argdict['MiSeqVersions'], 1)[0] - read_spec[seqnam].update({'ver' : real_miseq}) + real_platform = random.sample(self.argdict['readProfile'], 1)[0] + read_spec[seqnam].update({'ver' : real_platform}) if not os.path.isdir("{}/fastq".format(self.outd)): os.mkdir("{}/fastq".format(self.outd)) - if 'read_length' in self.config: - read_length = self.config['read_length'] - if ',' in read_length: - r_l = list() - [r_l.append(x.strip()) for x in read_length.split(',')] - for seqnam in self.seqnames: - read_spec[seqnam]['rl'] = random.sample(r_l, 1)[0] - # read_length = random.sample(r_l, 1) - else: - for seqnam in self.seqnames: - read_spec[seqnam]['rl'] = read_length + # for seqnam in self.seqnames: + # read_spec[seqnam]['rl'] = random.sample(r_l, 1)[0] + # # read_length = random.sample(r_l, 1) + # else: + # for seqnam in self.seqnames: + # read_spec[seqnam]['rl'] = read_length else: read_length = 150 for seqname in self.seqnames: @@ -852,7 +892,7 @@ def run_art(self, coverage=None): '-m', '{}'.format(read_spec[seq]['frag']), '-s', '{}'.format(read_spec[seq]['stdev']), '-qs2', '-{}'.format(read_spec[seq]['qs2']), - '-ss', 'MSv{}'.format(read_spec[seq]['ver']), + '-ss', '{}'.format(read_spec[seq]['ver']), '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, self.prefix, seq, From ce6248e1af42f389f209544a6f863fc88c10ae14 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Mon, 25 Nov 2019 08:41:54 -0500 Subject: [PATCH 4/7] Finish threading integration --- example.config | 80 ++++++++++-------- treetoreads.py | 215 ++++++++++++++++++++++++++----------------------- 2 files changed, 164 insertions(+), 131 deletions(-) diff --git a/example.config b/example.config index 7c75a8e..28fa7e2 100644 --- a/example.config +++ b/example.config @@ -1,36 +1,16 @@ #REQUIRED PARAMETERS -treefile_path = /storage/CompGenomics/CompGenomics2019/datasets/listeria/outbreak/listeria.nwk -number_of_variable_sites = 17 -base_genome_name = CGT3538 -base_genome_path = /storage/CompGenomics/CompGenomics2019/datasets/listeria/outbreak/ref.fa -output_dir = /storage/CompGenomics/Compgenomics2020/datasets/listeria/outbreak -#coverage_low = 23 # Start of random coverage range -#coverage_high = 87 # End of random coverage range -#coverage_step = 1 # Step size for binning random coverage -#read_length = 150,250 # Read length, can be one value or list of comma separated values -#frag_low = 600 # Start of random frag length range -#frag_high = 1000 # End of random frag length range -#frag_step = 7 # Step size for binning frag length -#stdev_frag_low = 10 # Start of random frag length stdev range -#stdev_frag_high = 100 # End of random frag length stdev range -#stdev_frag_step = 3 # Step size for binning frag length stdev -#qs2_low = -3 # Start of QS2 bias range -#qs2_high = 0 # End of QS2 bias range -#qs2_step = 1 # Step size for binning QS2 bias -readProfile = MSv3 # Read platform -# Supports the following (between read length) -# Defaults to MiSeq v3 to allow reads up to 250bp -# GA1 - GenomeAnalyzer I (1-44bp), GA2 - GenomeAnalyzer II (1-75bp) -# HS10 - HiSeq 1000 (1-100bp), HS20 - HiSeq 2000 1-100bp), HS25 - HiSeq 2500 (1-150bp) -# HSXn - HiSeqX PCR free (1-150bp), HSXt - HiSeqX TruSeq (1-150bp), MinS - MiniSeq TruSeq (1-50bp) -# MSv1 - MiSeq v1 (1-250bp), MSv3 - MiSeq v3 (1-250bp), NS50 - NextSeq500 v2 (1-75bp) -#parameters of evolutionary model (comma seperated), in order ac, ag, at, cg, ct, gc (gc = 1) +treefile_path = example/example.tre +number_of_variable_sites = 20 +base_genome_name = gi +base_genome_path = example/mini_ref.fasta +output_dir = example_out +# parameters of evolutionary model (comma seperated), in order ac, ag, at, cg, ct, gc (gc = 1) rate_matrix = 1,1,1,1,1,1 -#OPTIONAL PARAMETERS -#prefix = sim_ #optional prefix prepended to sequence names, default is using orginal sequence names +# OPTIONAL PARAMETERS +prefix = sim_ #optional prefix prepended to sequence names, default is using orginal sequence names #Optional evolutionary model parameters gamma_shape = 5 #dafault is no rate variation across sites @@ -40,10 +20,46 @@ mutation_clustering = ON percent_clustered = 0.25 #The percentage of variable sites whose distance to another site is drawn from the clustering distribution exponential_mean = 125 #Minimum allowed value = 2 -#ART Optional parameters (for more fine grained control ART can be run seperately on the mutated genomes found in outdir/fasta_files) +## ART Optional parameters (for more fine grained control ART can be run seperately on the mutated genomes found in outdir/fasta_files) #error_model1 = example/ErrprofR1.txt # If you haven't generated have one of your own using ART, you can use one supplied by ART. #error_model2 = example/ErrprofR2.txt # Un-comment these lines (delete the first #) to set a non-default error profile -read_length = 250,0 #maximum value with example error profile is 150, use a default or generate adifferent error profile for longer reads. -fragment_size = 750 + +## Read length, can be one value or list of comma separated values up to 250bp +## Read length may be truncated if selected read platform doesn't support it +read_length = 250 + +## Insert fragment length. Can be uniform [fragment_size] +## or randomly distributed between [frag_low, frag_high] +fragment_size = 380 +# frag_low = 600 # Start of random frag length range +# frag_high = 1000 # End of random frag length range +# frag_step = 7 # Step size for binning frag length + +## Insert fragment length standard deviation. Can be uniform [stdev_frag_size] +## or randomly distributed between [stdev_frag_low, stdev_frag_high] stdev_frag_size = 120 -coverage = 10 +# stdev_frag_low = 10 # Start of random frag length stdev range +# stdev_frag_high = 100 # End of random frag length stdev range +# stdev_frag_step = 3 # Step size for binning frag length stdev + +## Average genomic real coverage. Can be uniform [coverage] across dataset +## or randomly distributed between [coverage_low, coverage_high] +coverage = 20 +# coverage_low = 20 # Start of random coverage range +# coverage_high = 87 # End of random coverage range +# coverage_step = 1 # Step size for binning random coverage + +## Average QS2 bias to introduce. Can be 0 (no value provided) across dataset +# or randomly distributed between [qs2_low, qs2_high] +# qs2_low = -3 # Start of QS2 bias range +# qs2_high = 0 # End of QS2 bias range +# qs2_step = 1 # Step size for binning QS2 bias + +# Platform read error profile used to generate reads +# readProfile = MSv3 # Read platform +## Supports the following (between read length) +## Defaults to MiSeq v3 to allow reads up to 250bp +## GA1 - GenomeAnalyzer I (1-44bp), GA2 - GenomeAnalyzer II (1-75bp) +## HS10 - HiSeq 1000 (1-100bp), HS20 - HiSeq 2000 1-100bp), HS25 - HiSeq 2500 (1-150bp) +## HSXn - HiSeqX PCR free (1-150bp), HSXt - HiSeqX TruSeq (1-150bp), MinS - MiniSeq TruSeq (1-50bp) +## MSv1 - MiSeq v1 (1-250bp), MSv3 - MiSeq v3 (1-250bp), NS50 - NextSeq500 v2 (1-75bp) \ No newline at end of file diff --git a/treetoreads.py b/treetoreads.py index 59ef797..ad749e8 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -1,15 +1,16 @@ #!/usr/bin/env python """Tree to Reads - A python script to to read a tree, resolve polytomies, generate mutations and simulate reads.""" -import random +import argparse import os +import random import sys -import argparse +from multiprocessing import Pool from subprocess import call -import dendropy +import dendropy -VERSION = "0.0.6-atc" +VERSION = "0.0.6" class TreeToReads(object): """A tree to reads object that holds the input tree and base genome, @@ -86,6 +87,10 @@ def test_deps(self): using 'pip install dendropy --upgrade'. Exiting\n''') self._exit_handler() + if call(['which', 'pigz'], stdout=open('/dev/null', 'w')) == 1: + self.gzipProg = "pigz" + else: + self.gzipProg = "gzip" def read_args(self): """reads arguments from config file""" @@ -126,7 +131,8 @@ def read_args(self): 'qs2_step', 'readProfile', 'indel_model', - 'indel_rate'] + 'indel_rate', + 'threads'] for lin in config: if lin.startswith("#"): continue @@ -178,6 +184,17 @@ def _check_args(self): if self.argdict[arg] not in self.config: sys.stderr.write("{} is missing from the config file".format(self.argdict[arg])) self._exit_handler() + thread_cnt = self.get_arg('threads') + if thread_cnt is None or thread_cnt == '0' or thread_cnt == '1': + self.threads = None + sys.stderr.write("Using 1 thread for read simulation\n") + else: + try: + self.threads = int(self.get_arg('threads')) + sys.stderr.write("Using {} threads for read simulation\n".format(self.threads)) + except TypeError: + sys.stderr.write("Non-integer thread argument provided") + self._exit_handler try: self.nsnp = int(self.get_arg('nsnp')) sys.stdout.write('Number of variable sites is {}\n'.format(self.nsnp)) @@ -259,13 +276,15 @@ def _check_args(self): if all([self.argdict[x] is not None for x in ['coverage_low', 'coverage_high', 'coverage_step']]): self.argdict['coverage'] = None + self.generate_reads = True sys.stderr.write("Coverage ranges provided, using ({}, {}, {})\n".format( self.argdict['coverage_low'], self.argdict['coverage_high'], self.argdict['coverage_step'])) else: self.argdict['coverage'] = self.get_arg('coverage') - sys.stderr.write("Using basic coverage mode, coverage = {}\n".format(self.argdict['coverage'])) + self.generate_reads = True + sys.stderr.write("Using uniform coverage, coverage = {}\n".format(self.argdict['coverage'])) if all([self.argdict[x] is not None for x in ['frag_low', 'frag_high', 'frag_step']]): self.argdict['fragment_size'] = None @@ -275,7 +294,7 @@ def _check_args(self): self.argdict['frag_step'])) else: self.argdict['fragment_size'] = self.get_arg('fragment_size') - sys.stderr.write("Using basic fragment size mode\n") + sys.stderr.write("Using uniform fragment size\n") if all([self.argdict[x] is not None for x in ['stdev_frag_low', 'stdev_frag_high', 'stdev_frag_step']]): self.argdict['stdev_frag_size'] = None @@ -285,7 +304,7 @@ def _check_args(self): self.argdict['stdev_frag_step'])) else: self.argdict['stdev_frag_size'] = self.get_arg('stdev_frag_size') - sys.stderr.write("Using basic fragment size stdev mode\n") + sys.stderr.write("Using fixed-size stdev\n") if all([self.argdict[x] is not None for x in ['qs2_low', 'qs2_high', 'qs2_step']]): sys.stderr.write("QS2 ranges provided, using ({}, {}, {})\n".format( @@ -782,6 +801,61 @@ def mut_genomes_indels(self):#TODO does not account for SNPs in indsertions # write_vcf(self) sys.stdout.write("Mutated genomes\n") + def simulate_reads(self, seq): + if not os.path.isdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)): + os.mkdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)) + if self.get_arg('error_model1') and self.get_arg('error_model2'): + assert os.path.exists(self.get_arg('error_model1')) + assert os.path.exists(self.get_arg('error_model2')) + artparam = ['art_illumina', + '-1', self.get_arg('error_model1'), + '-2', self.get_arg('error_model2'), + '-na', #Don't output alignment file + '-p', #for paired end reads + '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), + '-l', '{}'.format(self.read_spec[seq]['rl']), + '-f', str(self.read_spec[seq]['cov']), + '-m', '{}'.format(self.read_spec[seq]['frag']), + '-s', '{}'.format(self.read_spec[seq]['stdev']), + '-qs2', '-{}'.format(self.read_spec[seq]['qs2']), + '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq)] + else: + artparam = ['art_illumina', + '-p', #for paired end reads + '-na', #Don't output alignment file + '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), + '-l', '{}'.format(self.read_spec[seq]['rl']), + '-f', str(self.read_spec[seq]['cov']), + '-m', '{}'.format(self.read_spec[seq]['frag']), + '-s', '{}'.format(self.read_spec[seq]['stdev']), + '-qs2', '-{}'.format(self.read_spec[seq]['qs2']), + '-ss', '{}'.format(self.read_spec[seq]['ver']), + '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq)] + call(artparam, stdout=open('{}/art_log'.format(self.outd), 'w'), stderr=open('{}/art_log'.format(self.outd), 'a')) + # print("called {}".format(" ".join(artparam))) + assert os.path.exists('{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, self.prefix, seq, self.prefix, seq)) + if self.gzip: + gzippar = [self.gzipProg, + '-f', + '{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq), + '{}/fastq/{}{}/{}{}_2.fq'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq)] + call(gzippar) def run_art(self, coverage=None): """Runs ART to simulate reads from the simulated genomes""" @@ -795,130 +869,75 @@ def run_art(self, coverage=None): self.mut_genomes_indels() else: self.mut_genomes_no_indels() - read_spec = {} + self.read_spec = {} if self.argdict['coverage'] is None: - cov_low = int(self.get_arg('coverage_low')) - cov_hi = int(self.get_arg('coverage_high')) - cov_step = int(self.get_arg('coverage_step')) + cov_low = int(self.argdict['coverage_low']) + cov_hi = int(self.argdict['coverage_high']) + cov_step = int(self.argdict['coverage_step']) for seqnam in self.seqnames: real_cov = random.sample(range(cov_low, cov_hi, cov_step), 1)[0] real_len = random.sample(self.argdict['read_length'], 1)[0] - read_spec[seqnam] = {'cov' : real_cov, 'rl' : real_len} + self.read_spec[seqnam] = {'cov' : real_cov, 'rl' : real_len} else: for seqnam in self.seqnames: - read_spec[seqnam] = {'cov' : self.argdict['coverage']} + self.read_spec[seqnam] = {'cov' : self.argdict['coverage']} if self.argdict['fragment_size'] is None: - frag_low = int(self.get_arg('frag_low')) - frag_hi = int(self.get_arg('frag_high')) - frag_step = int(self.get_arg('frag_step')) + frag_low = int(self.argdict['frag_low']) + frag_hi = int(self.argdict['frag_high']) + frag_step = int(self.argdict['frag_step']) for seqnam in self.seqnames: real_frag = random.sample(range(frag_low, frag_hi, frag_step), 1)[0] - read_spec[seqnam].update({'frag' : real_frag}) + self.read_spec[seqnam].update({'frag' : real_frag}) else: for seqnam in self.seqnames: - read_spec[seqnam].update({'frag' : self.argdict['fragment_size']}) + self.read_spec[seqnam].update({'frag' : self.argdict['fragment_size']}) if self.argdict['stdev_frag_size'] is None: - stdev_low = int(self.get_arg('stdev_frag_low')) - stdev_hi = int(self.get_arg('stdev_frag_high')) - stdev_step = int(self.get_arg('stdev_frag_step')) + stdev_low = int(self.argdict['stdev_frag_low']) + stdev_hi = int(self.argdict['stdev_frag_high']) + stdev_step = int(self.argdict['stdev_frag_step']) for seqnam in self.seqnames: real_stdev = random.sample(range(stdev_low, stdev_hi, stdev_step), 1)[0] - read_spec[seqnam].update({"stdev" : real_stdev}) + self.read_spec[seqnam].update({"stdev" : real_stdev}) else: for seqnam in self.seqnames: - read_spec[seqnam].update({'stdev' : self.argdict['stdev_frag_size']}) + self.read_spec[seqnam].update({'stdev' : self.argdict['stdev_frag_size']}) if self.argdict['qs2'] is not None: qs2_low = int(self.argdict['qs2_low']) qs2_hi = int(self.argdict['qs2_high']) qs2_step = int(self.argdict['qs2_step']) for seqnam in self.seqnames: real_qs2 = random.sample(range(qs2_low, qs2_hi, qs2_step), 1)[0] - read_spec[seqnam].update({'qs2' : real_qs2,}) + self.read_spec[seqnam].update({'qs2' : real_qs2,}) else: for seqnam in self.seqnames: - read_spec[seqnam].update({'qs2' : 0}) + self.read_spec[seqnam].update({'qs2' : 0}) for seqnam in self.seqnames: real_platform = random.sample(self.argdict['readProfile'], 1)[0] - read_spec[seqnam].update({'ver' : real_platform}) + self.read_spec[seqnam].update({'ver' : real_platform}) if not os.path.isdir("{}/fastq".format(self.outd)): os.mkdir("{}/fastq".format(self.outd)) - # for seqnam in self.seqnames: - # read_spec[seqnam]['rl'] = random.sample(r_l, 1)[0] - # # read_length = random.sample(r_l, 1) - # else: - # for seqnam in self.seqnames: - # read_spec[seqnam]['rl'] = read_length else: read_length = 150 for seqname in self.seqnames: - read_spec[seqnam]['rl'] = read_length - for seq in self.seqnames: - sys.stdout.write("Coverage is {}\n".format(read_spec[seq]['cov'])) - sys.stdout.write("read length is {}\n".format(read_spec[seq]['rl'])) - sys.stdout.write("fragment size is {}\n".format(read_spec[seq]['frag'])) - sys.stdout.write("stdev of frag size is {}\n".format(read_spec[seq]['stdev'])) - sys.stdout.write("MiSeq Version is {}\n".format(read_spec[seq]['ver'])) - sys.stdout.write("QS2 shift {}\n".format(read_spec[seq]['qs2'])) - sys.stdout.write("Generating reads for {}\n".format(seq)) - if not os.path.isdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)): - os.mkdir("{}/fastq/{}{}".format(self.outd, self.prefix, seq)) - if self.get_arg('error_model1') and self.get_arg('error_model2'): - assert os.path.exists(self.get_arg('error_model1')) - assert os.path.exists(self.get_arg('error_model2')) - artparam = ['art_illumina', - '-1', self.get_arg('error_model1'), - '-2', self.get_arg('error_model2'), - '-na', #Don't output alignment file - '-p', #for paired end reads - '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), - '-l', '{}'.format(read_spec[seq]['rl']), - '-f', str(read_spec[seq]['cov']), - '-m', '{}'.format(read_spec[seq]['frag']), - '-s', '{}'.format(read_spec[seq]['stdev']), - '-qs2', '-{}'.format(read_spec[seq]['qs2']), - '-ss', 'MSv{}'.format(read_spec[seq]['ver']), - '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq)] - else: - artparam = ['art_illumina', - '-p', #for paired end reads - '-na', #Don't output alignment file - '-i', '{}/fasta_files/{}{}.fasta'.format(self.outd, self.prefix, seq), - '-l', '{}'.format(read_spec[seq]['rl']), - '-f', str(read_spec[seq]['cov']), - '-m', '{}'.format(read_spec[seq]['frag']), - '-s', '{}'.format(read_spec[seq]['stdev']), - '-qs2', '-{}'.format(read_spec[seq]['qs2']), - '-ss', '{}'.format(read_spec[seq]['ver']), - '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq)] - call(artparam, stdout=open('{}/art_log'.format(self.outd), 'w'), stderr=open('{}/art_log'.format(self.outd), 'a')) - # print("called {}".format(" ".join(artparam))) - assert os.path.exists('{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, self.prefix, seq, self.prefix, seq)) - gzippar = ['gzip', - '-f', - '{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq), - '{}/fastq/{}{}/{}{}_2.fq'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq)] - call(gzippar) + self.read_spec[seqnam]['rl'] = read_length + if self.threads is None: + for seq in self.seqnames: + sys.stdout.write("Coverage is {}\n".format(self.read_spec[seq]['cov'])) + sys.stdout.write("read length is {}\n".format(self.read_spec[seq]['rl'])) + sys.stdout.write("fragment size is {}\n".format(self.read_spec[seq]['frag'])) + sys.stdout.write("stdev of frag size is {}\n".format(self.read_spec[seq]['stdev'])) + sys.stdout.write("MiSeq Version is {}\n".format(self.read_spec[seq]['ver'])) + sys.stdout.write("QS2 shift {}\n".format(self.read_spec[seq]['qs2'])) + sys.stdout.write("Generating reads for {}\n".format(seq)) + self.simulate_reads(seq) + else: + sys.stdout.write("Simulating FASTQ reads") + with Pool(processes=self.threads) as pool: + results = pool.map(self.simulate_reads, self.seqnames) sys.stdout.write("TreeToReads completed successfully!\n") - def write_indelible_controlfile(outputdir, ratemat, freqmat, indelmodel, indelrate, tree, seqlen, seed): """Writes a control file for indelible to run indelible GTR is specified as ct', 'at', 'gt', 'ac', 'cg', 'ag' =1 @@ -1070,5 +1089,3 @@ def write_vcf(ttrobj): version='Tree to reads version {}'.format(VERSION)) args = parser.parse_args() ttr = TreeToReads(configfi=args.config_file, main=1) - - From 37f7a17d4c54e66b479316c46383e72fd7a04bbf Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Mon, 25 Nov 2019 09:05:02 -0500 Subject: [PATCH 5/7] Forgot threads config param --- example.config | 1 + treetoreads.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/example.config b/example.config index 28fa7e2..cb03d19 100644 --- a/example.config +++ b/example.config @@ -11,6 +11,7 @@ rate_matrix = 1,1,1,1,1,1 # OPTIONAL PARAMETERS prefix = sim_ #optional prefix prepended to sequence names, default is using orginal sequence names +threads = 10 #Optional evolutionary model parameters gamma_shape = 5 #dafault is no rate variation across sites diff --git a/treetoreads.py b/treetoreads.py index ad749e8..84afae5 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -817,7 +817,7 @@ def simulate_reads(self, seq): '-f', str(self.read_spec[seq]['cov']), '-m', '{}'.format(self.read_spec[seq]['frag']), '-s', '{}'.format(self.read_spec[seq]['stdev']), - '-qs2', '-{}'.format(self.read_spec[seq]['qs2']), + '-qs2', '{}'.format(self.read_spec[seq]['qs2']), '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, self.prefix, seq, @@ -832,7 +832,7 @@ def simulate_reads(self, seq): '-f', str(self.read_spec[seq]['cov']), '-m', '{}'.format(self.read_spec[seq]['frag']), '-s', '{}'.format(self.read_spec[seq]['stdev']), - '-qs2', '-{}'.format(self.read_spec[seq]['qs2']), + '-qs2', '{}'.format(self.read_spec[seq]['qs2']), '-ss', '{}'.format(self.read_spec[seq]['ver']), '-o', '{}/fastq/{}{}/{}{}_'.format(self.outd, self.prefix, From 19188d23cca45ce095c6c8df7eabeff04be8123a Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Thu, 5 Dec 2019 14:51:55 -0500 Subject: [PATCH 6/7] Tidying up some code and fix for example config --- example.config | 4 ++-- treetoreads.py | 45 +++++++++++++++++++++------------------------ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/example.config b/example.config index cb03d19..3b91980 100644 --- a/example.config +++ b/example.config @@ -27,7 +27,7 @@ exponential_mean = 125 #Minimum allowed value = 2 ## Read length, can be one value or list of comma separated values up to 250bp ## Read length may be truncated if selected read platform doesn't support it -read_length = 250 +read_length = 150,250 ## Insert fragment length. Can be uniform [fragment_size] ## or randomly distributed between [frag_low, frag_high] @@ -57,7 +57,7 @@ coverage = 20 # qs2_step = 1 # Step size for binning QS2 bias # Platform read error profile used to generate reads -# readProfile = MSv3 # Read platform +readProfile = MSv3 # Read platform ## Supports the following (between read length) ## Defaults to MiSeq v3 to allow reads up to 250bp ## GA1 - GenomeAnalyzer I (1-44bp), GA2 - GenomeAnalyzer II (1-75bp) diff --git a/treetoreads.py b/treetoreads.py index 84afae5..ed7d7ee 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -182,7 +182,7 @@ def _check_args(self): } for arg in self.argdict: if self.argdict[arg] not in self.config: - sys.stderr.write("{} is missing from the config file".format(self.argdict[arg])) + sys.stderr.write("{} is missing from the config file\n".format(self.argdict[arg])) self._exit_handler() thread_cnt = self.get_arg('threads') if thread_cnt is None or thread_cnt == '0' or thread_cnt == '1': @@ -316,7 +316,7 @@ def _check_args(self): sys.stderr.write("No QS2 bias applied\n") if self.argdict['readProfile'] is None: - self.argdict['readProfile'] = 'MSv3' + self.argdict['readProfile'] = ['MSv3'] elif "," in self.argdict['readProfile']: profList = list() [profList.append(x.strip()) for x in self.argdict['readProfile'].split(",")] @@ -334,8 +334,7 @@ def _check_args(self): if self.argdict['read_length'] is not None: read_length = self.config['read_length'] if ',' in read_length: - r_l = list() - [r_l.append(int(x.strip())) for x in read_length.split(',')] + r_l = [int(x.strip()) for x in read_length.split(',')] read_length = r_l else: read_length = [int(read_length)] @@ -351,7 +350,9 @@ def _check_args(self): if rl > self.validProfiles[profile]: read_length[read_length.index(rl)] = validProfiles[profile] read_length_flag = True - self.argdict['read_length'] = read_length + self.argdict['read_length'] = read_length + else: + self.generate_reads if read_length_flag: sys.stderr.write("Read length greater than allowed by profile, setting to profile max\n".format(profile)) @@ -842,20 +843,19 @@ def simulate_reads(self, seq): call(artparam, stdout=open('{}/art_log'.format(self.outd), 'w'), stderr=open('{}/art_log'.format(self.outd), 'a')) # print("called {}".format(" ".join(artparam))) assert os.path.exists('{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, self.prefix, seq, self.prefix, seq)) - if self.gzip: - gzippar = [self.gzipProg, - '-f', - '{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq), - '{}/fastq/{}{}/{}{}_2.fq'.format(self.outd, - self.prefix, - seq, - self.prefix, - seq)] - call(gzippar) + gzippar = [self.gzipProg, + '-f', + '{}/fastq/{}{}/{}{}_1.fq'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq), + '{}/fastq/{}{}/{}{}_2.fq'.format(self.outd, + self.prefix, + seq, + self.prefix, + seq)] + call(gzippar) def run_art(self, coverage=None): """Runs ART to simulate reads from the simulated genomes""" @@ -880,7 +880,8 @@ def run_art(self, coverage=None): self.read_spec[seqnam] = {'cov' : real_cov, 'rl' : real_len} else: for seqnam in self.seqnames: - self.read_spec[seqnam] = {'cov' : self.argdict['coverage']} + real_len = random.sample(self.argdict['read_length'], 1)[0] + self.read_spec[seqnam] = {'cov' : self.argdict['coverage'], 'rl' : real_len} if self.argdict['fragment_size'] is None: frag_low = int(self.argdict['frag_low']) frag_hi = int(self.argdict['frag_high']) @@ -916,10 +917,6 @@ def run_art(self, coverage=None): self.read_spec[seqnam].update({'ver' : real_platform}) if not os.path.isdir("{}/fastq".format(self.outd)): os.mkdir("{}/fastq".format(self.outd)) - else: - read_length = 150 - for seqname in self.seqnames: - self.read_spec[seqnam]['rl'] = read_length if self.threads is None: for seq in self.seqnames: sys.stdout.write("Coverage is {}\n".format(self.read_spec[seq]['cov'])) From 6a4c331c0e5df344fd97f3a9c9b342411ebf1722 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Tue, 21 Jan 2020 08:16:01 -0500 Subject: [PATCH 7/7] Restructure pool to enable py2 and 3 compat --- treetoreads.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/treetoreads.py b/treetoreads.py index ed7d7ee..ea01ba3 100755 --- a/treetoreads.py +++ b/treetoreads.py @@ -67,6 +67,8 @@ def _exit_handler(self): sys.exit() else: pass + def __call__(self, x): + return self.simulate_reads(x) def test_deps(self): "Check that seq-gen and ART are installed," if call(['which', 'seq-gen'], stdout=open('/dev/null', 'w')) == 1: @@ -87,7 +89,7 @@ def test_deps(self): using 'pip install dendropy --upgrade'. Exiting\n''') self._exit_handler() - if call(['which', 'pigz'], stdout=open('/dev/null', 'w')) == 1: + if call(['which', 'pigz'], stdout=open('/dev/null', 'w')) == 0: self.gzipProg = "pigz" else: self.gzipProg = "gzip" @@ -928,9 +930,9 @@ def run_art(self, coverage=None): sys.stdout.write("Generating reads for {}\n".format(seq)) self.simulate_reads(seq) else: - sys.stdout.write("Simulating FASTQ reads") - with Pool(processes=self.threads) as pool: - results = pool.map(self.simulate_reads, self.seqnames) + sys.stdout.write("Simulating FASTQ reads\n") + pool = Pool(processes=self.threads) + results = pool.map(self, self.seqnames) sys.stdout.write("TreeToReads completed successfully!\n")