From 73f6808109ed363ed7fa5f8c9f0043cac15c2b88 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Wed, 19 Feb 2020 13:19:22 -0700 Subject: [PATCH 1/2] proper simulations --- src/strpkg/call.nim | 142 ++++++++++++++++++++--------------------- src/strpkg/cluster.nim | 10 ++- 2 files changed, 80 insertions(+), 72 deletions(-) diff --git a/src/strpkg/call.nim b/src/strpkg/call.nim index 84fbe0d..1bb29b9 100644 --- a/src/strpkg/call.nim +++ b/src/strpkg/call.nim @@ -24,6 +24,24 @@ import ./unpack export tread export Soft +const n_sims = 100 +const each_n = 2 +proc string_hash(s:string, sim_i:int): uint64 {.inline.} = + # djb's string hash + result = 5381 + for c in s: + result = ((result shl 5) + result) + c.uint64 + + for c in $sim_i: + result = ((result shl 5) + result) + c.uint64 + +proc filter_sims[T:tread|Support](trs: seq[T], sim_i: int, each_n:int): seq[T] = + result = trs + if result.len == 0: return + randomize(sim_i + 1) + shuffle(result) + result = result[0.. 1000'u32: - stderr.write_line "large bounds:" & $bound & " skipping" - continue - var (spans, median_depth) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq) - if spans.len > 5_000: - when defined(debug): - stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping." - continue - if median_depth == -1: - continue - - var gt = genotype(bound, str_reads, spans, opts, float(median_depth)) - - var canon_repeat = bound.repeat.canonical_repeat - if not genotypes_by_repeat.hasKey(canon_repeat): - genotypes_by_repeat[canon_repeat] = @[] - genotypes_by_repeat[canon_repeat].add(gt) - - #var estimate = spans.estimate_size(frag_dist) - bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth - when defined(debug): - for s in spans: - span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name) - var locusid = bound.id(opts.targets) - for r in str_reads: - reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid - - # Cluster remaining reads and genotype - var ci = 0 - - for key, treads in treads_by_tid_rep.mpairs: - ## treads are for the single tid, repeat unit defined by key - for c in treads.cluster(max_dist=opts.window.uint32, min_supporting_reads=opts.min_support): - if c.reads[0].tid == -1: - unplaced_counts[c.reads[0].repeat.tostring] = c.reads.len - continue + t0 = cpuTime() + + var spansByBound = initTable[Bounds, seq[Support]]() + var treadsByBound = initTable[Bounds, seq[tread]]() + var medByBound = initTable[Bounds, int]() + for sim_i in 0.. 1000'u32: + stderr.write_line "large bounds:" & $bound & " skipping" continue - var (spans, median_depth) = ibam.spanners(b, opts.window, frag_dist, opts.min_mapq) + if bound notin treadsByBound: + treadsByBound[bound] = assign_reads_locus(bound, treads_by_tid_rep) + var str_reads = treadsByBound[bound].filter_sims(sim_i, each_n) + + if bound notin spansByBound: + var (s, md) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq) + spansByBound[bound] = s + medByBound[bound] = md + + var spans = spansByBound[bound].filter_sims(sim_i, each_n) + var median_depth = medByBound[bound] + if spans.len > 5_000: when defined(debug): - stderr.write_line &"High depth for bound {opts.targets[b.tid].name}:{b.left}-{b.right} got {spans.len} pairs. Skipping." + stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping." continue if median_depth == -1: continue - var gt = genotype(b, c.reads, spans, opts, float(median_depth)) + var gt = genotype(bound, str_reads, spans, opts, float(median_depth)) - var canon_repeat = b.repeat.canonical_repeat + var canon_repeat = bound.repeat.canonical_repeat if not genotypes_by_repeat.hasKey(canon_repeat): genotypes_by_repeat[canon_repeat] = @[] genotypes_by_repeat[canon_repeat].add(gt) #var estimate = spans.estimate_size(frag_dist) - bounds_fh.write_line b.tostring(opts.targets) & "\t" & $median_depth - + bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth when defined(debug): for s in spans: - span_fh.write_line s.tostring(b, opts.targets[b.tid].name) - for s in c.reads: - reads_fh.write_line s.tostring(opts.targets) & "\t" & $ci - ci += 1 - - # Loop through again and refine genotypes for loci that are the only - # large expansion with that repeat unit - for repeat, genotypes in genotypes_by_repeat: - var gt_expanded: seq[Call] - for gt in genotypes: - if gt.is_large: - gt_expanded.add(gt) - if gt_expanded.len > 1: - break - if gt_expanded.len == 1: - gt_expanded[0].update_genotype(unplaced_counts[repeat]) - for gt in genotypes: - gt_fh.write_line gt.tostring() - - for repeat, count in unplaced_counts: - unplaced_fh.write_line &"{repeat}\t{count}" + span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name) + var locusid = bound.id(opts.targets) + for r in str_reads: + reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid + + # Loop through again and refine genotypes for loci that are the only + # large expansion with that repeat unit + for repeat, genotypes in genotypes_by_repeat: + var gt_expanded: seq[Call] + for gt in genotypes: + if gt.is_large: + gt_expanded.add(gt) + if gt_expanded.len > 1: + break + if gt_expanded.len == 1: + gt_expanded[0].update_genotype(unplaced_counts[repeat]) + for gt in genotypes: + gt_fh.write_line gt.tostring() + + for repeat, count in unplaced_counts: + unplaced_fh.write_line &"{repeat}\t{count}" gt_fh.close bounds_fh.close diff --git a/src/strpkg/cluster.nim b/src/strpkg/cluster.nim index b82f643..62fcf7d 100644 --- a/src/strpkg/cluster.nim +++ b/src/strpkg/cluster.nim @@ -82,6 +82,14 @@ type Bounds* = object name*: string force_report*: bool +import hashes +proc hash*(b:Bounds): Hash = + var h:Hash = 0 + h = h !& b.tid + h = h !& b.left.int32 + h = h !& b.right.int32 + result = !$h + const bounds_header* = "#chrom\tleft\tright\trepeat\tname\tleft_most\tright_most\tcenter_mass\tn_left\tn_right\tn_total" proc `==`(a,b: Bounds): bool = @@ -135,7 +143,7 @@ proc parse_bed*(f:string, targets: seq[Target], window: uint32): seq[Bounds] = # Parse single line of a STRling bounds file proc parse_boundsline*(l:string, targets: seq[Target]): Bounds = var l_split = l.split("\t") - if len(l_split) != 11: + if len(l_split) notin [11, 12]: quit fmt"Error reading loci bed file. Expected 11 fields and got {len(l_split)} on line: {l}" result.tid = int32(get_tid(l_split[0], targets)) result.left = uint32(parseInt(l_split[1])) From df533a9a0c92fe9374a9d93dca6541ac87c60346 Mon Sep 17 00:00:00 2001 From: Harriet Dashnow Date: Tue, 3 Mar 2020 11:42:41 -0700 Subject: [PATCH 2/2] update sim to a random non-STR site and assume no spans or read files will be produced by STRling --- sim/combine_random_sim_results.py | 26 ++++++++++++-------------- sim/sim_shared.groovy | 3 +-- sim/simulate_random.groovy | 12 ++++++------ 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/sim/combine_random_sim_results.py b/sim/combine_random_sim_results.py index 405b915..139d861 100644 --- a/sim/combine_random_sim_results.py +++ b/sim/combine_random_sim_results.py @@ -99,9 +99,8 @@ def main(): parse_bounds(boundsfiles).to_csv(args.out + '-bounds.csv', index=False) spansfiles = glob.glob(args.str_dir+"/*-spanning.txt") - if len(spansfiles) == 0: - sys.exit('ERROR: No -spanning.txt files found in the given directory: ' + args.str_dir) - parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False) + if len(spansfiles) >= 0: + parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False) unplacedfiles = glob.glob(args.str_dir+"/*-unplaced.txt") if len(unplacedfiles) == 0: @@ -109,17 +108,16 @@ def main(): parse_unplaced(unplacedfiles).to_csv(args.out + '-unplaced.csv', index=False) readsfiles = glob.glob(args.str_dir+"/*-reads.txt") - if len(readsfiles) == 0: - sys.exit('ERROR: No -reads.txt files found in the given directory: ' + args.str_dir) - all_df_reads = [] - for f in readsfiles: - df_reads = pd.read_csv(f, sep='\t') - df_reads.rename(columns={"#chrom": "chrom"}, inplace=True) - df_reads['sim'] = get_sim_str(f) - all_df_reads.append(df_reads) - all_df = pd.concat(all_df_reads) - all_df = all_df.merge(bed_df, how='left', on='sim') - all_df.to_csv(args.out + '-results.csv', index=False) + if len(readsfiles) >= 0: + all_df_reads = [] + for f in readsfiles: + df_reads = pd.read_csv(f, sep='\t') + df_reads.rename(columns={"#chrom": "chrom"}, inplace=True) + df_reads['sim'] = get_sim_str(f) + all_df_reads.append(df_reads) + all_df = pd.concat(all_df_reads) + all_df = all_df.merge(bed_df, how='left', on='sim') + all_df.to_csv(args.out + '-results.csv', index=False) if __name__ == "__main__": main() diff --git a/sim/sim_shared.groovy b/sim/sim_shared.groovy index 30457ce..e82281d 100644 --- a/sim/sim_shared.groovy +++ b/sim/sim_shared.groovy @@ -146,8 +146,7 @@ str_call = { def sample = branch.name.prefix from (sample + ".str.bin", sample + ".bam", "strling-bounds.txt") produce( - sample + "-reads.txt", sample + "-bounds.txt", - sample + "-spanning.txt", sample + "-unplaced.txt", + sample + "-bounds.txt", sample + "-unplaced.txt", sample + "-genotype.txt") { def bamname = get_fname(input.bam) diff --git a/sim/simulate_random.groovy b/sim/simulate_random.groovy index 055a2cb..56f8e39 100644 --- a/sim/simulate_random.groovy +++ b/sim/simulate_random.groovy @@ -13,11 +13,11 @@ mutate_locus = { produce('*.bed') { exec """ $PYTHON $TOOLS/random_str_alleles.py - --locus "4 3076604 3076695 CAG" + --locus "7 117143769 117143769 CAG" --out $dir/ - --num 100 - --min -5 - --max 200 + --num 150 + --min 0 + --max 600 --fixed 0 --seed 7 """ @@ -36,7 +36,7 @@ simulate_str_reads = { ~/storage/git/STRling/src/strpkg/simulate_reads --fasta $REF --output $output.prefix - $input.hist + $input.cram $input.bed """ } @@ -44,7 +44,7 @@ simulate_str_reads = { combine = { exec """ - $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out HTT + $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out sim """ }