From 73f6808109ed363ed7fa5f8c9f0043cac15c2b88 Mon Sep 17 00:00:00 2001
From: Brent Pedersen <bpederse@gmail.com>
Date: Wed, 19 Feb 2020 13:19:22 -0700
Subject: [PATCH 1/2] proper simulations

---
 src/strpkg/call.nim    | 142 ++++++++++++++++++++---------------------
 src/strpkg/cluster.nim |  10 ++-
 2 files changed, 80 insertions(+), 72 deletions(-)

diff --git a/src/strpkg/call.nim b/src/strpkg/call.nim
index 84fbe0d..1bb29b9 100644
--- a/src/strpkg/call.nim
+++ b/src/strpkg/call.nim
@@ -24,6 +24,24 @@ import ./unpack
 export tread
 export Soft
 
+const n_sims = 100
+const each_n = 2
+proc string_hash(s:string, sim_i:int): uint64 {.inline.} =
+  # djb's string hash
+  result = 5381
+  for c in s:
+    result = ((result shl 5) + result) + c.uint64
+
+  for c in $sim_i:
+    result = ((result shl 5) + result) + c.uint64
+
+proc filter_sims[T:tread|Support](trs: seq[T], sim_i: int, each_n:int): seq[T] =
+  result = trs
+  if result.len == 0: return
+  randomize(sim_i + 1)
+  shuffle(result)
+  result = result[0..<max(1, int(result.len/each_n))]
+
 proc call_main*() =
   var p = newParser("strling call"):
     option("-f", "--fasta", help="path to fasta file")
@@ -143,6 +161,8 @@ proc call_main*() =
     # Parse -bounds.txt file of regions. These will also be genotyped
     bounds = parse_bounds(args.bounds, opts.targets)
     stderr.write_line &"Read {len(bounds)} bounds from {args.bounds}"
+  else:
+    quit "-b bounds file is required for this branch"
 
   # Merge loci and bounds, with loci overwriting bounds
   for bound in bounds.mitems:
@@ -159,96 +179,76 @@ proc call_main*() =
   for locus in loci:
     bounds.add(locus)
 
-  var unplaced_counts = initCountTable[string]()
-  var genotypes_by_repeat = initTable[string, seq[Call]]()
 
   # Assign STR reads to provided bounds and genotype them
-  for bound in bounds.mitems:
-    var str_reads = assign_reads_locus(bound, treads_by_tid_rep)
-    if bound.right - bound.left > 1000'u32:
-      stderr.write_line "large bounds:" & $bound & " skipping"
-      continue
-    var (spans, median_depth) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq)
-    if spans.len > 5_000:
-      when defined(debug):
-        stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping."
-      continue
-    if median_depth == -1:
-      continue
-
-    var gt = genotype(bound, str_reads, spans, opts, float(median_depth))
-
-    var canon_repeat = bound.repeat.canonical_repeat
-    if not genotypes_by_repeat.hasKey(canon_repeat):
-      genotypes_by_repeat[canon_repeat] = @[]
-    genotypes_by_repeat[canon_repeat].add(gt)
-
-    #var estimate = spans.estimate_size(frag_dist)
-    bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth
-    when defined(debug):
-      for s in spans:
-        span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name)
-      var locusid = bound.id(opts.targets)
-      for r in str_reads:
-        reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid
-
-  # Cluster remaining reads and genotype
-  var ci = 0
-
-  for key, treads in treads_by_tid_rep.mpairs:
-    ## treads are for the single tid, repeat unit defined by key
-    for c in treads.cluster(max_dist=opts.window.uint32, min_supporting_reads=opts.min_support):
-      if c.reads[0].tid == -1:
-        unplaced_counts[c.reads[0].repeat.tostring] = c.reads.len
-        continue
+  t0 = cpuTime()
+
+  var spansByBound = initTable[Bounds, seq[Support]]()
+  var treadsByBound = initTable[Bounds, seq[tread]]()
+  var medByBound = initTable[Bounds, int]()
+  for sim_i in 0..<n_sims:
+    echo "sim:", sim_i, " time:", cpuTime() - t0
+    t0 = cpuTime()
+    var unplaced_counts = initCountTable[string]()
+    var genotypes_by_repeat = initTable[string, seq[Call]]()
 
-      var b: Bounds
-      var good_cluster: bool
-      (b, good_cluster) = bounds(c, min_clip, min_clip_total)
-      if good_cluster == false:
+    for bound in bounds.mitems:
+
+      if bound.right - bound.left > 1000'u32:
+        stderr.write_line "large bounds:" & $bound & " skipping"
         continue
 
-      var (spans, median_depth) = ibam.spanners(b, opts.window, frag_dist, opts.min_mapq)
+      if bound notin treadsByBound:
+        treadsByBound[bound] = assign_reads_locus(bound, treads_by_tid_rep)
+      var str_reads = treadsByBound[bound].filter_sims(sim_i, each_n)
+
+      if bound notin spansByBound:
+        var (s, md) = ibam.spanners(bound, opts.window, frag_dist, opts.min_mapq)
+        spansByBound[bound] = s
+        medByBound[bound] = md
+
+      var spans = spansByBound[bound].filter_sims(sim_i, each_n)
+      var median_depth = medByBound[bound]
+
       if spans.len > 5_000:
         when defined(debug):
-          stderr.write_line &"High depth for bound {opts.targets[b.tid].name}:{b.left}-{b.right} got {spans.len} pairs. Skipping."
+          stderr.write_line &"High depth for bound {opts.targets[bound.tid].name}:{bound.left}-{bound.right} got {spans.len} pairs. Skipping."
         continue
       if median_depth == -1:
         continue
 
-      var gt = genotype(b, c.reads, spans, opts, float(median_depth))
+      var gt = genotype(bound, str_reads, spans, opts, float(median_depth))
 
-      var canon_repeat = b.repeat.canonical_repeat
+      var canon_repeat = bound.repeat.canonical_repeat
       if not genotypes_by_repeat.hasKey(canon_repeat):
         genotypes_by_repeat[canon_repeat] = @[]
       genotypes_by_repeat[canon_repeat].add(gt)
 
       #var estimate = spans.estimate_size(frag_dist)
-      bounds_fh.write_line b.tostring(opts.targets) & "\t" & $median_depth
-
+      bounds_fh.write_line bound.tostring(opts.targets) & "\t" & $median_depth
       when defined(debug):
         for s in spans:
-          span_fh.write_line s.tostring(b, opts.targets[b.tid].name)
-        for s in c.reads:
-          reads_fh.write_line s.tostring(opts.targets) & "\t" & $ci
-      ci += 1
-
-  # Loop through again and refine genotypes for loci that are the only
-  # large expansion with that repeat unit
-  for repeat, genotypes in genotypes_by_repeat:
-    var gt_expanded: seq[Call]
-    for gt in genotypes:
-      if gt.is_large:
-        gt_expanded.add(gt)
-        if gt_expanded.len > 1:
-          break
-    if gt_expanded.len == 1:
-      gt_expanded[0].update_genotype(unplaced_counts[repeat])
-    for gt in genotypes:
-      gt_fh.write_line gt.tostring()
-
-  for repeat, count in unplaced_counts:
-      unplaced_fh.write_line &"{repeat}\t{count}"
+          span_fh.write_line s.tostring(bound, opts.targets[bound.tid].name)
+        var locusid = bound.id(opts.targets)
+        for r in str_reads:
+          reads_fh.write_line r.tostring(opts.targets) & "\t" & $locusid
+
+    # Loop through again and refine genotypes for loci that are the only
+    # large expansion with that repeat unit
+    for repeat, genotypes in genotypes_by_repeat:
+      var gt_expanded: seq[Call]
+      for gt in genotypes:
+        if gt.is_large:
+          gt_expanded.add(gt)
+          if gt_expanded.len > 1:
+            break
+      if gt_expanded.len == 1:
+        gt_expanded[0].update_genotype(unplaced_counts[repeat])
+      for gt in genotypes:
+        gt_fh.write_line gt.tostring()
+
+    for repeat, count in unplaced_counts:
+        unplaced_fh.write_line &"{repeat}\t{count}"
 
   gt_fh.close
   bounds_fh.close
diff --git a/src/strpkg/cluster.nim b/src/strpkg/cluster.nim
index b82f643..62fcf7d 100644
--- a/src/strpkg/cluster.nim
+++ b/src/strpkg/cluster.nim
@@ -82,6 +82,14 @@ type Bounds* = object
   name*: string
   force_report*: bool
 
+import hashes
+proc hash*(b:Bounds): Hash =
+  var h:Hash = 0
+  h = h !& b.tid
+  h = h !& b.left.int32
+  h = h !& b.right.int32
+  result = !$h
+
 const bounds_header* = "#chrom\tleft\tright\trepeat\tname\tleft_most\tright_most\tcenter_mass\tn_left\tn_right\tn_total"
 
 proc `==`(a,b: Bounds): bool =
@@ -135,7 +143,7 @@ proc parse_bed*(f:string, targets: seq[Target], window: uint32): seq[Bounds] =
 # Parse single line of a STRling bounds file
 proc parse_boundsline*(l:string, targets: seq[Target]): Bounds =
   var l_split = l.split("\t")
-  if len(l_split) != 11:
+  if len(l_split) notin [11, 12]:
     quit fmt"Error reading loci bed file. Expected 11 fields and got {len(l_split)} on line: {l}"
   result.tid = int32(get_tid(l_split[0], targets))
   result.left = uint32(parseInt(l_split[1]))

From df533a9a0c92fe9374a9d93dca6541ac87c60346 Mon Sep 17 00:00:00 2001
From: Harriet Dashnow <h.dashnow@gmail.com>
Date: Tue, 3 Mar 2020 11:42:41 -0700
Subject: [PATCH 2/2] update sim to a random non-STR site and assume no spans
 or read files will be produced by STRling

---
 sim/combine_random_sim_results.py | 26 ++++++++++++--------------
 sim/sim_shared.groovy             |  3 +--
 sim/simulate_random.groovy        | 12 ++++++------
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/sim/combine_random_sim_results.py b/sim/combine_random_sim_results.py
index 405b915..139d861 100644
--- a/sim/combine_random_sim_results.py
+++ b/sim/combine_random_sim_results.py
@@ -99,9 +99,8 @@ def main():
     parse_bounds(boundsfiles).to_csv(args.out + '-bounds.csv', index=False)
 
     spansfiles = glob.glob(args.str_dir+"/*-spanning.txt")
-    if len(spansfiles) == 0:
-        sys.exit('ERROR: No -spanning.txt files found in the given directory: ' + args.str_dir)
-    parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)
+    if len(spansfiles) >= 0:
+        parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)
 
     unplacedfiles = glob.glob(args.str_dir+"/*-unplaced.txt")
     if len(unplacedfiles) == 0:
@@ -109,17 +108,16 @@ def main():
     parse_unplaced(unplacedfiles).to_csv(args.out + '-unplaced.csv', index=False)
 
     readsfiles = glob.glob(args.str_dir+"/*-reads.txt")
-    if len(readsfiles) == 0:
-        sys.exit('ERROR: No -reads.txt files found in the given directory: ' + args.str_dir)
-    all_df_reads = []
-    for f in readsfiles:
-        df_reads = pd.read_csv(f, sep='\t')
-        df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
-        df_reads['sim'] = get_sim_str(f)
-        all_df_reads.append(df_reads)
-    all_df = pd.concat(all_df_reads)
-    all_df = all_df.merge(bed_df, how='left', on='sim')
-    all_df.to_csv(args.out + '-results.csv', index=False)
+    if len(readsfiles) >= 0:
+        all_df_reads = []
+        for f in readsfiles:
+            df_reads = pd.read_csv(f, sep='\t')
+            df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
+            df_reads['sim'] = get_sim_str(f)
+            all_df_reads.append(df_reads)
+        all_df = pd.concat(all_df_reads)
+        all_df = all_df.merge(bed_df, how='left', on='sim')
+        all_df.to_csv(args.out + '-results.csv', index=False)
 
 if __name__ == "__main__":
     main()
diff --git a/sim/sim_shared.groovy b/sim/sim_shared.groovy
index 30457ce..e82281d 100644
--- a/sim/sim_shared.groovy
+++ b/sim/sim_shared.groovy
@@ -146,8 +146,7 @@ str_call = {
     def sample = branch.name.prefix
 
     from (sample + ".str.bin", sample + ".bam", "strling-bounds.txt") produce(
-            sample + "-reads.txt", sample + "-bounds.txt",
-            sample + "-spanning.txt", sample + "-unplaced.txt",
+            sample + "-bounds.txt", sample + "-unplaced.txt",
             sample + "-genotype.txt") {
     
         def bamname = get_fname(input.bam)
diff --git a/sim/simulate_random.groovy b/sim/simulate_random.groovy
index 055a2cb..56f8e39 100644
--- a/sim/simulate_random.groovy
+++ b/sim/simulate_random.groovy
@@ -13,11 +13,11 @@ mutate_locus = {
     produce('*.bed') {
         exec """
             $PYTHON $TOOLS/random_str_alleles.py
-                --locus "4   3076604 3076695 CAG"
+                --locus "7 117143769 117143769 CAG"
                 --out $dir/
-                --num 100
-                --min -5
-                --max 200
+                --num 150
+                --min 0
+                --max 600
                 --fixed 0
                 --seed 7
         """
@@ -36,7 +36,7 @@ simulate_str_reads = {
         ~/storage/git/STRling/src/strpkg/simulate_reads
             --fasta $REF
             --output $output.prefix
-            $input.hist
+            $input.cram
             $input.bed
     """
 }
@@ -44,7 +44,7 @@ simulate_str_reads = {
 combine = {
 
         exec """
-            $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out HTT
+            $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out sim
         """
 }