From 1565b5a18f3e9a824d74991df996f62b93190b49 Mon Sep 17 00:00:00 2001
From: hevmarriott <71556548+hevmarriott@users.noreply.github.com>
Date: Thu, 3 Dec 2020 11:31:10 +0000
Subject: [PATCH 1/2] Update DNAscan.py

1. Changed formatting of @RG fields so it is compatible with BWA-MEM
2. Updated intensive mode so compatible with GATK 4.1.9.0 (jar version) with HaplotypeCaller, and changed deprecated CombineVariants to MergeVcfs
3. Updating clinvar in ALSGeneScanner to clinvar_20200316
4. Other minor changes include:
    - making it possible to generate several java services at the same time i.e.  sequencing report in intensive mode
    - changing annotation to be compatible with hg38
    - debugging minor errors in script

Extra Notes
- If you want to use the provided variant catalog.json for ExpansionHunter with test data, you need to open the file and delete the first 2-3 lines before the main json content for it to work
- The default filter_string option is too strict for the test data, therefore need to add -filter "custom options" as highlighted in the DNAscan README
---
 scripts/DNAscan.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/scripts/DNAscan.py b/scripts/DNAscan.py
index 961e769..45361bb 100644
--- a/scripts/DNAscan.py
+++ b/scripts/DNAscan.py
@@ -589,7 +589,7 @@
 
     annovar_operations = "g,f,f"
 
-    annovar_protocols = "refGene,dbnsfp30a,clinvar_20170905,"
+    annovar_protocols = "refGene,dbnsfp30a,clinvar_20200316"
     
 # Y. adapt DB to reference
 
@@ -852,7 +852,7 @@
                     rg_option_hisat2 = " --rg-id %s --rg LB:%s --rg PL:%s  --rg PU:%s --rg SM:%s " % (
                         RG_ID, RG_LB, RG_PL, RG_PU, RG_SM)
 
-                    rg_option_bwa = " -R '@RG\tID:%s\tLB:%s\tPL:%s\tRGPU:%s\tSM:%s' " % (
+                    rg_option_bwa = " -R '@RG\\tID:%s\\tLB:%s\\tPL:%s\\tPU:%s\\tSM:%s' " % (
                         RG_ID, RG_LB, RG_PL, RG_PU, RG_SM)
 
                 else:
@@ -946,12 +946,14 @@
                     rg_option_hisat2 = " --rg-id %s --rg LB:%s --rg PL:%s  --rg PU:%s --rg SM:%s " % (
                         RG_ID, RG_LB, RG_PL, RG_PU, RG_SM)
 
-                    rg_option_bwa = " -R '@RG\tID:%s\tLB:%s\tPL:%s\tRGPU:%s\tSM:%s' " % (
+                    rg_option_bwa = " -R '@RG\\tID:%s\\tLB:%s\\tPL:%s\\tPU:%s\\tSM:%s' " % (
                         RG_ID, RG_LB, RG_PL, RG_PU, RG_SM)
 
                 else:
 
-                    rg_option = ""
+                    rg_option_hisat2 = ""
+                    
+                    rg_option_bwa = ""
 
                 os.system(
                     "%shisat2 %s --no-softclip --no-spliced-alignment -p %s -x %s -U %s | %s %ssamtools view -Sb -  | %ssambamba sort -t %s --tmpdir=%s -o %ssorted.bam /dev/stdin; %ssamtools index -@ %s %ssorted.bam"
@@ -1047,7 +1049,7 @@
                 "WARNING: The presence of VC.log in logs is telling you that the variant calling was already peformed, please remove VC.log if you wish to perform this stage anyway\n"
             )
 
-            variant_results_file = "%sresults/%s_sorted.vcf.gz" % (out,
+            variant_results_file = "%s%s_sorted.vcf.gz" % (out,
                                                                    sample_name)
 
         else:
@@ -1107,7 +1109,7 @@
 
                 while counter < int(num_cpu) + 1:
 
-                    command = "%sjava -jar %sGenomeAnalysisTK.jar %s -R %s -T HaplotypeCaller -I %s -L %smpileup_positions%s.bed -o %sgatk_indels%s.vcf" % (
+                    command = "%sjava -jar %sgatk-package-4.1.9.0-local.jar %s HaplotypeCaller -R %s -I %s -L %smpileup_positions%s.bed -O %sgatk_indels%s.vcf" % (
                         path_java, path_gatk, gatk_HC_custom_options, path_reference, bam_file, out,
                         str(counter), out, str(counter))
 
@@ -1182,7 +1184,7 @@
                         "%svcftools  --vcf %sfreebayes.vcf --minGQ 30 --minDP 2 --exclude-bed %smpileup_positions.bed  --recode --recode-INFO-all --out %sSNPs_only"
                         % (path_vcftools, out, out, out))
 
-                    os.system("%sSNPs_only.log" % (out))
+                    os.system("touch %sSNPs_only.log" % (out))
 
                     os.system(
                         "bgzip  %sSNPs_only.recode.vcf ; bgzip %sindels_only.recode.vcf "
@@ -1193,7 +1195,7 @@
                         % (path_tabix, out, path_tabix, out))
 
                     os.system(
-                        "%sjava -jar %sGenomeAnalysisTK.jar -T CombineVariants  -minimalVCF -R %s --variant %sSNPs_only.recode.vcf.gz --variant %sindels_only.recode.vcf.gz -o  %s%s.vcf --genotypemergeoption UNSORTED"
+                        "%sjava -jar %sgatkpackage-4.1.9.0-local.jar MergeVcfs -R %s -I %sSNPs_only.recode.vcf.gz -I %sindels_only.recode.vcf.gz -O  %s%s.vcf "
                         % (path_java, path_gatk, path_reference, out, out, out,
                            sample_name))
 
@@ -1350,11 +1352,11 @@
                reference, annovar_protocols, annovar_operations, out))
         if not debug and not alsgenescanner:
             os.system(
-                "rm %sannovar.vcf.hg19_multianno.txt %sannovar.vcf.avinput" %
+                "rm %sannovar.vcf.hg38_multianno.txt %sannovar.vcf.avinput" %
                 (out, out))
 
         os.system(
-            "mv %s/annovar.vcf.hg19_multianno.vcf %sresults/%s_annotated.vcf ; bgzip -f %sresults/%s_annotated.vcf ; %stabix -fp vcf %sresults/%s_annotated.vcf.gz"
+            "mv %s/annovar.vcf.hg38_multianno.vcf %sresults/%s_annotated.vcf ; bgzip -f %sresults/%s_annotated.vcf ; %stabix -fp vcf %sresults/%s_annotated.vcf.gz"
             % (out, out, sample_name, out, sample_name, path_tabix, out,
                sample_name))
 
@@ -1373,7 +1375,7 @@
 
         os.system("mv %s* %sresults/" % (variant_results_file, out))
 
-        variant_results_file = "%sresults/%s_sorted.vcf.gz" % (out,
+        variant_results_file = "%s%s_sorted.vcf.gz" % (out,
                                                                sample_name)
 
 # 15. Microbes screening
@@ -1599,7 +1601,7 @@
 
         if path_java != "":
 
-            java_option = "-j " + path_java + " "
+            java_option = "-j " + path_java + "java"
 
         else:
 
@@ -1742,7 +1744,7 @@
         os.system("touch  %slogs/iobio.log" % (out))
 
         print(
-            "\n\nIobio serces have been started at http://localhost:%s\n\nCopy and paste http://localhost:%s to select the service (vcf, bam, gene) and upload your data into the selected service\n\nIf you want to explore your variant calling results please copy and paste the following URL into your browser and upload the vcf file (../%sresults/%s_sorted.vcf.gz):\n\n"
+            "\n\nIobio services have been started at http://localhost:%s\n\nCopy and paste http://localhost:%s to select the service (vcf, bam, gene) and upload your data into the selected service\n\nIf you want to explore your variant calling results please copy and paste the following URL into your browser and upload the vcf file (../%s%s_sorted.vcf.gz):\n\n"
             % (port_num, port_num, out, sample_name),
             end='',
             flush=True)
@@ -1783,7 +1785,7 @@
 if alsgenescanner:
 
     os.system(
-        "python3 %s/alsgenescanner.py %s/annovar.vcf.hg19_multianno.txt %s/results/%s_alsgenescanner_all.txt"
+        "python3 %s/alsgenescanner.py %s/annovar.vcf.hg38_multianno.txt %s/results/%s_alsgenescanner_all.txt"
         % (path_scripts, out, out, sample_name))
     os.system(
         "cat %s/results/%s_alsgenescanner_all.txt | head -1 > %s/results/%s_alsgenescanner_alsod.txt; cat %s/results/%s_alsgenescanner_all.txt | grep -iwf %s/list_genes_alsod.txt >> %s/results/%s_alsgenescanner_alsod.txt"

From f24a0f3ad756d5260fa219c76396994e3fa66dcb Mon Sep 17 00:00:00 2001
From: hevmarriott <71556548+hevmarriott@users.noreply.github.com>
Date: Thu, 3 Dec 2020 12:02:01 +0000
Subject: [PATCH 2/2] Update analyse_list_of_samples.py

1. Paths module replaced with paths_configs
2. Python path removed and replaced with python3 in command line
---
 scripts/analyse_list_of_samples.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/scripts/analyse_list_of_samples.py b/scripts/analyse_list_of_samples.py
index f9f7d41..0be647d 100644
--- a/scripts/analyse_list_of_samples.py
+++ b/scripts/analyse_list_of_samples.py
@@ -16,20 +16,18 @@
 #   5.3 Run DNAscan for one sample
 ################################################################
 
-import argparse , os  ,  paths , os.path 
+import argparse , os, paths_configs , os.path 
 
 from argparse import RawTextHelpFormatter
 
 # 2. Define paths viriables
 
-python_path = paths.python_path
-
-dnascan_dir = paths.dnascan_dir
+dnascan_dir = paths_configs.dnascan_dir
 
 
 # 3. Define options from command line
 
-parser = argparse.ArgumentParser(prog='python analyse_list_of_samples.py ', usage='%(prog)s -format "string" -paired "string" -sample_list "string" -out_dir "string" -option_string "string"', description = '############Help Message############ \n\nThis is a script to run DNAscan on a list of samples. Each line of the list must contain the path to one sample. If samples are in paired reads in fastq format and you have two files per sample, these will have to be on the same line spaced bt a tab.\n\n E.g. sample.1.fq.gz  sample.2.fq.gz\n\nDNAscan uses the file paths.py to locate the needed tools and files. Please make sure your paths.py file is properly filled \n\nUsage example: \n\npython alalyse_list_of_files.py -option_string "-format fastq -mode intensive -reference hg19 -alignment -variantcalling -annotation" -out_dir /path/to/dir -sample_list list.txt -format bam\n\nPlease check the following list of required options\n\n################################################', formatter_class=RawTextHelpFormatter)
+parser = argparse.ArgumentParser(prog='python3 analyse_list_of_samples.py ', usage='%(prog)s -format "string" -paired "string" -sample_list "string" -out_dir "string" -option_string "string"', description = '############Help Message############ \n\nThis is a script to run DNAscan on a list of samples. Each line of the list must contain the path to one sample. If samples are in paired reads in fastq format and you have two files per sample, these will have to be on the same line spaced bt a tab.\n\n E.g. sample.1.fq.gz  sample.2.fq.gz\n\nDNAscan uses the file paths.py to locate the needed tools and files. Please make sure your paths.py file is properly filled \n\nUsage example: \n\npython alalyse_list_of_files.py -option_string "-format fastq -mode intensive -reference hg19 -alignment -variantcalling -annotation" -out_dir /path/to/dir -sample_list list.txt -format bam\n\nPlease check the following list of required options\n\n################################################', formatter_class=RawTextHelpFormatter)
 
 requiredNamed = parser.add_argument_group('required named arguments')
 
@@ -72,7 +70,7 @@
     
     if paired == "1" and format == "fastq" :
         
-        input_file_string = "-in %s -in2 %s" %(sample.split('\t')[0] , sample.split('\t')[1].strip())
+        input_file_string = "-in %s -in2 %s" %( sample.split('\t')[0] , sample.split('\t')[1].strip() )
                 
         sample_name = sample.split('\t')[0].split("/")[-1].split("1.f")[-2]
         
@@ -88,6 +86,6 @@
     
     # 5.3 Run DNAscan for one sample
     
-    os.system( "%s %sDNAscan.py %s -sample_name %s %s -out %s/%s/ " %( python_path , dnascan_dir , option_string , sample_name , input_file_string , out_dir , sample_name) )
+    os.system( "python3 %sDNAscan.py %s -sample_name %s %s -out %s/%s/ " %( dnascan_dir , option_string , sample_name , input_file_string , out_dir , sample_name) )
+        
         
-        
\ No newline at end of file