From 4052204a7b87dcad182037aaa8812cc2338555b0 Mon Sep 17 00:00:00 2001
From: Jemma Nelson
Date: Mon, 18 Apr 2022 16:07:26 -0700
Subject: [PATCH 001/172] laneprocess.py uses correct SAMPLE_NAME

Logic now matches that seen in the rest of our pipeline - prefer using the
alignment's sample_name, and fall back to constructing it manually only when
necessary.

This should resolve the collation issues that have been dogging us this year.
---
 scripts/laneprocess.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py
index 7e8f85bf..5ed3d1fe 100644
--- a/scripts/laneprocess.py
+++ b/scripts/laneprocess.py
@@ -211,11 +211,17 @@ def create_script(self, processing_info):
         fastq_directory = lane["directory"]
         barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index']
-        spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane'])
+        try:
+            # Preferred name
+            spreadsheet_name = lane['alignments'][0]['sample_name']
+        except (KeyError, IndexError):
+            # Fallback method, doesn't always have the same barcode string
+            spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane'])
+            logging.warning("No alignment sample_name for lane, using %s instead" % spreadsheet_name)
 
         if not os.path.exists(fastq_directory):
             logging.critical("fastq directory %s does not exist, cannot continue" % fastq_directory)
-            return False
+            return False
 
         script_file = os.path.join(
             fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname)
         )

From 621b345321386d316723417f3b71fb3367f0d03d Mon Sep 17 00:00:00 2001
From: solexa
Date: Sun, 24 Apr 2022 13:03:04 -0700
Subject: [PATCH 002/172] apply fix to right file, mark old file deprecated

---
 scripts/apilaneprocess.py | 8 +++++++-
 scripts/laneprocess.py    | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py
index 6d295432..16dc98be 100644
--- a/scripts/apilaneprocess.py
+++ b/scripts/apilaneprocess.py
@@ -183,7 +183,13 @@ def create_script(self, processing_info):
         fastq_directory = os.path.join(alt_dir, "fastq", "Project_%s" % lane["project"],
             "Sample_%s" % lane["samplesheet_name"])
         barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index']
-        spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane'])
+        try:
+            # Preferred name
+            spreadsheet_name = lane['alignments'][0]['sample_name']
+        except (KeyError, IndexError):
+            # Fallback method, doesn't always have the same barcode string
+            spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane'])
+            logging.warning("No alignment sample_name for lane, using %s instead" % spreadsheet_name)
 
         if not os.path.exists(fastq_directory):
             logging.critical("fastq directory %s does not exist, cannot continue" % fastq_directory)
diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py
index 5ed3d1fe..3edd0d76 100644
--- a/scripts/laneprocess.py
+++ b/scripts/laneprocess.py
@@ -1,3 +1,5 @@
+""" This script is deprecated! """
+
 import json
 import os
 import sys
@@ -11,6 +13,8 @@
 
 log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 
+logging.warn("This script is deprecated - consider using apilaneprocess.py instead!")
+
 STAMPIPES = os.getenv('STAMPIPES', '~/stampipes')
 
 script_options = {
@@ -214,6 +218,7 @@ def create_script(self, processing_info):
         try:
             # Preferred name
             spreadsheet_name = lane['alignments'][0]['sample_name']
+            logging.warning("Spreadsheet name: %s", spreadsheet_name)
         except (KeyError, IndexError):
             # Fallback method, doesn't always have the same barcode string
             spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane'])

From 58603ec48f4be83099ebc76ee2e9d8606aa635a2 Mon Sep 17 00:00:00 2001
From: Jemma Nelson
Date: Tue, 5 Jul 2022 13:38:06 -0700
Subject: [PATCH 003/172] Use CopyComplete.txt to start processing

CopyComplete.txt is a better signal that a flowcell is ready for processing
than RTAComplete.txt.

Older sequencers did not create CopyComplete.txt, I believe.
---
 scripts/copy_notify.py     |  2 +-
 scripts/flowcells/setup.sh | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py
index e35e2384..cf654ead 100755
--- a/scripts/copy_notify.py
+++ b/scripts/copy_notify.py
@@ -99,7 +99,7 @@ def check_copy(sequencer_folder):
 
     if flowcell_reads[sequencer_folder]:
         #copy_filename = copy_complete_filename % flowcell_reads[sequencer_folder]
-        return os.path.exists("%s/%s" % (sequencer_folder, "RTAComplete.txt"))
+        return os.path.exists("%s/%s" % (sequencer_folder, "CopyComplete.txt"))
     else:
        return False
 
diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh
index bcb3ca1a..205d1f0f 100644
--- a/scripts/flowcells/setup.sh
+++ b/scripts/flowcells/setup.sh
@@ -602,8 +602,8 @@ python3 "$STAMPIPES/scripts/lims/upload_data.py" \
 # Register as "Sequencing" in LIMS
 lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flowcell_run_status/2/"
 
-# Wait for RTAComplete
-while [ ! -e "$illumina_dir/RTAComplete.txt" ] ; do sleep 60 ; done
+# Wait for CopyComplete
+while [ ! -e "$illumina_dir/CopyComplete.txt" ] ; do sleep 60 ; done
 
 # Register as "Processing" in LIMS
 lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flowcell_run_status/3/"
@@ -649,8 +649,8 @@ python3 "$STAMPIPES/scripts/lims/upload_data.py" \
 # Register as "Sequencing" in LIMS
 lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flowcell_run_status/2/"
 
-# Wait for RTAComplete
-while [ ! -e "$illumina_dir/RTAComplete.txt" ] ; do sleep 60 ; done
+# Wait for CopyComplete
+while [ ! -e "$illumina_dir/CopyComplete.txt" ] ; do sleep 60 ; done
 
 # Register as "Processing" in LIMS
 lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flowcell_run_status/3/"
@@ -865,7 +865,7 @@ __BCL2FASTQ__
 
 fi
 
-if [ -e "RTAComplete.txt" ] ; then
+if [ -e "CopyComplete.txt" ] ; then
     echo -e "Setup complete. To kick everything off, type:\n\nbash run_bcl2fastq.sh"
 else
     echo -e "Setup complete, sequencing still in progress. To queue everything up, type:\n\nnohup bash run_bcl2fastq.sh &"

From 536ed632442f978ceaab7c5978251f685da040f9 Mon Sep 17 00:00:00 2001
From: Jemma Nelson
Date: Tue, 5 Jul 2022 13:44:34 -0700
Subject: [PATCH 004/172] Switch initial flowcell processing to hpcz-1

hpcz-2 was decommissioned, switching default queue for this.
--- scripts/flowcells/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 205d1f0f..cde25ae3 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -224,7 +224,7 @@ case $run_type in samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-2" + queue="hpcz-1" make_novaseq_samplesheet 2 > SampleSheet.csv bcl_tasks=1 @@ -250,7 +250,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-2" + queue="hpcz-1" make_novaseq_samplesheet 2 > SampleSheet.csv bcl_tasks=1 @@ -289,7 +289,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-2" + queue="hpcz-1" make_novaseq_samplesheet 4 > SampleSheet.csv bcl_tasks=1 @@ -328,7 +328,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-2" + queue="hpcz-1" make_novaseq_samplesheet 4 > SampleSheet.csv bcl_tasks=1 From 3b99fd3be66dde1d7a723c644a0cf7e54927deee Mon Sep 17 00:00:00 2001 From: solexa Date: Thu, 7 Jul 2022 12:45:42 -0700 Subject: [PATCH 005/172] chore: update default queue names for Altius --- nextflow.config | 2 +- processes/bwa/process_bwa_paired.bash | 2 +- processes/bwa/process_bwa_unpaired.bash | 2 +- scripts/aggregateprocess.py | 2 +- scripts/alignprocess.py | 2 +- scripts/apilaneprocess.py | 2 +- scripts/flowcells/demux_flowcell.sh | 2 +- scripts/flowcells/setup.sh | 22 +++++++++++----------- scripts/laneprocess.py | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/nextflow.config b/nextflow.config index a2c67f4f..ef12186d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,7 +32,7 @@ profiles { cluster { process { executor = 'slurm' - queue = 'hpcz-1' + queue = 'hpcz-2' errorStrategy = { task.exitStatus == 143 ? 
'retry' : 'terminate' } //errorStrategy 'retry' maxRetries = 3 diff --git a/processes/bwa/process_bwa_paired.bash b/processes/bwa/process_bwa_paired.bash index c1f31b74..77a9fe03 100644 --- a/processes/bwa/process_bwa_paired.bash +++ b/processes/bwa/process_bwa_paired.bash @@ -15,7 +15,7 @@ module load pysam/0.9.0 FINAL_BAM=${SAMPLE_NAME}.sorted.bam UNIQUES_BAM=${SAMPLE_NAME}.uniques.sorted.bam -export QUEUE=queue2 +export QUEUE=queue0 export FASTQ_TMP=$ALIGN_DIR/fastq diff --git a/processes/bwa/process_bwa_unpaired.bash b/processes/bwa/process_bwa_unpaired.bash index 3bb20e47..65c3ed88 100644 --- a/processes/bwa/process_bwa_unpaired.bash +++ b/processes/bwa/process_bwa_unpaired.bash @@ -20,7 +20,7 @@ module load python/3.5.1 module load pysam/0.9.0 module load python/2.7.11 -export QUEUE=queue2 +export QUEUE=queue0 MAX_MISMATCHES=2 MIN_MAPPING_QUALITY=10 diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index 8ea6959d..f235d734 100644 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -24,7 +24,7 @@ "overwrite": False, "script_name": "run.bash", "qsub_prefix": ".agg", - "qsub_queue": "queue2", + "qsub_queue": "queue0", "dry_run": False, "aggregation_base_directory": None, "aggregation_directory": None, diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 4a96bf7f..ac7826e6 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -25,7 +25,7 @@ "tag": None, "outfile": os.path.join(os.getcwd(), "run.bash"), "sample_script_basename": "run.bash", - "qsub_queue": "queue2", + "qsub_queue": "queue0", "qsub_prefix": ".proc", "dry_run": False, "no_mask": False, diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index 16dc98be..c6a955ca 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -23,7 +23,7 @@ "outfile": os.path.join(os.getcwd(), "run.bash"), "sample_script_basename": "run.bash", "qsub_prefix": ".proc", - "queue": "queue2", + "queue": "queue0", "dry_run": False, "no_mask": False, "bases_mask": None, diff --git a/scripts/flowcells/demux_flowcell.sh b/scripts/flowcells/demux_flowcell.sh index 161b47ee..00efeba8 100755 --- a/scripts/flowcells/demux_flowcell.sh +++ b/scripts/flowcells/demux_flowcell.sh @@ -36,7 +36,7 @@ processing= LANE= dryrun= mismatches=0 -queue='queue2' +queue='queue0' while getopts ":hi:o:p:m:q:l:n" opt ; do case $opt in h) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index cde25ae3..ef8b8012 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -224,7 +224,7 @@ case $run_type in samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-1" + queue="hpcz-2" make_novaseq_samplesheet 2 > SampleSheet.csv bcl_tasks=1 @@ -250,7 +250,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-1" + queue="hpcz-2" make_novaseq_samplesheet 2 > SampleSheet.csv bcl_tasks=1 @@ -289,7 +289,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" - queue="hpcz-1" + queue="hpcz-2" make_novaseq_samplesheet 4 > SampleSheet.csv bcl_tasks=1 @@ -328,7 +328,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--novaseq" - queue="hpcz-1" + queue="hpcz-2" make_novaseq_samplesheet 4 > SampleSheet.csv bcl_tasks=1 @@ -368,7 +368,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--nextseq" - queue="queue2" + queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 @@ -406,7 +406,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--hiseq4k" - queue="queue2" + queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1-8 @@ -444,7 +444,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue2" + queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 set +e @@ -469,7 +469,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue2" + queue="queue0" minidemux="True" # placeholder cp /home/dchee7/projects/guide-seq/data/samplesheets/SampleSheet.csv SampleSheet.csv @@ -491,7 +491,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue2" + queue="queue0" minidemux="True" # placeholder cp /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv SampleSheet.csv @@ -513,7 +513,7 @@ _U_ samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue2" + queue="queue0" minidemux="True" # placeholder cat /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv > SampleSheet.csv @@ -850,7 +850,7 @@ bash fastqc.bash python3 "$STAMPIPES/scripts/alignprocess.py" \ --flowcell "$flowcell" \ --auto_aggregate \ - --qsub-queue queue2 \ + --qsub-queue queue0 \ --outfile run_alignments.bash # Set up of flowcell aggregations diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py index 3edd0d76..489fc2e9 100644 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -25,7 +25,7 @@ "outfile": os.path.join(os.getcwd(), "run.bash"), "sample_script_basename": "run.bash", "qsub_prefix": ".proc", - "queue": "queue2", + "queue": "queue0", "dry_run": False, "no_mask": False, "bases_mask": None, From 4c82c7c517e304fee2eec28de8a7ec51a704cf2c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 20 Jul 2022 10:29:14 -0700 Subject: [PATCH 006/172] Add module for bcl2fastq - contains samplesheet generation --- modules/bcl2fastq.nf | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 modules/bcl2fastq.nf diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf new file mode 100644 index 00000000..42d81e71 --- /dev/null +++ b/modules/bcl2fastq.nf @@ -0,0 +1,166 @@ +nextflow.enable.dsl=2 + +@Grab('com.xlson.groovycsv:groovycsv:1.3') +import com.xlson.groovycsv.CsvParser + +params.input_directory = "" +params.bcl2fastq_threads = 20 + +params.bcl2fastq_sample_config_tsv = "" +params.bases_mask = "" + +params.bcl2fastq_tiles = "*" + + +params.samplesheet_header = """[Header] + |Workflow,GenerateFASTQ""".stripMargin() + +// Functions + +def parse_sample_config(sample_config_tsv) { + def data = new CsvParser().parseCsv(sample_config_tsv, separator: "\t") + def sample_info = [] + for (sample in data) { + sample_info.add( sample ) + } + //sample_info = 
sample_info.collect { it[]} + for (s in sample_info) { + println "sample is ${s}" + } + return sample_info +} + +def validate_sample_config(sample_info) { + // Check for required info + sample_info.collect { + // println "Sample info: ${it}" + assert it.lane > 0 : "Sample has no lane: ${it}" + assert it.barcode_index.size() > 0 : "Sample has no barcode index: ${it}" + assert it.name : "Sample has name: ${it}" + } +} + + +workflow { + def txt = file(params.bcl2fastq_sample_config_tsv).text + def sample_info = parse_sample_config(txt) + validate_sample_config(sample_info) + + BCL2DEMUX( + params.input_directory, + Channel.from(sample_info), + "*", + ) + +} + +workflow test { + // Dunno yet how to test this... Maybe with a real flowcell, but only a couple of tiles of data? + def tiles = "s_1_0002" + def input_dir = "/net/seq/data2/sequencers/220627_A01698_0053_BH2TCJDSX5" + def txt = """name\tbarcode_index\tlane + |N701\tTAAGGCGA\t1 + |N702\tCGTACTAG\t1 + |N703\tAGGCAGAA\t1 + |N704\tTCCTGAGC\t1 + |N705\tGGACTCCT\t1 + |N706\tTAGGCATG\t1 + |N707\tCTCTCTAC\t1 + |N708\tCAGAGAGG\t1 + |N709\tGCTACGCT\t1 + |N710\tCGAGGCTG\t1 + |N711\tAAGAGGCA\t1 + |N712\tGTAGAGGA\t1""".stripMargin() + + def sample_info = parse_sample_config(txt) + validate_sample_config(sample_info) + + + BCL2DEMUX ( + input_dir, + sample_info, + tiles + ) + +} + + +workflow BCL2DEMUX { + + take: + illumina_dir + sample_info + tiles + // TODO + + main: + text = generate_samplesheet( [params.samplesheet_header, sample_info]) + + + //emit: + // TODO + +} + + +// process demux_python { + +// } + +process generate_samplesheet { + + input: + tuple val(header), val(sample_info) + + output: + file("Samplesheet.csv") + + shell: + sheet = [ + header, + "", + "[Settings]", + "Lane,SampleID,index", + *sample_info.collect { "${it.lane},${it.name},${it.barcode_index}" } + ].join("\n") + + + ''' + printf '!{sheet}' > Samplesheet.csv + ''' +} + +// process generate_samplesheet_nodemux { + +// } + + +process bcl2fastq { + + container "dceoy/bcl2fastq@sha256:6d7233f2160721d6cb62f77a127d499597f4b35bb435cc8265d05f5bf54c7b94" + + input: + tuple file(illumina_dir), file(samplesheet), val(tiles) + + output: + file("output/*") + + shell: + ''' + mkdir output + bcl2fastq \ + --input-dir "!{illumina_dir}/Data/Intensities/BaseCalls" \ + --samplesheet "${samplesheet}" \ + --use-bases-mask "!{bcl_mask}" \ + --output-dir "output/" \ + --barcode-mismatches "!{mismatches}" \ + --tiles "!{tiles}" \ + --loading-threads $(( SLURM_CPUS_PER_TASK / 2 )) \ + --writing-threads $(( SLURM_CPUS_PER_TASK / 2 )) \ + --processing-threads $(( SLURM_CPUS_PER_TASK )) + ''' +} + +// process bclconvert { + +// } From ca3a64d380e6aae3b86b6393814c921e452f6b2c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 24 Jul 2022 14:27:31 -0700 Subject: [PATCH 007/172] Add test of alt-seq pipeline --- modules/bcl2fastq.nf | 58 ++++++++-------- modules/cram.nf | 77 ++++++++++++++++++++- modules/utility.nf | 15 +++-- processes/altseq/altseq.nf | 133 +++++++++++++++++++++++++++++++++++++ third_party/fetch.sh | 6 ++ 5 files changed, 253 insertions(+), 36 deletions(-) create mode 100644 processes/altseq/altseq.nf create mode 100755 third_party/fetch.sh diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf index 42d81e71..6d13ed3c 100644 --- a/modules/bcl2fastq.nf +++ b/modules/bcl2fastq.nf @@ -13,17 +13,16 @@ params.bcl2fastq_tiles = "*" params.samplesheet_header = """[Header] + |Project Name,Stamlab |Workflow,GenerateFASTQ""".stripMargin() // Functions - def 
parse_sample_config(sample_config_tsv) { def data = new CsvParser().parseCsv(sample_config_tsv, separator: "\t") def sample_info = [] for (sample in data) { sample_info.add( sample ) } - //sample_info = sample_info.collect { it[]} for (s in sample_info) { println "sample is ${s}" } @@ -33,7 +32,6 @@ def parse_sample_config(sample_config_tsv) { def validate_sample_config(sample_info) { // Check for required info sample_info.collect { - // println "Sample info: ${it}" assert it.lane > 0 : "Sample has no lane: ${it}" assert it.barcode_index.size() > 0 : "Sample has no barcode index: ${it}" assert it.name : "Sample has name: ${it}" @@ -55,10 +53,10 @@ workflow { } workflow test { - // Dunno yet how to test this... Maybe with a real flowcell, but only a couple of tiles of data? - def tiles = "s_1_0002" + // Test with a subset of tiles + def tiles = "s_1_121[0-9]" def input_dir = "/net/seq/data2/sequencers/220627_A01698_0053_BH2TCJDSX5" - def txt = """name\tbarcode_index\tlane + def config_txt = """name\tbarcode_index\tlane |N701\tTAAGGCGA\t1 |N702\tCGTACTAG\t1 |N703\tAGGCAGAA\t1 @@ -72,7 +70,7 @@ workflow test { |N711\tAAGAGGCA\t1 |N712\tGTAGAGGA\t1""".stripMargin() - def sample_info = parse_sample_config(txt) + def sample_info = parse_sample_config(config_txt) validate_sample_config(sample_info) @@ -91,14 +89,14 @@ workflow BCL2DEMUX { illumina_dir sample_info tiles - // TODO main: - text = generate_samplesheet( [params.samplesheet_header, sample_info]) - + generate_samplesheet( [params.samplesheet_header, sample_info] ) + .map { it -> [ illumina_dir, it, tiles] } + | bcl2fastq - //emit: - // TODO + emit: + bcl2fastq.out } @@ -116,17 +114,20 @@ process generate_samplesheet { file("Samplesheet.csv") shell: + settings = "" sheet = [ header, "", "[Settings]", - "Lane,SampleID,index", - *sample_info.collect { "${it.lane},${it.name},${it.barcode_index}" } + settings, + "[Data]", + "Lane,SampleID,SampleName,index", + *sample_info.collect { "${it.lane},${it.name},${it.name},${it.barcode_index}" } ].join("\n") ''' - printf '!{sheet}' > Samplesheet.csv + echo '!{sheet}' > Samplesheet.csv ''' } @@ -138,27 +139,28 @@ process generate_samplesheet { process bcl2fastq { container "dceoy/bcl2fastq@sha256:6d7233f2160721d6cb62f77a127d499597f4b35bb435cc8265d05f5bf54c7b94" + cpus {cpus} input: - tuple file(illumina_dir), file(samplesheet), val(tiles) + tuple path(illumina_dir), path(samplesheet), val(tiles) output: - file("output/*") + file("output/*fastq.gz") shell: - ''' - mkdir output + cpus = 10 + ''' + workdir=$PWD + outdir=$workdir/output + mkdir -p "$outdir" + cd "!{illumina_dir}" bcl2fastq \ - --input-dir "!{illumina_dir}/Data/Intensities/BaseCalls" \ - --samplesheet "${samplesheet}" \ - --use-bases-mask "!{bcl_mask}" \ - --output-dir "output/" \ - --barcode-mismatches "!{mismatches}" \ + --input-dir "Data/Intensities/BaseCalls" \ + --sample-sheet "$workdir/!{samplesheet}" \ + --output-dir "$outdir" \ --tiles "!{tiles}" \ - --loading-threads $(( SLURM_CPUS_PER_TASK / 2 )) \ - --writing-threads $(( SLURM_CPUS_PER_TASK / 2 )) \ - --processing-threads $(( SLURM_CPUS_PER_TASK )) - ''' + --barcode-mismatches 1 + ''' } // process bclconvert { diff --git a/modules/cram.nf b/modules/cram.nf index 570e08d2..672da60c 100644 --- a/modules/cram.nf +++ b/modules/cram.nf @@ -44,7 +44,7 @@ process encode_cram { writeindexflag = params.cram_write_index ? 
"--write-index" : "" """ samtools view \ - -T "${reference}" \ + --reference "${reference}" \ -C -o "${output_cram_name}" \ --output-fmt-option "${fmt_options}" \ --threads "${params.cram_compression_threads}" \ @@ -53,6 +53,80 @@ process encode_cram { """ } +process sort_and_encode_cram_no_ref { + + tag "${meta.id}" + + module "samtools/1.12" + container "quay.io/biocontainers/samtools:1.12--h9aed4be_1" + + cpus Math.ceil(params.cram_compression_threads / 2) + + input: + tuple val(meta), path(input_bam) + + output: + tuple val(meta), path(output_cram_name), emit: cram + tuple val(meta), path(output_cram_name), path("${output_cram_name}.crai"), emit: cram_with_index optional true + + script: + output_cram_name = "${input_bam.baseName}.cram" + fmt_options = [ + "no_ref=1", + "version=${params.cram_version}", + "level=${params.cram_compression_level}", + "lossy_names=${params.cram_lossy_names}", + "${params.cram_compress_other_args}", + ].join(",") + writeindexflag = params.cram_write_index ? "--write-index" : "" + """ + samtools sort \ + -o "${output_cram_name}" \ + --output-fmt-option "${fmt_options}" \ + --threads "${params.cram_compression_threads}" \ + ${writeindexflag} \ + "${input_bam}" + """ +} + + +process sort_and_encode_cram { + + tag "${meta.id}" + + module "samtools/1.12" + container "quay.io/biocontainers/samtools:1.12--h9aed4be_1" + + cpus Math.ceil(params.cram_compression_threads / 2) + + input: + tuple val(meta), path(input_bam), path(reference) + + output: + tuple val(meta), path(output_cram_name), emit: cram + tuple val(meta), path(output_cram_name), path("${output_cram_name}.crai"), emit: cram_with_index optional true + + script: + output_cram_name = "${input_bam.baseName}.cram" + fmt_options = [ + "version=${params.cram_version}", + "level=${params.cram_compression_level}", + "lossy_names=${params.cram_lossy_names}", + "${params.cram_compress_other_args}", + ].join(",") + writeindexflag = params.cram_write_index ? "--write-index" : "" + + """ + samtools sort \ + --reference "${reference}" \ + -o "${output_cram_name}" \ + --output-fmt-option "${fmt_options}" \ + --threads "${params.cram_compression_threads}" \ + ${writeindexflag} \ + "${input_bam}" + """ +} + process encode_cram_no_ref { tag "${meta.id}" @@ -63,6 +137,7 @@ process encode_cram_no_ref { input: tuple val(meta), path(input_bam) + tuple val(meta), path(input_bam) output: tuple val(meta), path(output_cram_name), emit: cram diff --git a/modules/utility.nf b/modules/utility.nf index 13e4d995..b2398097 100644 --- a/modules/utility.nf +++ b/modules/utility.nf @@ -1,6 +1,7 @@ /// This file is only for "utility" processes that are extremely generic. 
-params.publishmode = "link" +params.outdir = "output" +params.publishmode = "copy" process publish_and_rename { @@ -12,10 +13,10 @@ process publish_and_rename { output: path(filename) - script: - """ - ln -s "__infile__" "$filename" - """ + shell: + ''' + ln -s $(readlink -f "__infile__") "!{filename}" + ''' } process publish { @@ -30,8 +31,8 @@ process publish { path filename, includeInputs: true script: - """ - """ + """ + """ } process publish_with_meta { diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf new file mode 100644 index 00000000..3390d861 --- /dev/null +++ b/processes/altseq/altseq.nf @@ -0,0 +1,133 @@ +nextflow.enable.dsl=2 + +@Grab('com.xlson.groovycsv:groovycsv:1.3') +import com.xlson.groovycsv.CsvParser + +include { BCL2DEMUX } from "../../modules/bcl2fastq.nf" +include { sort_and_encode_cram } from "../../modules/cram.nf" +include { publish_and_rename; publish } from "../../modules/utility.nf" + +params.sample_config_tsv = "" +params.input_directory = "" + + +// Functions +def parse_sample_config(sample_config_tsv) { + def data = new CsvParser().parseCsv(sample_config_tsv, separator: "\t") + def sample_info = [] + for (sample in data) { + sample_info.add( sample ) + } + return sample_info +} + +def validate_sample_config(sample_info) { + // Check for required info + sample_info.collect { + assert it.lane > 0 : "Sample has no lane: ${it}" + assert it.barcode_index.size() > 0 : "Sample has no barcode index: ${it}" + assert it.name : "Sample has name: ${it}" + } +} + + +// test workflow +workflow test { + + def star_exe = file("${workflow.projectDir}/../../third_party/STAR") + def genome_dir = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/") + def genome_fa = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified") + def barcode_whitelist = file("/net/seq/data2/projects/prime_seq/barcodes-combined.txt") + + def sample_info = parse_sample_config(file(params.sample_config_tsv).text) + validate_sample_config(sample_info) + + BCL2DEMUX( + params.input_directory, + sample_info, + "s_1_1101" + ) + | flatMap { it.sort(); it.collate(2) } + | map { [ + sample_info.find(s -> it[0].baseName.startsWith("${s.name}_S") ), + it[0], + it[1], + ]} + | filter { it[0] != null } + | view { "it is $it" } + | set { fq_files } + + fq_files.flatMap { [ it[1], it[2] ] } | publish + + align( + star_exe, + genome_dir, + barcode_whitelist, + fq_files, + ) + + align.out.aligned_bam + | map { [ + [name: it[0].name, id: it[0].name, barcode_index: it[0].barcode_index, lane: it[0].lane] , + it[1], + genome_fa, + ] } + | sort_and_encode_cram + + sort_and_encode_cram.out.cram + | map { ["${it[0].name}.sorted.cram", it[1]] } + | publish_and_rename + +} + + +// TODO once we figure out how it works +workflow ALTSEQ { + + } + +process align { + + memory "108681M" + cpus 5 + + input: + file star_exe + file genome_dir + file barcode_whitelist + tuple val(meta), file(fq1), file(fq2) + + + output: + tuple val(meta), file("Aligned.out.bam"), emit: aligned_bam + tuple val(meta), file("Solo.out"), emit: solo_directory + + shell: + ''' + tmpdir=$(mktemp -d) + "./!{star_exe}" \ + --genomeDir "!{genome_dir}" \ + --readFilesIn "!{fq2}" "!{fq1}" \ + --soloType CB_UMI_Simple \ + --soloCellReadStats Standard \ + --clip3pAdapterSeq AAAAAAAAAA \ + --clip3pAdapterMMp 0.1 \ + --soloCBstart 1 \ + --soloCBlen 12 \ + --soloUMIstart 13 \ + --soloUMIlen 16 \ + --soloCBwhitelist 
"!{barcode_whitelist}" \ + --soloCellFilter EmptyDrops_CR 96 .99 10 45000 90000 100000 0.01 20000 0.01 10000 \ + --quantMode "TranscriptomeSAM" \ + --soloFeatures Gene GeneFull GeneFull_ExonOverIntron GeneFull_Ex50pAS \ + --soloMultiMappers Unique PropUnique Uniform Rescue EM \ + --readFilesCommand zcat \ + --runThreadN 5 \ + --outSAMtype BAM Unsorted \ + --outSAMattributes NH HI AS NM MD CR CY UR UY GX GN \ + --outSAMunmapped Within \ + --limitOutSJcollapsed 5000000 \ + --outTmpDir "$tmpdir/STARSolo" + ''' +} + diff --git a/third_party/fetch.sh b/third_party/fetch.sh new file mode 100755 index 00000000..a78ad65d --- /dev/null +++ b/third_party/fetch.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Fetch STAR +wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220601/STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip +unzip STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip +rm STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip From 3858ef6df6ca1ff90edd23a3e6762b4976caaab5 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 25 Jul 2022 16:45:57 -0700 Subject: [PATCH 008/172] Refine altseq.nf and add process_altseq.bash --- modules/bcl2fastq.nf | 26 +- processes/altseq/altseq.nf | 182 ++++++++---- processes/altseq/process_altseq.bash | 87 ++++++ scripts/altseq/upload_fastq.py | 406 +++++++++++++++++++++++++++ 4 files changed, 646 insertions(+), 55 deletions(-) create mode 100644 processes/altseq/process_altseq.bash create mode 100644 scripts/altseq/upload_fastq.py diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf index 6d13ed3c..4e6537bd 100644 --- a/modules/bcl2fastq.nf +++ b/modules/bcl2fastq.nf @@ -9,7 +9,7 @@ params.bcl2fastq_threads = 20 params.bcl2fastq_sample_config_tsv = "" params.bases_mask = "" -params.bcl2fastq_tiles = "*" +params.bcl2fastq_tiles = "s_*" params.samplesheet_header = """[Header] @@ -47,7 +47,7 @@ workflow { BCL2DEMUX( params.input_directory, Channel.from(sample_info), - "*", + params.bcl2fastq_tiles, ) } @@ -91,9 +91,20 @@ workflow BCL2DEMUX { tiles main: - generate_samplesheet( [params.samplesheet_header, sample_info] ) - .map { it -> [ illumina_dir, it, tiles] } - | bcl2fastq + + // Group samples by lanes of processing + // This will let us run 1 bcl2fastq job for each lane + Channel.fromList(sample_info) + | map { [it.lane, it] } + | groupTuple(sort: 'hash') + | map { it[1] } + .set { sample_info_by_lane } + + sample_info_by_lane + | map { [params.samplesheet_header, it] } + | generate_samplesheet + | map { it -> [ illumina_dir, it, tiles] } + | bcl2fastq emit: bcl2fastq.out @@ -115,7 +126,7 @@ process generate_samplesheet { shell: settings = "" - sheet = [ + sheet_parts = [ header, "", "[Settings]", @@ -123,7 +134,8 @@ process generate_samplesheet { "[Data]", "Lane,SampleID,SampleName,index", *sample_info.collect { "${it.lane},${it.name},${it.name},${it.barcode_index}" } - ].join("\n") + ] + sheet = sheet_parts.join("\n") ''' diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 3390d861..cc214a76 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -9,6 +9,7 @@ include { publish_and_rename; publish } from "../../modules/utility.nf" params.sample_config_tsv = "" params.input_directory = "" +params.star_exe = "${workflow.projectDir}/../../third_party/STAR" // Functions @@ -31,8 +32,110 @@ def validate_sample_config(sample_info) { } +workflow ALTSEQ { + + // TODO: This list can probably be refined or repackaged! 
+ take: + genome_dir + genome_fa + barcode_whitelist + tiles + input_dir + sample_info + + main: + + // Run BCL2Fastq on all files + BCL2DEMUX( + input_dir, + sample_info, + tiles, + ) + + BCL2DEMUX.out + | flatMap { it.sort(); it.collate(2) } + | map { [ + sample_info.find(s -> it[0].baseName.startsWith("${s.name}_S") ), + it[0], + it[1], + ]} + | filter { it[0] != null } + | set { fq_files } + + // Merge and publish fastq files + bcl_fq_regex = /(.*)_S[0-9]+_(L[0-9]+)_(R[1-2])_.*/ + BCL2DEMUX.out + | flatten() + | map { + match = (it.baseName =~ bcl_fq_regex)[0]; + [ match[1,3], [match[2], it]] + } + | filter { it[0][0] != "Undetermined" } + | groupTuple + // regroup + | map {[ + "${it[0][0]}_${it[0][1]}", + it[1].sort { a, b -> a[0] <=> b[0] } .collect{ x -> x[1] } + ]} + | view { "sorted $it" } + | merge_fq + | publish + + merge_fq.out + // TODO: Use groupTuple with size:2? + // Would allow alignments to start sooner. + | toSortedList() + | flatMap { it.sort { a, b -> a.baseName <=> b.baseName } ; it.collate(2) } + | map { [ + sample_info.find(s -> it[0].baseName.startsWith("${s.name}_R") ), + it[0], + it[1], + ]} + | filter { it[0] != null } + | set { merged_fq_files } + + // Invoke STAR Solo + align( + genome_dir, + params.star_exe, + barcode_whitelist, + merged_fq_files, + ) + + // Sort the cram files + align.out.aligned_bam + | map { [ + [ + name: it[0].name, + id: it[0].name, + barcode_index: it[0].barcode_index, + lane: it[0].lane + ], + it[1], + genome_fa, + ] } + | sort_and_encode_cram + + // Publish CRAM files. + sort_and_encode_cram.out.cram + | map { ["${it[0].name}.sorted.cram", it[1]] } + | publish_and_rename +} + +workflow { + + def genome_dir = file(params.genome_dir) + def genome_fa = file(params.genome_fa) + def barcode_whitelist = file(params.barcode_whitelist) + + def sample_info = parse_sample_config(file(params.sample_config_tsv).text) + validate_sample_config(sample_info) + + ALTSEQ(genome_dir, genome_fa, barcode_whitelist, "s_*", params.input_directory, sample_info) +} + // test workflow -workflow test { + workflow test { def star_exe = file("${workflow.projectDir}/../../third_party/STAR") def genome_dir = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/") @@ -41,61 +144,21 @@ workflow test { def sample_info = parse_sample_config(file(params.sample_config_tsv).text) validate_sample_config(sample_info) - - BCL2DEMUX( - params.input_directory, - sample_info, - "s_1_1101" - ) - | flatMap { it.sort(); it.collate(2) } - | map { [ - sample_info.find(s -> it[0].baseName.startsWith("${s.name}_S") ), - it[0], - it[1], - ]} - | filter { it[0] != null } - | view { "it is $it" } - | set { fq_files } - - fq_files.flatMap { [ it[1], it[2] ] } | publish - - align( - star_exe, - genome_dir, - barcode_whitelist, - fq_files, - ) - - align.out.aligned_bam - | map { [ - [name: it[0].name, id: it[0].name, barcode_index: it[0].barcode_index, lane: it[0].lane] , - it[1], - genome_fa, - ] } - | sort_and_encode_cram - - sort_and_encode_cram.out.cram - | map { ["${it[0].name}.sorted.cram", it[1]] } - | publish_and_rename + ALTSEQ(genome_dir, genome_fa, barcode_whitelist, "s_[1-4]_1234", params.input_directory, sample_info) } -// TODO once we figure out how it works -workflow ALTSEQ { - - } - process align { memory "108681M" - cpus 5 + cpus cpus input: - file star_exe - file genome_dir - file barcode_whitelist - tuple val(meta), file(fq1), file(fq2) + path genome_dir + path star_exe + path barcode_whitelist + tuple val(meta), path(fq1), path(fq2) 
output: @@ -103,6 +166,7 @@ process align { tuple val(meta), file("Solo.out"), emit: solo_directory shell: + cpus = 5 ''' tmpdir=$(mktemp -d) "./!{star_exe}" \ @@ -122,7 +186,7 @@ process align { --soloFeatures Gene GeneFull GeneFull_ExonOverIntron GeneFull_Ex50pAS \ --soloMultiMappers Unique PropUnique Uniform Rescue EM \ --readFilesCommand zcat \ - --runThreadN 5 \ + --runThreadN "!{cpus}" \ --outSAMtype BAM Unsorted \ --outSAMattributes NH HI AS NM MD CR CY UR UY GX GN \ --outSAMunmapped Within \ @@ -131,3 +195,25 @@ process align { ''' } + +process merge_fq { + + cpus {cpus} + container null + module "htslib/1.12" + + input: + tuple val(name), path("in.*.fq.gz") + + output: + file out + + shell: + cpus = 10 + out = "${name}.fq.gz" + ''' + zcat in.*.fq.gz \ + | bgzip --stdout --threads "!{cpus}" \ + > "!{out}" + ''' +} diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash new file mode 100644 index 00000000..cd0b353c --- /dev/null +++ b/processes/altseq/process_altseq.bash @@ -0,0 +1,87 @@ +#!/bin/bash +# This script is copied by setup.sh to /net/seq/data2/flowcells/the_flowcell/ + +for var in FLOWCELL SEQUENCER_MOUNT ; do + if [[ -z "$var" ]] ; then + echo "Set env var $var" + exit 2 + fi +done + +set -eo pipefail + +# TODO: Bump this before running for real. +version=0.9.1 + +#cd "$(dirname "$0")" + +outdir="output_$version" +sentinel_file="$outdir/process_complete.txt" + +if [[ -e "$sentinel_file" && -z "$REDO_ALIGNMENT" ]] ; then + echo "Processing already completed, exiting." + echo "To force re-run, set the env var 'REDO_ALIGNMENT=True' or remove $sentinel_file" + exit 0 +fi + +# Dependencies +source "$MODULELOAD" +module purge +module load jdk +module load nextflow +module load python/3.5.1 + +source "$PYTHON3_ACTIVATE" +source "$STAMPIPES/scripts/sentry/sentry-lib.bash" + +# TODO: REDO_ALIGNMENT handling + + +# TODO: +# Prepare for processing +# Needs the flowcell to have a date set! Probably do this manually at the moment. +# lims_put_by_url https://lims-staging.altius.org/api/flowcell_run/2518/prepare_for_processing/ + + +# Set up sample config +sample_config=sample_config.tsv +python "$STAMPIPES"/scripts/lims/get_processing.py -f "$FLOWCELL" +( +echo -e "lane\tname\tbarcode_index" +cat processing.json \ + | jq '.libraries[] | [(.lane | tostring), .barcode1.label_id, .barcode1.reverse_sequence] | join("\t")' -r \ + | sort -u \ +) > "${sample_config}" + + +SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) + +GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ +GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified +BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt + +# Run the pipeline +NXF_VER=21.10.6 nextflow \ + -c ./nextflow.config \ + run "$STAMPIPES"/processes/altseq/altseq.nf \ + -with-trace \ + -with-docker ubuntu \ + -resume \ + --input_directory "$SEQ_DIR" \ + --sample_config_tsv "$sample_config" \ + --genome_dir "$GENOME_DIR" \ + --genome_fa "$GENOME_FA" \ + --barcode_whitelist "$BARCODE_WHITELIST" \ + --outdir "$outdir" \ + -ansi-log false + + +# Upload fastq metadata +python "$STAMPIPES/scripts/altseq/upload_fastq.py" \ + "$sample_config" \ + processing.json \ + --output_file_directory "$outdir" + +if [[ ! 
-e "$sentinel_file" ]] ; then + echo "{ completed_on: $(date -Iseconds) }" > "$sentinel_file" +fi diff --git a/scripts/altseq/upload_fastq.py b/scripts/altseq/upload_fastq.py new file mode 100644 index 00000000..877c4b1b --- /dev/null +++ b/scripts/altseq/upload_fastq.py @@ -0,0 +1,406 @@ +#pylint disable=invalid-whitespace, invalid-name + + +import re +import csv +import argparse +import datetime +import hashlib +import json +import logging +import os +import sys +import time +from collections import defaultdict + +sys.path.insert( + 1, os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", + "lims", + "stamlims_api" +)) + +from stamlims_api.lims import aggregations, content_types +from stamlims_api import rest + +lane_tags = None +flowcell_lane_cache = dict() +flowcell_contenttype = None + +log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +log = logging.getLogger('upload_data.py') + +script_options = { + "base_api_url": None, + "basedir": os.getcwd(), + "quiet": False, + "debug": False, + +} + +def parser_setup(): + + parser = argparse.ArgumentParser() + + parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", + help="Don't print info messages to standard out.") + parser.add_argument("-d", "--debug", dest="debug", action="store_true", + help="Print all debug messages to standard out.") + + parser.add_argument("-a", "--api", dest="base_api_url", + help="The base API url, if not the default live LIMS.") + parser.add_argument("-t", "--token", dest="token", + help="Your authentication token.") + + parser.add_argument("sample_config", + help="The sample_config.tsv file") + parser.add_argument("processing_json", + help="The processing.json file") + parser.add_argument("--output_file_directory", default=".") + + + parser.add_argument("--skip_md5_check", dest="skip_md5_check", action="store_true", + help="If file exists and path/size match, don't check md5sum.") + + parser.set_defaults( **script_options ) + parser.set_defaults( quiet=False, debug=False ) + + return parser + + +def md5sum_file(path): + md5sum = hashlib.md5() + + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1024*1024), b''): + md5sum.update(chunk) + + return md5sum.hexdigest() + +def url_join(*args): + url = "/".join([ x.rstrip('/') for x in args ]) + return url + +class UploadLIMS(object): + + def __init__(self, api_url, token): + self.count_types = {} + self.flowcelllane_contenttype = None + self.alignment_contenttype = None + self.aggregation_contenttype = None + self.flowcell_lane_cache = {} + self.api = rest.setup_api({rest.LIMS_URL_OPT_VAR: api_url, + rest.LIMS_TOKEN_OPT_VAR: token, + rest.RAISE_ON_ERROR_VAR: True}) + self.get_cache = {} + + def get(self, url): + if url not in self.get_cache: + self.get_cache[url] = self.api.get_single_result(url) + return self.get_cache[url] + + def get_by_full_url(self, url): + if url not in self.get_cache: + self.get_cache[url] = self.api.get_single_result(url=url) + return self.get_cache[url] + + def get_by_id(self, base_url, id, message=None): + url = "%s/%d/" % (base_url, id) + result = self.get(url) + if not result: + if message is None: + message = "Failed to fetch %s" % url + log.critical(message) + return result + + def get_single_result(self, fetch_url, query=None, field=None): + """ + Using a list API url that should bring up a single item, retrieve that single item if it exists. 
+ """ + result = self.api.get_single_list_result(url_addition=fetch_url, query_arguments=query) + if result is None: + return None + if field is not None: + return result[field] + return result + + def get_list_result(self, url, query=None): + return self.api.get_list_result( + url_addition=url, + query_arguments=query, + item_limit=1000000, + page_size=1000, + ) + + def put(self, *args, **kwargs): + # TODO: s/patch/put/ + return self.api.patch_single_result(*args, **kwargs) + + def post(self, *args, **kwargs): + return self.api.post_single_result(*args, **kwargs) + + def patch(self, *args, **kwargs): + return self.api.patch_single_result(*args, **kwargs) + + def get_flowcell_url_by_label(self, label): + return self.get_single_result('flowcell_run/', + field = 'url', + query={"label":label}) + + def get_contenttype(self, contenttype_name): + """ + Appname uses capitalization, modelname does not. + """ + + (appname, modelname) = contenttype_name.split(".") + + query = { + 'app_label': appname, + 'model': modelname, + } + ct = self.get_single_result('content_type/', query=query) + if not ct: + log.critical("Could not fetch content type %s" % contenttype_name) + + return ct + + def get_file_purpose_url(self, slug): + return self.get_single_result('file_purpose/', + query={"slug": slug}, + field="url") + + def get_file_type(self, slug): + return self.get_single_result('file_type/', + field="url", + query={"slug":slug}) + + + def upload_directory_attachment(self, path, contenttype_name, object_id, file_purpose=None): + path = os.path.abspath(path) + + if not (contenttype_name and object_id): + log.error("Cannot attach file %s without both content type and object_id" % path) + return False + + contenttype = self.get_contenttype(contenttype_name) + + if not contenttype: + log.error("Cannot attach file %s without contenttype result" % path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + + if file_purpose and not purpose: + log.error("Could not find file purpose %s for uploading directory %s" % (file_purpose, path)) + return False + elif purpose: + log.debug("File purpose: %s" % purpose) + + exists = self.get_single_result('directory/', query={"path":path}) + + if exists: + data = exists + else: + data = {} + + data.update({ + 'path': path, + 'content_type': contenttype['url'], + 'object_id': object_id, + 'purpose': purpose + }) + + if exists: + log.info("Updating information for directory %s" % path) + result = self.put(url=data['url'], data=data) + else: + log.info("Uploading information for directory %s" % path) + result = self.post("directory/", data=data) + + if not result: + log.error("Could not upload directory %s" % path) + log.debug(data) + else: + log.debug(result) + + return True + + def upload_file(self, path, contenttype_name, object_ids, file_purpose=None, file_type=None, skip_md5_check=False): + log.info("Gathering data...") + upload_data = self.get_file_upload_data(path, contenttype_name, file_purpose, file_type, skip_md5_check) + log.info("Running md5sum...") + upload_data['md5sum'] = md5sum_file(path) + + content_type_id = re.search("(\d+)/?$", upload_data['content_type']).group(1) + purpose_id = re.search("(\d+)/?$", upload_data['purpose']).group(1) + for object_id in object_ids: + data = {"object_id": object_id, **upload_data} + exists = self.get_single_result("file/", + query={"object_id": object_id, + "purpose": purpose_id, + "content_type": content_type_id}) + + if exists: + log.info("Updating information for file %s: lane %d" % (path, object_id)) + 
result = self.put(url=exists['url'], data=data) + else: + log.info("Uploading information for file %s: lane %d" % (path, object_id)) + result = self.post("file/", data=data) + + if not result: + log.error("Could not upload file %s for ID %d" % (path, object_id)) + log.debug(data) + else: + log.debug(result) + + + + def get_file_upload_data(self, path, contenttype_name, file_purpose=None, file_type=None, skip_md5_check=False): + path = os.path.abspath(path) + + + contenttype = self.get_contenttype(contenttype_name) + + if not contenttype: + log.error("Cannot attach file %s without contenttype result" % path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + + if file_purpose and not purpose: + log.error("Could not find file purpose %s for uploading file %s" % (file_purpose, path)) + return False + elif purpose: + log.debug("File Purpose: %s" % purpose) + + ftype = self.get_file_type(file_type) + + if file_type and not ftype: + log.error("Could not find file type %s for uploading file %s" % (file_type, path)) + return False + elif purpose: + log.debug("File Type: %s" % ftype) + + + file_size = os.path.getsize(path) + last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + + #if exists: + #recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) + + # TODO: Make time-checking work! + # Current issue: sub-second precision. + data = { + 'path': path, + 'content_type': contenttype["url"], + 'purpose': purpose, + 'filetype': ftype, + 'file_last_modified': last_modified, + 'size_bytes': file_size, + } + + log.debug(data) + return data + + + def get_flowcelllane_contenttype(self): + if not self.flowcelllane_contenttype: + self.flowcelllane_contenttype = self.get_contenttype('SequencingData.flowcelllane') + return self.flowcelllane_contenttype + + def get_flowcell_lane(self, flowcell_lane_id): + return self.get_by_id('flowcell_lane', flowcell_lane_id) + + def get_library(self, library_id): + return self.get_by_id('library', library_id) + + + def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): + # (Filepath, purpose) -> [lane_ids] + files_to_upload = defaultdict(list) + for row in sample_config: + idx = row['barcode_index'] + lane = int(row['lane']) + name = row['name'] + # Get lane IDs for each file + lane_ids = [ + l['id'] + for l in processing_dict['libraries'] + if l['barcode1']['reverse_sequence'] == idx and int(l['lane']) == lane + ] + r1_file = os.path.join(outdir, "%s_R1.fq.gz" % name) + r2_file = os.path.join(outdir, "%s_R2.fq.gz" % name) + if not os.path.exists(r1_file): + raise Exception("No file %s" % r1_file) + if not os.path.exists(r2_file): + raise Exception("No file %s" % r2_file) + + files_to_upload[(r1_file, "r1-fastq")].extend(lane_ids) + files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) + + for ((path, purpose), lane_ids) in files_to_upload.items(): + print(path, purpose, len(lane_ids)) + + self.upload_file(path, + "SequencingData.flowcelllane", + lane_ids, + file_purpose=purpose, + file_type="fastq", + skip_md5_check=False) + + + + + +def main(args = sys.argv): + """This is the main body of the program that by default uses the arguments +from the command line.""" + + parser = parser_setup() + poptions = parser.parse_args() + + if poptions.quiet: + logging.basicConfig(level=logging.WARNING, format=log_format) + elif poptions.debug: + logging.basicConfig(level=logging.DEBUG, format=log_format) + else: + # Set up the default logging levels + 
logging.basicConfig(level=logging.INFO, format=log_format) + # Make this a little less noisy by default + requests_log = logging.getLogger("requests.packages.urllib3.connectionpool") + requests_log.setLevel(logging.WARN) + + if not poptions.base_api_url and "LIMS_API_URL" in os.environ: + api_url = os.environ["LIMS_API_URL"] + log.debug("Using LIMS API endpoint: %s from environment" % api_url) + elif poptions.base_api_url: + api_url = poptions.base_api_url + log.debug("Using LIMS API endpoint: %s from options" % api_url) + else: + sys.stderr.write("Could not find LIMS API URL.\n") + sys.exit(1) + + + if not poptions.token and "LIMS_API_TOKEN" in os.environ: + token = os.environ["LIMS_API_TOKEN"] + elif poptions.token: + token = poptions.token + else: + sys.stderr.write("Could not find LIMS API TOKEN.\n") + sys.exit(1) + + uploader = UploadLIMS(api_url, token) + + with open(poptions.sample_config) as f: + sample_config = [row for row in csv.DictReader(f, delimiter="\t")] + with open(poptions.processing_json) as f: + processing = json.loads(f.read()) + uploader.upload_altseq_flowcell(sample_config, processing, poptions.output_file_directory) + + +# This is the main body of the program that only runs when running this script +# doesn't run when imported, so you can use the functions above in the shell after importing +# without automatically running it +if __name__ == "__main__": + main() From b636f5511d2fdd4c32b59f359e3a421565644156 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 26 Jul 2022 08:58:35 -0700 Subject: [PATCH 009/172] Connect altseq with LIMS --- modules/bcl2fastq.nf | 2 +- processes/altseq/altseq.nf | 2 +- processes/altseq/process_altseq.bash | 21 +--- scripts/flowcells/setup.sh | 26 ++++ scripts/lims/create_altseq_sample_config.py | 130 ++++++++++++++++++++ 5 files changed, 163 insertions(+), 18 deletions(-) create mode 100644 scripts/lims/create_altseq_sample_config.py diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf index 4e6537bd..f5ec7ad9 100644 --- a/modules/bcl2fastq.nf +++ b/modules/bcl2fastq.nf @@ -98,7 +98,7 @@ workflow BCL2DEMUX { | map { [it.lane, it] } | groupTuple(sort: 'hash') | map { it[1] } - .set { sample_info_by_lane } + | set { sample_info_by_lane } sample_info_by_lane | map { [params.samplesheet_header, it] } diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index cc214a76..bf902a77 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -152,7 +152,7 @@ workflow { process align { memory "108681M" - cpus cpus + cpus {cpus} input: path genome_dir diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index cd0b353c..9620faba 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -13,7 +13,7 @@ set -eo pipefail # TODO: Bump this before running for real. version=0.9.1 -#cd "$(dirname "$0")" +cd "$(dirname "$0")" outdir="output_$version" sentinel_file="$outdir/process_complete.txt" @@ -36,22 +36,11 @@ source "$STAMPIPES/scripts/sentry/sentry-lib.bash" # TODO: REDO_ALIGNMENT handling - -# TODO: -# Prepare for processing -# Needs the flowcell to have a date set! Probably do this manually at the moment. 
-# lims_put_by_url https://lims-staging.altius.org/api/flowcell_run/2518/prepare_for_processing/ - - # Set up sample config sample_config=sample_config.tsv python "$STAMPIPES"/scripts/lims/get_processing.py -f "$FLOWCELL" -( -echo -e "lane\tname\tbarcode_index" -cat processing.json \ - | jq '.libraries[] | [(.lane | tostring), .barcode1.label_id, .barcode1.reverse_sequence] | join("\t")' -r \ - | sort -u \ -) > "${sample_config}" +python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json --output "$sample_config" + SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) @@ -62,10 +51,10 @@ BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt # Run the pipeline NXF_VER=21.10.6 nextflow \ - -c ./nextflow.config \ + -c $STAMPIPES/nextflow.config \ run "$STAMPIPES"/processes/altseq/altseq.nf \ -with-trace \ - -with-docker ubuntu \ + -profile docker \ -resume \ --input_directory "$SEQ_DIR" \ --sample_config_tsv "$sample_config" \ diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index ef8b8012..61e15305 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -191,6 +191,30 @@ if [[ -z "$nosleep" ]] ; then sleep 300 fi +# Check if read1length=0 -> that means alteseq +# Handle specially +flowcell_data=$(lims_get_all "flowcell_run/?label=$flowcell") +read1length=$(echo $flowcell_data | jq -r .read1_length | head -n1) +if [[ "$read1length" = "0" ]] ; then + echo "Alt-seq run detected" + date=$(echo $flowcell_data | jq -r .date_loaded | sed 's/-//g;s/^20//') + # analysis_dir not set yet, no alignment group + analysis_dir=$FLOWCELLS/FC${flowcell}_${date}_tag + mkdir -p "$analysis_dir" + runscript="$analysis_dir/run.bash" + ( + echo "#!/bin/bash" + echo "export FLOWCELL=$flowcell" + echo "export STAMPIPES=$STAMPIPES" + # TODO: Remove once this data is on staging! + echo "export LIMS_API_URL=https://lims-staging.altius.org/api" + cat "$STAMPIPES"/processes/altseq/process_altseq.bash + ) > "$runscript" + echo "Run $runscript to start analysis!" 
+ + exit 0 +fi + # Get and read the processing script python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" run_type=$( jq -r '.flowcell.run_type' "$json" ) @@ -199,6 +223,8 @@ mask=$( jq -r '.alignment_group.bases_mask' "$json" ) run_type=$( jq -r '.flowcell.run_type' "$json" ) has_umi=$( jq -r '.libraries | map(.barcode1.umi) | any' "$json") + + if [ -z "$demux" ] ; then bcl_mask=$mask mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes) diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py new file mode 100644 index 00000000..241dae4b --- /dev/null +++ b/scripts/lims/create_altseq_sample_config.py @@ -0,0 +1,130 @@ +import json +import os +import sys +import argparse +import logging +import re + +from collections import defaultdict + +sys.path.insert( + 1, os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", + "lims", + "stamlims_api" +)) + +from stamlims_api import rest + +log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +log = logging.getLogger(__name__) + +rest.DEFAULT_ITEM_LIMIT = 10000 +API = rest.setup_api({rest.RAISE_ON_ERROR_VAR: True}) + + +lib_to_pool = defaultdict(list) + +def parse_json(filename): + with open(filename) as f: + return json.loads(f.read()) + + +def parser_setup(): + parser = argparse.ArgumentParser() + parser.add_argument("data", type=parse_json) + parser.add_argument("--output", default="sample_config.tsv") + return parser + + +def group_data(processing_info): + """ group_data tries to estimate what library pools each library belongs to """ + output = defaultdict(list) + for lib in processing_info['libraries']: + key = (lib['barcode1']['reverse_sequence'], lib['lane']) + output[key].append(lib['library']) + + return output + +def populate_lib_to_pool(): + global lib_to_pool + url_regex = re.compile("(\d+)") + for pool in API.get_list_result( + url="https://lims-staging.altius.org/api/library_pool/", + item_limit=10000): + name = pool['object_name'] + for lib_url in pool['libraries']: + match = url_regex.search(lib_url) + if not match: + raise Exception("lib url %s didn't match" % lib_url) + lib_id = int(match.group(1)) + lib_to_pool[lib_id].append(name) + + + +# Ugly hack lol +def get_pool_for_libs(lib_numbers): + global lib_to_pool + numbers = ",".join(str(n) for n in lib_numbers) + pool = None + url = "https://lims-staging.altius.org/api/library/" + for lib in API.get_list_result( + url=url, + item_limit=200, + query_arguments={"number__in": numbers} + ): + id = int(lib['id']) + assert len(lib_to_pool[id]) == 1 + if pool is None: + pool = lib_to_pool[id][0] + else: + assert lib_to_pool[id][0] == pool + + return pool + + +def to_tsv(label, data): + lines = ["name\tlane\tbarcode_index"] + for (pool, index, lane, _numbers) in sorted(data, key=lambda x: (x[2],x[0])): + lines.append( + "\t".join([label + "_" + pool, str(lane), index]) + ) + return "\n".join(lines) + + +# def create_upload_script(label, data): +# +# lines = ["#!/bin/bash"] +# base = 'python3 "$STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane " +# for (prefix, lane_ids) in data: +# for num in numbers: +# r1 = +# lines.append( +# base + " --attach_file_objectid %d --attach_file %s --attach-file-purpose r1-fastq --attach-file-type fastq" % (num + + +def main(): + poptions = parser_setup().parse_args() + label = poptions.data["flowcell"]["label"] + grouped = group_data(poptions.data) + + populate_lib_to_pool() + + 
output_data = [] + for group_key, numbers in grouped.items(): + pool = get_pool_for_libs(numbers) + output_data.append( (pool, group_key[0], group_key[1], numbers) ) + + + tsv = to_tsv(label, output_data) + with open(poptions.output, 'w') as f: + f.write(tsv) + + # upload_data = [] + # for library in output_data["libraries"]: + # upload_data.append(library['id'], library['number']) + # upload_script = create_upload_script(label, output_data) + +if __name__ == "__main__": + main() From e1c3ae7d2deb1c119ace83f8ecebbe5ec2fbaeb8 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 2 Aug 2022 16:00:03 -0700 Subject: [PATCH 010/172] altseq script optimizations - better caching --- processes/altseq/altseq.nf | 31 ++++++++++++---------------- processes/altseq/process_altseq.bash | 5 +++-- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index bf902a77..2e075bb1 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -17,7 +17,9 @@ def parse_sample_config(sample_config_tsv) { def data = new CsvParser().parseCsv(sample_config_tsv, separator: "\t") def sample_info = [] for (sample in data) { - sample_info.add( sample ) + sample_info.add( + sample.columns.collectEntries { c, v -> [c, sample[v]] } + ) } return sample_info } @@ -27,7 +29,7 @@ def validate_sample_config(sample_info) { sample_info.collect { assert it.lane > 0 : "Sample has no lane: ${it}" assert it.barcode_index.size() > 0 : "Sample has no barcode index: ${it}" - assert it.name : "Sample has name: ${it}" + assert it.name : "Sample has no name: ${it}" } } @@ -52,16 +54,6 @@ workflow ALTSEQ { tiles, ) - BCL2DEMUX.out - | flatMap { it.sort(); it.collate(2) } - | map { [ - sample_info.find(s -> it[0].baseName.startsWith("${s.name}_S") ), - it[0], - it[1], - ]} - | filter { it[0] != null } - | set { fq_files } - // Merge and publish fastq files bcl_fq_regex = /(.*)_S[0-9]+_(L[0-9]+)_(R[1-2])_.*/ BCL2DEMUX.out @@ -77,20 +69,21 @@ workflow ALTSEQ { "${it[0][0]}_${it[0][1]}", it[1].sort { a, b -> a[0] <=> b[0] } .collect{ x -> x[1] } ]} - | view { "sorted $it" } | merge_fq | publish merge_fq.out - // TODO: Use groupTuple with size:2? - // Would allow alignments to start sooner. 
- | toSortedList() - | flatMap { it.sort { a, b -> a.baseName <=> b.baseName } ; it.collate(2) } - | map { [ + // Use groupTuple to group files in R1, R2 pairs + | map { [ it.baseName.replaceAll(/_R[12]/, "_RX"), it ] } + | groupTuple(size: 2, sort: { a, b -> { a.baseName <=> b.baseName } } ) + | map { it[1] } + // Re-associate the metadata + | map {[ sample_info.find(s -> it[0].baseName.startsWith("${s.name}_R") ), it[0], it[1], ]} + // Exclude Undetermined files | filter { it[0] != null } | set { merged_fq_files } @@ -145,6 +138,7 @@ workflow { def sample_info = parse_sample_config(file(params.sample_config_tsv).text) validate_sample_config(sample_info) + ALTSEQ(genome_dir, genome_fa, barcode_whitelist, "s_[1-4]_1234", params.input_directory, sample_info) } @@ -153,6 +147,7 @@ process align { memory "108681M" cpus {cpus} + scratch false // Was filling up tmp dirs input: path genome_dir diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 9620faba..d0300960 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -38,7 +38,8 @@ source "$STAMPIPES/scripts/sentry/sentry-lib.bash" # Set up sample config sample_config=sample_config.tsv -python "$STAMPIPES"/scripts/lims/get_processing.py -f "$FLOWCELL" +# TODO: Re-enable after production time-out is fixed +# python "$STAMPIPES"/scripts/lims/get_processing.py -f "$FLOWCELL" python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json --output "$sample_config" @@ -54,7 +55,7 @@ NXF_VER=21.10.6 nextflow \ -c $STAMPIPES/nextflow.config \ run "$STAMPIPES"/processes/altseq/altseq.nf \ -with-trace \ - -profile docker \ + -profile docker,cluster \ -resume \ --input_directory "$SEQ_DIR" \ --sample_config_tsv "$sample_config" \ From 27d61d69f5907710a83ee5bf226c77fd196db681 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 2 Aug 2022 16:00:57 -0700 Subject: [PATCH 011/172] Altseq - version 1.0.0 --- processes/altseq/process_altseq.bash | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index d0300960..4de585a9 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,8 +10,7 @@ done set -eo pipefail -# TODO: Bump this before running for real. 
-version=0.9.1 +version=1.0.0 cd "$(dirname "$0")" From 2a3b1ec703c3070042d69d7653c87860a440b8a2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 14 Aug 2022 10:39:42 -0700 Subject: [PATCH 012/172] Don't use scratch space for bcl2fastq & merge_fq --- modules/bcl2fastq.nf | 1 + processes/altseq/altseq.nf | 1 + 2 files changed, 2 insertions(+) diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf index f5ec7ad9..391620d2 100644 --- a/modules/bcl2fastq.nf +++ b/modules/bcl2fastq.nf @@ -152,6 +152,7 @@ process bcl2fastq { container "dceoy/bcl2fastq@sha256:6d7233f2160721d6cb62f77a127d499597f4b35bb435cc8265d05f5bf54c7b94" cpus {cpus} + scratch false input: tuple path(illumina_dir), path(samplesheet), val(tiles) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 2e075bb1..6053a589 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -196,6 +196,7 @@ process merge_fq { cpus {cpus} container null module "htslib/1.12" + scratch false input: tuple val(name), path("in.*.fq.gz") From 24c9b4a86f5f2d1f144f07db49c90f2e23eee640 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 14 Aug 2022 10:41:29 -0700 Subject: [PATCH 013/172] Use production LIMS instead of staging --- scripts/flowcells/setup.sh | 2 +- scripts/lims/create_altseq_sample_config.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 61e15305..4e0441ec 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -207,7 +207,7 @@ if [[ "$read1length" = "0" ]] ; then echo "export FLOWCELL=$flowcell" echo "export STAMPIPES=$STAMPIPES" # TODO: Remove once this data is on staging! - echo "export LIMS_API_URL=https://lims-staging.altius.org/api" + echo "export LIMS_API_URL=https://lims.altius.org/api" cat "$STAMPIPES"/processes/altseq/process_altseq.bash ) > "$runscript" echo "Run $runscript to start analysis!" 
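
For illustration, a minimal sketch of the pattern the next diff moves to: relative
`url_addition` paths are resolved against whatever base URL the API client was
configured with, assuming (as the surrounding scripts suggest) that
`rest.setup_api` falls back to the `LIMS_API_URL` environment variable when no
explicit URL option is passed:

    from stamlims_api import rest

    # Base URL comes from the environment, e.g. the LIMS_API_URL export that
    # setup.sh writes into run.bash (an assumption; that fallback is not shown here).
    api = rest.setup_api({rest.RAISE_ON_ERROR_VAR: True})

    # Relative path, resolved against the configured base URL, so the same code
    # can talk to staging or production without edits.
    pools = api.get_list_result(url_addition="library_pool/", item_limit=10000)
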
diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index 241dae4b..ae87bd46 100644 --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -51,7 +51,7 @@ def populate_lib_to_pool(): global lib_to_pool url_regex = re.compile("(\d+)") for pool in API.get_list_result( - url="https://lims-staging.altius.org/api/library_pool/", + url_addition="library_pool/", item_limit=10000): name = pool['object_name'] for lib_url in pool['libraries']: @@ -68,9 +68,9 @@ def get_pool_for_libs(lib_numbers): global lib_to_pool numbers = ",".join(str(n) for n in lib_numbers) pool = None - url = "https://lims-staging.altius.org/api/library/" + url_addition = "library/" for lib in API.get_list_result( - url=url, + url_addition=url_addition, item_limit=200, query_arguments={"number__in": numbers} ): From 212a21ba0df99a9fa6867225c7d6ed366c735313 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 18 Aug 2022 15:09:07 -0700 Subject: [PATCH 014/172] Altseq - skip running alignment for now We will re-enable this once we get the fastq deadline hit --- processes/altseq/altseq.nf | 57 ++++++++++--------- processes/altseq/process_altseq.bash | 41 +++++++++---- .../{upload_fastq.py => upload_data.py} | 20 ++++--- 3 files changed, 72 insertions(+), 46 deletions(-) rename scripts/altseq/{upload_fastq.py => upload_data.py} (96%) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 6053a589..0005c542 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -11,6 +11,8 @@ params.sample_config_tsv = "" params.input_directory = "" params.star_exe = "${workflow.projectDir}/../../third_party/STAR" +params.skip_alignment = false + // Functions def parse_sample_config(sample_config_tsv) { @@ -87,32 +89,35 @@ workflow ALTSEQ { | filter { it[0] != null } | set { merged_fq_files } - // Invoke STAR Solo - align( - genome_dir, - params.star_exe, - barcode_whitelist, - merged_fq_files, - ) - - // Sort the cram files - align.out.aligned_bam - | map { [ - [ - name: it[0].name, - id: it[0].name, - barcode_index: it[0].barcode_index, - lane: it[0].lane - ], - it[1], - genome_fa, - ] } - | sort_and_encode_cram - - // Publish CRAM files. - sort_and_encode_cram.out.cram - | map { ["${it[0].name}.sorted.cram", it[1]] } - | publish_and_rename + if (!params.skip_alignment) { + + // Invoke STAR Solo + align( + genome_dir, + params.star_exe, + barcode_whitelist, + merged_fq_files, + ) + + // Sort the cram files + align.out.aligned_bam + | map { [ + [ + name: it[0].name, + id: it[0].name, + barcode_index: it[0].barcode_index, + lane: it[0].lane + ], + it[1], + genome_fa, + ] } + | sort_and_encode_cram + + // Publish CRAM files. + sort_and_encode_cram.out.cram + | map { ["${it[0].name}.sorted.cram", it[1]] } + | publish_and_rename + } } workflow { diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 4de585a9..4cb9cf3a 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -15,12 +15,17 @@ version=1.0.0 cd "$(dirname "$0")" outdir="output_$version" -sentinel_file="$outdir/process_complete.txt" +status_file="$outdir/status.json" -if [[ -e "$sentinel_file" && -z "$REDO_ALIGNMENT" ]] ; then - echo "Processing already completed, exiting." - echo "To force re-run, set the env var 'REDO_ALIGNMENT=True' or remove $sentinel_file" - exit 0 +# TODO: improve REDO_ALIGNMENT handling - should we be manually removing the work dir? 
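+# The `jq -e` test below exits non-zero when .completed_on is missing or null,
+# so only runs with a recorded completion time are skipped. Illustrative
+# status.json contents (hypothetical): {"completed_on": "2022-08-18T15:00:00-07:00"}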
+ +if [[ -e "$status_file" && -z "$REDO_ALIGNMENT" ]] ; then + # Check to see if the alignment is complete + if jq -e '.completed_on' "$status_file" ; then + echo "Processing already completed, exiting." + echo "To force re-run, set the env var 'REDO_ALIGNMENT=True' or remove $status_file" + exit 0 + fi fi # Dependencies @@ -33,8 +38,6 @@ module load python/3.5.1 source "$PYTHON3_ACTIVATE" source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -# TODO: REDO_ALIGNMENT handling - # Set up sample config sample_config=sample_config.tsv # TODO: Re-enable after production time-out is fixed @@ -54,6 +57,7 @@ NXF_VER=21.10.6 nextflow \ -c $STAMPIPES/nextflow.config \ run "$STAMPIPES"/processes/altseq/altseq.nf \ -with-trace \ + -ansi-log false \ -profile docker,cluster \ -resume \ --input_directory "$SEQ_DIR" \ @@ -62,15 +66,30 @@ NXF_VER=21.10.6 nextflow \ --genome_fa "$GENOME_FA" \ --barcode_whitelist "$BARCODE_WHITELIST" \ --outdir "$outdir" \ - -ansi-log false + --skip_alignment # Upload fastq metadata -python "$STAMPIPES/scripts/altseq/upload_fastq.py" \ +python "$STAMPIPES/scripts/altseq/upload_data.py" \ "$sample_config" \ processing.json \ --output_file_directory "$outdir" -if [[ ! -e "$sentinel_file" ]] ; then - echo "{ completed_on: $(date -Iseconds) }" > "$sentinel_file" +# Create sentinel/status file +if [[ -e "$status_file" ]] ; then + old_date=$(jq .completed_on << "$status_file") + old_status_file=${status_file/json/$old_date}.json + mv "$status_file" "$old_status_file" fi + +# TODO: What else do we want to capture here? It would be nice to at least +# capture the command used and relevant env vars +echo | jq . > "$status_file" < Date: Thu, 18 Aug 2022 15:10:01 -0700 Subject: [PATCH 015/172] Altseq - handle pools with same pool barcodes --- scripts/lims/create_altseq_sample_config.py | 41 +++++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index ae87bd46..76b509c6 100644 --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -38,12 +38,19 @@ def parser_setup(): return parser -def group_data(processing_info): - """ group_data tries to estimate what library pools each library belongs to """ +def group_data(processing_info) -> dict: + """ + group_data tries to estimate what library pools each library belongs to + Returns dict of tuple keys, values are a list of library numbers + """ output = defaultdict(list) for lib in processing_info['libraries']: - key = (lib['barcode1']['reverse_sequence'], lib['lane']) - output[key].append(lib['library']) + lib_number = lib['library'] + key = ( + lib['barcode1']['reverse_sequence'], + lib['lane'], + ) + output[key].append(lib_number) return output @@ -64,10 +71,13 @@ def populate_lib_to_pool(): # Ugly hack lol -def get_pool_for_libs(lib_numbers): +# TODO: Add pool to processing_info endpoint, then we can remove this. 
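+# Queries the `library/` endpoint with a number__in filter, maps each returned
+# library id through lib_to_pool, and returns the set of pool names, asserting
+# that every library belongs to exactly one pool.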
+def get_pools_for_libs(lib_numbers) -> set: + """ Returns dict of {pool: [lib_numbers]} """ global lib_to_pool + pools = set() + numbers = ",".join(str(n) for n in lib_numbers) - pool = None url_addition = "library/" for lib in API.get_list_result( url_addition=url_addition, @@ -75,13 +85,11 @@ def get_pool_for_libs(lib_numbers): query_arguments={"number__in": numbers} ): id = int(lib['id']) - assert len(lib_to_pool[id]) == 1 - if pool is None: - pool = lib_to_pool[id][0] - else: - assert lib_to_pool[id][0] == pool + assert len(lib_to_pool[id]) == 1, "Library LN%d should have exactly one pool" % lib['number'] + pool = lib_to_pool[id][0] + pools.add(pool) - return pool + return pools def to_tsv(label, data): @@ -90,13 +98,13 @@ def to_tsv(label, data): lines.append( "\t".join([label + "_" + pool, str(lane), index]) ) - return "\n".join(lines) + return "\n".join(lines) + "\n" # def create_upload_script(label, data): # -# lines = ["#!/bin/bash"] # base = 'python3 "$STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane " +# lines = ["#!/bin/bash"] # for (prefix, lane_ids) in data: # for num in numbers: # r1 = @@ -113,8 +121,9 @@ def main(): output_data = [] for group_key, numbers in grouped.items(): - pool = get_pool_for_libs(numbers) - output_data.append( (pool, group_key[0], group_key[1], numbers) ) + pools = get_pools_for_libs(numbers) + pool_name = "_and_".join(sorted(pools)) + output_data.append( (pool_name, group_key[0], group_key[1], numbers) ) tsv = to_tsv(label, output_data) From 1341024363376c0045e05428012704d07a8d7f24 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 18 Aug 2022 15:34:24 -0700 Subject: [PATCH 016/172] setup.sh uses processing_information endpoint again --- scripts/flowcells/setup.sh | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 4e0441ec..a6dbdcaa 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -191,40 +191,46 @@ if [[ -z "$nosleep" ]] ; then sleep 300 fi +# Get and read the processing script +python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" +run_type=$( jq -r '.flowcell.run_type' "$json" ) +analysis_dir=$( jq -r '.alignment_group.directory' "$json" ) +mask=$( jq -r '.alignment_group.bases_mask' "$json" ) +run_type=$( jq -r '.flowcell.run_type' "$json" ) +has_umi=$( jq -r '.libraries | map(.barcode1.umi) | any' "$json") + + # Check if read1length=0 -> that means alteseq # Handle specially +# TODO: Check this from processing.json flowcell_data=$(lims_get_all "flowcell_run/?label=$flowcell") read1length=$(echo $flowcell_data | jq -r .read1_length | head -n1) if [[ "$read1length" = "0" ]] ; then echo "Alt-seq run detected" - date=$(echo $flowcell_data | jq -r .date_loaded | sed 's/-//g;s/^20//') - # analysis_dir not set yet, no alignment group - analysis_dir=$FLOWCELLS/FC${flowcell}_${date}_tag mkdir -p "$analysis_dir" + cp processing.json "$analysis_dir/" runscript="$analysis_dir/run.bash" ( echo "#!/bin/bash" echo "export FLOWCELL=$flowcell" echo "export STAMPIPES=$STAMPIPES" - # TODO: Remove once this data is on staging! 
- echo "export LIMS_API_URL=https://lims.altius.org/api" cat "$STAMPIPES"/processes/altseq/process_altseq.bash ) > "$runscript" + + # Create wrapper for cronjob to call + cat > run_bcl2fastq.sh <<__BCL2FASTQ__ +#!/bin/bash +sbatch --cpus 1 \ + --mem '2G' \ + --partition queue0 \ + --job-name "altseq-$flowcell-supervisor" \ + "$runscript" +__BCL2FASTQ__ echo "Run $runscript to start analysis!" exit 0 fi -# Get and read the processing script -python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" -run_type=$( jq -r '.flowcell.run_type' "$json" ) -analysis_dir=$( jq -r '.alignment_group.directory' "$json" ) -mask=$( jq -r '.alignment_group.bases_mask' "$json" ) -run_type=$( jq -r '.flowcell.run_type' "$json" ) -has_umi=$( jq -r '.libraries | map(.barcode1.umi) | any' "$json") - - - if [ -z "$demux" ] ; then bcl_mask=$mask mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes) From cd3c06af398ee24e93419273e9fced77585acff0 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 21 Aug 2022 10:30:13 -0700 Subject: [PATCH 017/172] fix: encode_cram_no_ref now works again Accidentally duplicated the input specification during a git merge --- modules/cram.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/cram.nf b/modules/cram.nf index 672da60c..cacadf9d 100644 --- a/modules/cram.nf +++ b/modules/cram.nf @@ -137,7 +137,6 @@ process encode_cram_no_ref { input: tuple val(meta), path(input_bam) - tuple val(meta), path(input_bam) output: tuple val(meta), path(output_cram_name), emit: cram From d20c2e19e6819dd7fa00cf49615631ff0a94f3ff Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 4 Sep 2022 16:28:52 -0700 Subject: [PATCH 018/172] fix for altseq setup.sh processing --- scripts/flowcells/setup.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index a6dbdcaa..10aaf44f 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -223,8 +223,11 @@ if [[ "$read1length" = "0" ]] ; then sbatch --cpus 1 \ --mem '2G' \ --partition queue0 \ - --job-name "altseq-$flowcell-supervisor" \ - "$runscript" + --job-name "altseq-$flowcell-supervisor" < Date: Thu, 8 Sep 2022 13:43:16 -0700 Subject: [PATCH 019/172] nextflow_clean script proceeds w/o output symlinks Now that we're regularly copying data over rather than symlinking it, it makes sense to remove the work directory in these cases. --- scripts/utility/nextflow_clean.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/utility/nextflow_clean.sh b/scripts/utility/nextflow_clean.sh index 5811d191..8322ca6b 100755 --- a/scripts/utility/nextflow_clean.sh +++ b/scripts/utility/nextflow_clean.sh @@ -114,7 +114,7 @@ process_dir() { # https://stackoverflow.com/questions/7577052/bash-empty-array-expansion-with-set-u # shellcheck disable=SC2199 [[ -z ${symlinks[@]+"${symlinks[@]}"} ]] && - DIE_F "No input symlinks found. dir=%s" "$dir" + INFO_F "No input symlinks found. dir=%s" "$dir" # 3) for symlink in "${symlinks[@]}" ; do @@ -142,7 +142,14 @@ process_dir() { # 3D) Create hard link to real target, replacing soft link local hardlink=$symlink - RUN_IF_FORCE ln -f --logical "$target" "$hardlink" + #if [[ -d "$target" ]] ; then + # # TODO: This is probably slow, is there a faster way? + # # e.g: remove link, create dir, then hardlink the contents? 
+ # RUN_IF_FORCE rm "$hardlink" + # RUN_IF_FORCE cp -r "$target" "$hardlink" + #else + RUN_IF_FORCE ln -f --logical "$target" "$hardlink" + #fi # Triple-check, make sure the hard link and backup symlink point to the same place if [[ "$FORCE" == TRUE ]] ; then From 69ebe3ceb97dec8e8af21b6b71a83cede944add5 Mon Sep 17 00:00:00 2001 From: solexa Date: Thu, 8 Sep 2022 13:56:05 -0700 Subject: [PATCH 020/172] !fixup c273b879 - missed a simple bug. Actually tested this time. --- scripts/utility/nextflow_clean.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utility/nextflow_clean.sh b/scripts/utility/nextflow_clean.sh index 8322ca6b..1ae5ab83 100755 --- a/scripts/utility/nextflow_clean.sh +++ b/scripts/utility/nextflow_clean.sh @@ -117,7 +117,7 @@ process_dir() { INFO_F "No input symlinks found. dir=%s" "$dir" # 3) - for symlink in "${symlinks[@]}" ; do + for symlink in ${symlinks[@]+"${symlinks[@]}"} ; do # 3A) Get symlink target local target target=$(readlink -f "$symlink" || true) From fb8bd404eb28dd7485be0b0c487639c56efc2216 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 19 Oct 2022 15:10:34 -0700 Subject: [PATCH 021/172] Improve alignprocess.py error logging Make it more obvious when and where an alignment cannot be set up --- scripts/alignprocess.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index ac7826e6..4e2ed25c 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -184,17 +184,27 @@ def get_process_template(self, align_id, process_template_id): # Run alignment setup in parallel def setup_alignments(self, align_ids): - self.pool.map(self.setup_alignment, align_ids) + for id, error in self.pool.map(self.setup_alignment, align_ids): + if error: + logging.debug(f"ALN{id} result received, error: {error}") + else: + logging.debug(f"ALN{id} result received, OK") def setup_alignment(self, align_id): - processing_info = self.get_align_process_info(align_id) - alignment = self.api_single_result("flowcell_lane_alignment/%d/" % (align_id)) + try: + processing_info = self.get_align_process_info(align_id) + alignment = self.api_single_result("flowcell_lane_alignment/%d/" % (align_id)) - if self.redo_completed or not alignment['complete_time']: - self.create_script(processing_info, alignment["id"]) - else: - logging.info("Skipping completed alignment %d" % align_id) + if self.redo_completed or not alignment['complete_time']: + self.create_script(processing_info, alignment["id"]) + return (align_id, None) + else: + logging.info("Skipping completed alignment %d" % align_id) + return (align_id, None) + except Exception as e: + logging.exception(f"Could not set up alignment {align_id}: ({e})") + return (align_id, e) def get_lane_file(self, lane_id, purpose): candidates = self.api_list_result("file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id)) @@ -331,7 +341,7 @@ def create_script(self, processing_info, align_id): return False script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) - logging.info(script_file) + logging.info(f"Will write to {script_file}") # Set up & add environment variables From 85b565fd2b8b3c08fc5d6d65c688ac3f0fa1c052 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 19 Oct 2022 15:11:07 -0700 Subject: [PATCH 022/172] Fix alignprocess.py when library_kit_method=null With this fix, we should set `unset LIBRARY_KIT_METHOD` correctly in our bash scripts --- 
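
A condensed sketch of the guard added below, with hypothetical inputs for
illustration; the assumption (taken from the message above) is that a None-valued
entry is what the script generation step later renders as an `unset` line:

    # Hypothetical minimal input, for illustration only
    processing_info = {"libraries": [{"library_kit_method": None}]}
    env_vars = {}

    libraries = processing_info.get("libraries") or []
    kit = libraries[0].get("library_kit_method") if libraries else None
    if kit:
        env_vars["LIBRARY_KIT"] = '"' + kit + '"'
    else:
        # A null kit is recorded as None, which the script generator is
        # assumed to turn into `unset LIBRARY_KIT` (per the commit message).
        env_vars["LIBRARY_KIT"] = None
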
scripts/alignprocess.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 4e2ed25c..7116bd60 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -352,7 +352,10 @@ def create_script(self, processing_info, align_id): env_vars["GENOME"] = alignment['genome_index'] env_vars["ASSAY"] = lane['assay'] env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] - env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' + if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0]['library_kit_method']: + env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' + else: + env_vars["LIBRARY_KIT"] = None if processing_info['flowcell']['paired_end']: env_vars["PAIRED"] = "True" From b2cdab7aff2684637ddb4d51e093e3a1212e887f Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 19 Oct 2022 15:18:36 -0700 Subject: [PATCH 023/172] fixup: Can't use f-strings in current python ver --- scripts/alignprocess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 7116bd60..3c32b48e 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -186,9 +186,9 @@ def get_process_template(self, align_id, process_template_id): def setup_alignments(self, align_ids): for id, error in self.pool.map(self.setup_alignment, align_ids): if error: - logging.debug(f"ALN{id} result received, error: {error}") + logging.debug("ALN%d result received, error: %s" % (id, error)) else: - logging.debug(f"ALN{id} result received, OK") + logging.debug("ALN%d result received, OK" % id) def setup_alignment(self, align_id): @@ -203,7 +203,7 @@ def setup_alignment(self, align_id): logging.info("Skipping completed alignment %d" % align_id) return (align_id, None) except Exception as e: - logging.exception(f"Could not set up alignment {align_id}: ({e})") + logging.exception("Could not set up alignment %d}: (%s)" % (align_id, e)) return (align_id, e) def get_lane_file(self, lane_id, purpose): @@ -341,7 +341,7 @@ def create_script(self, processing_info, align_id): return False script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) - logging.info(f"Will write to {script_file}") + logging.info("Will write to %s" % script_file) # Set up & add environment variables From 0248da830077695528bd38b55a60605050a6b5f0 Mon Sep 17 00:00:00 2001 From: solexa Date: Sun, 20 Nov 2022 10:03:03 -0800 Subject: [PATCH 024/172] Config: Add 137 to retry-with-more-mem exit codes --- nextflow.config | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index ef12186d..72e0de28 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,11 +21,11 @@ profiles { memory = { 1.GB * task.attempt } } withLabel: 'high_mem' { - memory = { 32.GB * task.attempt } + memory = { 32.GB * (2**(task.attempt - 1)) } } // clusterOptions: "--nodeList hpcA04,hpcA05,hpcA08,hpcA09,hpcA10,hpcA11" - clusterOptions: "--exclude hpcA13" + clusterOptions: "--exclude hpcz-0007" } } @@ -33,22 +33,42 @@ profiles { process { executor = 'slurm' queue = 'hpcz-2' - errorStrategy = { task.exitStatus == 143 ? 'retry' : 'terminate' } + errorStrategy = { task.exitStatus in [137, 143] ? 'retry' : 'terminate' } + //errorStrategy = { task.exitStatus in [137, 143] ? 
'retry' : 'finish' } //errorStrategy 'retry' - maxRetries = 3 + maxRetries = 4 memory = { 8.GB * task.attempt } - scratch = true + //clusterOptions: "--exclude hpcz-0009" + //clusterOptions: "--exclude hpcz-0041,hpcz-0042" + // Hopefully causes nextflow to spread jobs out more + // Unclear if this actually anything though, the way we're using it + //clusterOptions: "--distribution=cyclic --exclude=hpcz-0032" withLabel: 'low_mem' { memory = { 1.GB * task.attempt } } withLabel: 'high_mem' { - memory = { 32.GB * task.attempt } + // Doubles each time + memory = { 48.GB * (2**(task.attempt - 1)) } } + + //clusterOptions: "--exclude hpcz-0035,hpcz-0040" } } + ignore_process_failures { + //process.errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' } + process.errorStrategy = 'ignore' + } + + pipefail { + // Untested, but should give better resiliency to shell scripts we run + // The risk is that we have some 'allowable' pipe failures that will cause our jobs to fail when they shouldn't + // (e.g: `process1 | head | ...` may cause process1 to exit with error code) + process.shell = ['/bin/bash', '-ueo','pipefail' ] + } + debug { process.scratch = false } From 4760b6d4d43c95089ef3901f6c465423f347a8ba Mon Sep 17 00:00:00 2001 From: solexa Date: Sun, 20 Nov 2022 10:04:29 -0800 Subject: [PATCH 025/172] fix/rna-agg: two typos in anaquin processing --- processes/rna-star/aggregation/cufflinks_featurecounts.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/processes/rna-star/aggregation/cufflinks_featurecounts.nf b/processes/rna-star/aggregation/cufflinks_featurecounts.nf index 2ba20886..a7506ae5 100644 --- a/processes/rna-star/aggregation/cufflinks_featurecounts.nf +++ b/processes/rna-star/aggregation/cufflinks_featurecounts.nf @@ -432,7 +432,7 @@ process anaquin { script: dilution = 0.0001 collate_threads = 2 - fastq_threds = 2 + fastq_threads = 2 """ anaquin RnaAlign -rgtf "${sequins_ref}" -usequin "${input_bam}" -o anaquin_star bash \$STAMPIPES/scripts/rna-star/aggregate/anaquin_rnaalign_stats.bash anaquin_star/RnaAlign_summary.stats anaquin_star/RnaAlign_summary.stats.info @@ -452,7 +452,7 @@ process anaquin { --threads "${collate_threads}" \ temp_subsample.bam \ tmp.collate \ - | samtools fastq -1 subsample.fq1 -2 subsample.fq1 -0 /dev/null -s /dev/null \ + | samtools fastq -1 subsample.fq1 -2 subsample.fq2 -0 /dev/null -s /dev/null \ -n --threads "${fastq_threads}" # call kallisto on subsampled fastqs From 4cbfafcb7e379c495f4374d63fc7622a2531a8f4 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 4 Dec 2022 16:44:27 -0800 Subject: [PATCH 026/172] altseq - use better publishing strategy --- processes/altseq/altseq.nf | 7 +++---- processes/altseq/nextflow.config | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 processes/altseq/nextflow.config diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 0005c542..d0ec539f 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -5,12 +5,14 @@ import com.xlson.groovycsv.CsvParser include { BCL2DEMUX } from "../../modules/bcl2fastq.nf" include { sort_and_encode_cram } from "../../modules/cram.nf" -include { publish_and_rename; publish } from "../../modules/utility.nf" params.sample_config_tsv = "" params.input_directory = "" params.star_exe = "${workflow.projectDir}/../../third_party/STAR" +params.outdir = "output" +params.publishmode = "link" + params.skip_alignment = false @@ -72,7 +74,6 @@ workflow ALTSEQ { it[1].sort { a, 
b -> a[0] <=> b[0] } .collect{ x -> x[1] } ]} | merge_fq - | publish merge_fq.out // Use groupTuple to group files in R1, R2 pairs @@ -115,8 +116,6 @@ workflow ALTSEQ { // Publish CRAM files. sort_and_encode_cram.out.cram - | map { ["${it[0].name}.sorted.cram", it[1]] } - | publish_and_rename } } diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config new file mode 100644 index 00000000..028a5203 --- /dev/null +++ b/processes/altseq/nextflow.config @@ -0,0 +1,33 @@ +includeConfig "../../nextflow.config" + +process { + // Configure publishing directives + + // merged fastq is published (for now) + withName : "merge_fq" { + publishDir = [ + path: { params.outdir }, + mode: "link", + ] + } + + // Rename cram files to match the input + // Uses the 'meta.name' value inside the process + withName: "sort_and_encode_cram" { + publishDir = [ + path: { params.outdir }, + mode: "link", + saveAs: { f -> "${meta.name}.sorted.cram" }, + ] + } +} + +profiles { + modules { + process { + withName: ".*:BCL2DEMUX:bcl2fastq.*" { + module = "bcl2fastq2/2.20.0.422" + } + } + } +} From d70fdc365bd459f16299936c96fac5bc31392cc5 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 8 Dec 2022 16:31:45 -0800 Subject: [PATCH 027/172] Add basic analysis --- .gitignore | 5 +++- nextflow.config | 6 ++++- processes/altseq/altseq.nf | 29 ++++++++++++++++----- processes/altseq/bin/matrix2csv.sh | 42 ++++++++++++++++++++++++++++++ processes/altseq/nextflow.config | 23 +++++++++++++++- 5 files changed, 96 insertions(+), 9 deletions(-) create mode 100755 processes/altseq/bin/matrix2csv.sh diff --git a/.gitignore b/.gitignore index 905b2ed1..1f03248a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,10 @@ environments/ **/.nextflow* **/*html* -**/*trace.txt* +**/trace*txt* +**/report*txt* + +**/dag*dot* *.fq diff --git a/nextflow.config b/nextflow.config index 72e0de28..61343c28 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,6 +85,11 @@ profiles { temp = 'auto' } } + apptainer { + singularity { + enabled = true + } + } } report { @@ -98,5 +103,4 @@ timeline { } dag { enabled = true - file = "dag.html" } diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index d0ec539f..deef6caf 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -100,6 +100,10 @@ workflow ALTSEQ { merged_fq_files, ) + // "Analyze" the results + + align.out.solo_directory | analyze_solo_dir + // Sort the cram files align.out.aligned_bam | map { [ @@ -113,9 +117,6 @@ workflow ALTSEQ { genome_fa, ] } | sort_and_encode_cram - - // Publish CRAM files. 
- sort_and_encode_cram.out.cram } } @@ -212,8 +213,24 @@ process merge_fq { cpus = 10 out = "${name}.fq.gz" ''' - zcat in.*.fq.gz \ - | bgzip --stdout --threads "!{cpus}" \ - > "!{out}" + zcat in.*.fq.gz \ + | bgzip --stdout --threads "!{cpus}" \ + > "!{out}" + ''' +} + +process analyze_solo_dir { + input: + tuple val(meta), path("Solo.out") + + output: + tuple val(meta), file("output") + + shell: + ''' + for dir in Gene GeneFull GeneFull_Ex50pAS GeneFull_ExonOverIntron ; do + mkdir -p "output/$dir" + bash -x matrix2csv.sh "Solo.out/$dir/filtered/" > "output/$dir/counts.csv" + done ''' } diff --git a/processes/altseq/bin/matrix2csv.sh b/processes/altseq/bin/matrix2csv.sh new file mode 100755 index 00000000..52754846 --- /dev/null +++ b/processes/altseq/bin/matrix2csv.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# From: https://kb.10xgenomics.com/hc/en-us/articles/360023793031-How-can-I-convert-the-feature-barcode-matrix-from-Cell-Ranger-3-to-a-CSV-file- + +dir=$(readlink -f "${1:-$PWD}") + +die() { + echo "$@" + exit 1 +} + +testfiles() { + for f in "$@" ; do + [[ -s $f ]] || die "file $f does not exist" + done +} + +barcodes=$dir/barcodes.tsv +features=$dir/features.tsv +matrix=$dir/matrix.mtx + + +testfiles "$barcodes" "$features" "$matrix" + +tmpdir=$(mktemp -d) +{ + set -e + cd "$tmpdir" + + # Print line number along with contents of barcodes.tsv.gz and genes.tsv.gz + < "$barcodes" awk -F "\t" 'BEGIN { OFS = "," }; {print NR,$1}' | sort -t, -k 1b,1 > numbered_barcodes.csv + < "$features" awk -F "\t" 'BEGIN { OFS = "," }; {print NR,$1,$2,$3}' | sort -t, -k 1b,1 > numbered_features.csv + + # Skip the header lines and sort matrix.mtx.gz + < "$matrix" tail -n +4 | awk -F " " 'BEGIN { OFS = "," }; {print $1,$2,$3}' | sort -t, -k 1b,1 > feature_sorted_matrix.csv + < "$matrix" tail -n +4 | awk -F " " 'BEGIN { OFS = "," }; {print $1,$2,$3}' | sort -t, -k 2b,2 > barcode_sorted_matrix.csv + + # Use join to replace line number with barcodes and genes + # Writes to stdout + join -t, -1 1 -2 1 numbered_features.csv feature_sorted_matrix.csv | cut -d, -f 2,3,4,5,6 | sort -t, -k 4b,4 | join -t, -1 1 -2 4 numbered_barcodes.csv - | cut -d, -f 2,3,4,5,6 +} +rm -rf "$tmpdir" diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index 028a5203..c76ae982 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -3,11 +3,14 @@ includeConfig "../../nextflow.config" process { // Configure publishing directives + // Convention: All files are saved under ${params.outdir}/${meta.name} + // merged fastq is published (for now) withName : "merge_fq" { publishDir = [ path: { params.outdir }, mode: "link", + saveAs: { f -> f.replace("_R","/R") } // okay this feels really fragile. ] } @@ -17,9 +20,27 @@ process { publishDir = [ path: { params.outdir }, mode: "link", - saveAs: { f -> "${meta.name}.sorted.cram" }, + saveAs: { f -> "${meta.name}/sorted.cram" }, ] } + + // StarSOLO + withName: "align" { + publishDir = [ + path: { params.outdir }, + mode: "link", + saveAs: { f -> f == "Solo.out" ? "${meta.name}/Solo.out" : null } + ] + } + + // StarSOLO analysis + withName: "analyze_solo_dir" { + publishDir = [ + path: { params.outdir }, + mode: "link", + saveAs: { f -> f == "output" ? 
"${meta.name}/analysis" : null } + ] + } } profiles { From df9c611de068d181add68a9254da086eb031dcae Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 12 Dec 2022 12:33:02 -0800 Subject: [PATCH 028/172] Update sample_config to new format --- scripts/lims/create_altseq_sample_config.py | 117 ++++++-------------- 1 file changed, 34 insertions(+), 83 deletions(-) diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index 76b509c6..0d7fd24b 100644 --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -7,25 +7,9 @@ from collections import defaultdict -sys.path.insert( - 1, os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "..", - "lims", - "stamlims_api" -)) - -from stamlims_api import rest - log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" log = logging.getLogger(__name__) -rest.DEFAULT_ITEM_LIMIT = 10000 -API = rest.setup_api({rest.RAISE_ON_ERROR_VAR: True}) - - -lib_to_pool = defaultdict(list) - def parse_json(filename): with open(filename) as f: return json.loads(f.read()) @@ -54,86 +38,53 @@ def group_data(processing_info) -> dict: return output -def populate_lib_to_pool(): - global lib_to_pool - url_regex = re.compile("(\d+)") - for pool in API.get_list_result( - url_addition="library_pool/", - item_limit=10000): - name = pool['object_name'] - for lib_url in pool['libraries']: - match = url_regex.search(lib_url) - if not match: - raise Exception("lib url %s didn't match" % lib_url) - lib_id = int(match.group(1)) - lib_to_pool[lib_id].append(name) - - - -# Ugly hack lol -# TODO: Add pool to processing_info endpoint, then we can remove this. -def get_pools_for_libs(lib_numbers) -> set: - """ Returns dict of {pool: [lib_numbers]} """ - global lib_to_pool - pools = set() - - numbers = ",".join(str(n) for n in lib_numbers) - url_addition = "library/" - for lib in API.get_list_result( - url_addition=url_addition, - item_limit=200, - query_arguments={"number__in": numbers} - ): - id = int(lib['id']) - assert len(lib_to_pool[id]) == 1, "Library LN%d should have exactly one pool" % lib['number'] - pool = lib_to_pool[id][0] - pools.add(pool) - - return pools - def to_tsv(label, data): - lines = ["name\tlane\tbarcode_index"] - for (pool, index, lane, _numbers) in sorted(data, key=lambda x: (x[2],x[0])): - lines.append( - "\t".join([label + "_" + pool, str(lane), index]) - ) + lines = ["pool_name\tsample_name\tlane\tbarcode_index"] + for datum in sorted(data, key=lambda d: (d['lane'], d['pool_name'], d['sample_name'])): + lines.append("\t".join([ + label + "_" + datum["pool_name"], + datum["sample_name"], + str(datum["lane"]), + datum["barcode_index"], + ] )) return "\n".join(lines) + "\n" - -# def create_upload_script(label, data): -# -# base = 'python3 "$STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane " -# lines = ["#!/bin/bash"] -# for (prefix, lane_ids) in data: -# for num in numbers: -# r1 = -# lines.append( -# base + " --attach_file_objectid %d --attach_file %s --attach-file-purpose r1-fastq --attach-file-type fastq" % (num +def get_config_info(processing_data, ds_number: int): + pass + +def construct_config_entries(data: dict) -> [dict]: + # Maps library number -> (pool_name, barcode1) + pool_lookup_table = {} + for (pool, values) in data["library_pools"].items(): + value = (pool, values["barcode1"]) + for lib_str in values['libraries']: + lib_num = int(lib_str.replace("LN", "")) # Discard the 'LN' prefix + if lib_num 
in pool_lookup_table: + raise ValueError("Libnum in more than one pool, %s and %s" % (pool_lookup_table[lib_num], value)) + pool_lookup_table[lib_num] = pool + + results = [] + for (library) in data['libraries']: + datum = { + "barcode_index" : library["barcode_index"], + "sample_name" : library["samplesheet_name"], + "pool_name" : pool_lookup_table[library["library"]], + "lane" : library["lane"], + } + results.append(datum) + return results def main(): poptions = parser_setup().parse_args() label = poptions.data["flowcell"]["label"] - grouped = group_data(poptions.data) - - populate_lib_to_pool() - output_data = [] - for group_key, numbers in grouped.items(): - pools = get_pools_for_libs(numbers) - pool_name = "_and_".join(sorted(pools)) - output_data.append( (pool_name, group_key[0], group_key[1], numbers) ) + entries = construct_config_entries(poptions.data) - - tsv = to_tsv(label, output_data) + tsv = to_tsv(label, entries) with open(poptions.output, 'w') as f: f.write(tsv) - # upload_data = [] - # for library in output_data["libraries"]: - # upload_data.append(library['id'], library['number']) - # upload_script = create_upload_script(label, output_data) - if __name__ == "__main__": main() From e6c29d7a599ca07a2f2e1e138611019b69c95041 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 12 Dec 2022 12:33:25 -0800 Subject: [PATCH 029/172] Add altseq config example template to show usage --- processes/altseq/sample_config_template.tsv | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 processes/altseq/sample_config_template.tsv diff --git a/processes/altseq/sample_config_template.tsv b/processes/altseq/sample_config_template.tsv new file mode 100644 index 00000000..0c5cd697 --- /dev/null +++ b/processes/altseq/sample_config_template.tsv @@ -0,0 +1,13 @@ +pool_name sample_name lane barcode_index +H00000SX5_LP1000 DS10000 1 GGACTCCT-CACCGTGGATCA +H00000SX5_LP1000 DS10001 1 GGACTCCT-TCGGTTCGCTCA +H00000SX5_LP1000 DS10002 1 GGACTCCT-ATGTTGGAGCTA +H00000SX5_LP1001 DS20000 1 TCCTGAGC-TTCCAGTGTCTG +H00000SX5_LP1001 DS20001 1 TCCTGAGC-CACCGtGGATCA +H00000SX5_LP1001 DS20002 1 TCCTGAGC-ATGTTGGAGCTA +H00000SX5_LP1002 DS30000 2 GGACTCCT-CACCGTGGATCA +H00000SX5_LP1002 DS30001 2 GGACTCCT-TCGGTTCGCTCA +H00000SX5_LP1002 DS30002 2 GGACTCCT-ATGTTGGAGCTA +H00000SX5_LP1003 DS40000 2 TCCTGAGC-TTCCAGTGTCTG +H00000SX5_LP1003 DS40001 2 TCCTGAGC-CACCGtGGATCA +H00000SX5_LP1003 DS40002 2 TCCTGAGC-ATGTTGGAGCTA From 210e4c69e4868384b95c928a6c58a3af49856f9d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 14 Dec 2022 10:29:20 -0800 Subject: [PATCH 030/172] Changes toward altseq per-sample stats --- modules/bcl2fastq.nf | 9 +-- processes/altseq/README.md | 66 ++++++++++++++++++ processes/altseq/altseq.nf | 113 ++++++++++++++++++++++++------- processes/altseq/bin/analyze.py | 65 ++++++++++++++++++ processes/altseq/nextflow.config | 10 +++ 5 files changed, 236 insertions(+), 27 deletions(-) create mode 100644 processes/altseq/README.md create mode 100755 processes/altseq/bin/analyze.py diff --git a/modules/bcl2fastq.nf b/modules/bcl2fastq.nf index 391620d2..e4a657e5 100644 --- a/modules/bcl2fastq.nf +++ b/modules/bcl2fastq.nf @@ -23,9 +23,6 @@ def parse_sample_config(sample_config_tsv) { for (sample in data) { sample_info.add( sample ) } - for (s in sample_info) { - println "sample is ${s}" - } return sample_info } @@ -117,6 +114,7 @@ workflow BCL2DEMUX { // } process generate_samplesheet { + executor = "local" input: tuple val(header), val(sample_info) @@ -124,6 +122,10 @@ process 
generate_samplesheet { output: file("Samplesheet.csv") + // TODO: This is silly. We create a bash script to just + // write a file. However, despite my best efforts, I + // haven't figured out how to get this to work as an + // `exec` block. shell: settings = "" sheet_parts = [ @@ -137,7 +139,6 @@ process generate_samplesheet { ] sheet = sheet_parts.join("\n") - ''' echo '!{sheet}' > Samplesheet.csv ''' diff --git a/processes/altseq/README.md b/processes/altseq/README.md new file mode 100644 index 00000000..ecaeb6a8 --- /dev/null +++ b/processes/altseq/README.md @@ -0,0 +1,66 @@ +# Description + +This process implements our Alt-seq processing pipeline + +## Usage: + +TODO + +## Developing + +TODO + +## Notes + +We use StarSOLO, which is like CellRanger. Designed for single-cell analysis, we use it by telling the software that the barcode internal to the pool is a "cell barcode." All references by STAR and output files to "cell barcodes" actually refer to individual library barcodes. That is, each "cell barcode" corresponds to a DSnumber and LNnumber. + +### CellReads.stats interpretation + +The meaning of `CellReads.stats` columns is not obvious, but it is documented in a github issue [here](https://github.com/alexdobin/STAR/issues/1501). +For convenience, I've collected the answers below: + +Column descriptions: + +`CB` +: cell barcode +`cbMatch` +: number of reads that matched the cell barcode +`cbPerfect` +: number of perfect match on cell barcode +`exonic` +: number of reads mapping on exonic (only for `GeneFull_Ex50pAS` and `GeneFull_ExonOverIntron`) +`intronic` +: number of reads mapping on intronic (only for `GeneFull_Ex50pAS` and `GeneFull_ExonOverIntron`) +`mito` +: number of reads mapping on mitochondrial genome +`genomeU` +: number of reads mapping to one locus in the genome +`genomeM` +: number of reads mapping to multiple loci in the genome +`featureU` +: number of reads mapping to one feature (Gene, GeneFull, etc) +`featureM` +: number of reads mapping to multiple features +`cbMMunique` +: number of reads with cell barcodes that map with mismatches to one barcode in the passlist +`cbMMmultiple` +: number of reads with cell barcodes that map with mismatches to multiple barcodes in the passlist +`exonicAS` +: number of reads mapping antisense to annotated exons (only for `GeneFull_Ex50pAS`) +`intronicAS` +: number of reads mapping antisense to annotated introns (only for `GeneFull_Ex50pAS`) +`countedU` +: number of unique-gene reads that were used in counting UMIs (!= number of UMIs), i.e. reads with valid CB/UMI/gene +`countedM` +: number of multi-gene reads that were used in counting UMIs (!= number of UMIs), i.e. 
reads with valid CB/UMI/gene +`nUMIunique` +: total number of counted UMI +`nGenesUnique` +: number of genes having non 0 counts +`nUMImulti` +: number of UMI for multi-gene reads, if requested +`nGenesMulti` +: number of genes supported by just multi-gene reads, if requested + +NB: +> All columns are read counts, except for CB and the last 4: nUMIunique nGenesUnique nUMImulti nGenesMulti diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index deef6caf..84816942 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -33,7 +33,8 @@ def validate_sample_config(sample_info) { sample_info.collect { assert it.lane > 0 : "Sample has no lane: ${it}" assert it.barcode_index.size() > 0 : "Sample has no barcode index: ${it}" - assert it.name : "Sample has no name: ${it}" + assert it.pool_name : "Sample has no pool name: ${it}" + assert it.sample_name : "Sample has no sample name: ${it}" } } @@ -51,27 +52,45 @@ workflow ALTSEQ { main: + // We demux at the library pool level. + // So here, we do some work to get just those pools + // We take the pool_name and just the first part of the barcode_index (the part before '-') + sample_info_for_bcl2fastq = sample_info.collect { info -> [ + name: info.pool_name, + lane: info.lane, + barcode_index: info.barcode_index.split('-')[0], + ]}.unique() // And filter to just the unique ones + // Run BCL2Fastq on all files BCL2DEMUX( input_dir, - sample_info, + sample_info_for_bcl2fastq, tiles, ) // Merge and publish fastq files - bcl_fq_regex = /(.*)_S[0-9]+_(L[0-9]+)_(R[1-2])_.*/ BCL2DEMUX.out | flatten() - | map { - match = (it.baseName =~ bcl_fq_regex)[0]; - [ match[1,3], [match[2], it]] + | map { fq_file -> + // Extract the pool name, R1/R2, and the lane + // output is shaped like [[prefix, read], [lane, file]] + bcl_fq_regex = /(.*)_S[0-9]+_(L[0-9]+)_(R[1-2])_.*/ + match = (fq_file.baseName =~ bcl_fq_regex)[0] + [ match[1,3], [match[2], fq_file]] } - | filter { it[0][0] != "Undetermined" } + | filter { + // We don't want to do further processing on Undetermined samples + it[0][0] != "Undetermined" + } + // Now we group it together by pool name, lane, and read | groupTuple - // regroup - | map {[ - "${it[0][0]}_${it[0][1]}", - it[1].sort { a, b -> a[0] <=> b[0] } .collect{ x -> x[1] } + | map { + readname, files -> [ + // Convert readname to string + "${readname[0]}_${readname[1]}", + //Make sure files are in order by lane + files.sort { lane, filename -> lane <=> lane } + .collect { lane, filename -> filename } ]} | merge_fq @@ -79,15 +98,14 @@ workflow ALTSEQ { // Use groupTuple to group files in R1, R2 pairs | map { [ it.baseName.replaceAll(/_R[12]/, "_RX"), it ] } | groupTuple(size: 2, sort: { a, b -> { a.baseName <=> b.baseName } } ) - | map { it[1] } + // drop prefix, no longer needed + | map { prefix, info -> info } // Re-associate the metadata - | map {[ - sample_info.find(s -> it[0].baseName.startsWith("${s.name}_R") ), - it[0], - it[1], + | map {r1, r2 -> [ + sample_info_for_bcl2fastq.find(s -> r1.baseName.startsWith("${s.name}_R") ), + r1, + r2, ]} - // Exclude Undetermined files - | filter { it[0] != null } | set { merged_fq_files } if (!params.skip_alignment) { @@ -102,7 +120,20 @@ workflow ALTSEQ { // "Analyze" the results - align.out.solo_directory | analyze_solo_dir + // First, we pair up the analysis with the expected list of samples + // (This key will help us decode pool/barcode -> sample) + create_sample_configs(params.sample_config_tsv) + | flatten() + | map { fn -> [fn.baseName, fn] } + | set { 
per_pool_sample_configs } + + align.out.solo_directory + | map { meta, solo_dir -> ["${meta.name}_lane${meta.lane}", meta, solo_dir ] } + | join(per_pool_sample_configs) + | map { key, meta, solodir, config -> [meta, config, solodir] } + | set {to_analyze} + + analyze_solo_dir(to_analyze) // Sort the cram files align.out.aligned_bam @@ -118,6 +149,11 @@ workflow ALTSEQ { ] } | sort_and_encode_cram } + + // Debugging section - use `nextflow run -dump-channels` to write channel contents to terminal + merged_fq_files.dump(tag: "merged_fq_files", pretty: true) + per_pool_sample_configs.dump(tag: "per_pool", pretty: true) + to_analyze.dump(tag: "to_analyze", pretty: true) } workflow { @@ -134,6 +170,7 @@ workflow { // test workflow workflow test { + println "Running test workflow..." def star_exe = file("${workflow.projectDir}/../../third_party/STAR") def genome_dir = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/") @@ -143,7 +180,6 @@ workflow { def sample_info = parse_sample_config(file(params.sample_config_tsv).text) validate_sample_config(sample_info) - ALTSEQ(genome_dir, genome_fa, barcode_whitelist, "s_[1-4]_1234", params.input_directory, sample_info) } @@ -153,6 +189,7 @@ process align { memory "108681M" cpus {cpus} scratch false // Was filling up tmp dirs + tag "${meta.name}" input: path genome_dir @@ -220,17 +257,47 @@ process merge_fq { } process analyze_solo_dir { + scratch false + input: - tuple val(meta), path("Solo.out") + tuple val(meta), file(sample_config), path("Solo.out") output: tuple val(meta), file("output") shell: ''' + sed 's/[ACTGN]*-//' < '!{sample_config}' > barcode.config for dir in Gene GeneFull GeneFull_Ex50pAS GeneFull_ExonOverIntron ; do - mkdir -p "output/$dir" - bash -x matrix2csv.sh "Solo.out/$dir/filtered/" > "output/$dir/counts.csv" + outdir=output/$dir + allcountsfile=$outdir/allcounts.csv + mkdir -p "$outdir" + bash matrix2csv.sh "Solo.out/$dir/filtered/" > "$allcountsfile" + cat barcode.config | while read name barcode ; do + cat "$allcountsfile" \ + | awk -F, -vbarcode=$barcode -vname=$name \ + '$1 == barcode { print $2 "," $3 "," $5 }' \ + > "$outdir/$name.counts.csv" + done + analyze.py "Solo.out/$dir/CellReads.stats" "barcode.config" "$outdir" done ''' } + +process create_sample_configs { + scratch false + executor "local" + // TODO: Take sample_config as val + // That way we don't rely on column ordering + input: + path sample_config + output: + file("configs/*") + + shell: + ''' + mkdir configs + awk < '!{sample_config}' \ + 'NR > 1 { print $2 "\t" $4 > "configs/" $1 "_lane" $3 ".config"}' + ''' +} diff --git a/processes/altseq/bin/analyze.py b/processes/altseq/bin/analyze.py new file mode 100755 index 00000000..1cd192f2 --- /dev/null +++ b/processes/altseq/bin/analyze.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import argparse +import csv +import os +import pathlib +import pprint + +def parse_args(): + parser = argparse.ArgumentParser( + prog="analyze.py", + description="Parses CellRanger-style output and summarizes by barcode", + ) + parser.add_argument("cellreads") + parser.add_argument("barcode_config_file") + parser.add_argument("output_directory") + return parser.parse_args() + + +def parse_barcode_config(filename): + cfg = {} + with open(filename) as f: + for line in f.readlines(): + (name, barcode) = line.strip().split("\t") + cfg[barcode] = name + return cfg + +def parse_cellreads(filename): + with open(filename) as f: + return [*csv.DictReader(f, delimiter="\t")] + +def 
write_sample(output_directory, sample): + if not sample.get("name", None): + # Skip barcodes not in our list + return + output = os.path.join(output_directory, ("%s.stats.txt" % sample["name"])) + output_keys = [ + "cbMatch", "cbPerfect", + "exonic", "intronic", + "mito", + "genomeU", "genomeM", + "featureU", "featureM", + "nGenesUnique", + "exonicAS", + "intronicAS", + ] + with open(output, 'w') as f: + for key in output_keys: + if key in sample: + f.write("%s\t%s\n" % (key, sample[key])) + + +def main(): + opts = parse_args() + cfg = parse_barcode_config(opts.barcode_config_file) + samples = parse_cellreads(opts.cellreads) + for sample in samples: + sample['name'] = cfg.get(sample['CB'], None) + + pathlib.Path(opts.output_directory).mkdir(parents=True, exist_ok=True) + for sample in samples: + write_sample(opts.output_directory, sample) + +if __name__ == "__main__": + main() diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index c76ae982..400cea9a 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -40,6 +40,7 @@ process { mode: "link", saveAs: { f -> f == "output" ? "${meta.name}/analysis" : null } ] + module = "openssl-dev/1.0.1t" } } @@ -49,6 +50,15 @@ profiles { withName: ".*:BCL2DEMUX:bcl2fastq.*" { module = "bcl2fastq2/2.20.0.422" } + //withName: "analyze_solo_dir" { + //} } } + singularity { + singularity.enabled = true + + // Bind in /net/seq/data2/sequencers as readonly + // This is necessary for the bcl2fastq step. + singularity.runOptions = "--bind /net/seq/data2/sequencers/:/net/seq/data2/sequencers:ro" + } } From f5dcc7661b7e3d43a739616027ccee168a6add42 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 14 Dec 2022 10:32:21 -0800 Subject: [PATCH 031/172] Reformat ./fetch.sh as Makefile --- third_party/Makefile | 6 ++++++ third_party/fetch.sh | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 third_party/Makefile delete mode 100755 third_party/fetch.sh diff --git a/third_party/Makefile b/third_party/Makefile new file mode 100644 index 00000000..6a166092 --- /dev/null +++ b/third_party/Makefile @@ -0,0 +1,6 @@ +all : STAR + +STAR: + wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220601/STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip + unzip STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip + rm STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip diff --git a/third_party/fetch.sh b/third_party/fetch.sh deleted file mode 100755 index a78ad65d..00000000 --- a/third_party/fetch.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# Fetch STAR -wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220601/STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip -unzip STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip -rm STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip From 2ce9f1ebcc1b675151f52193d4c394a3b1a35887 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 8 Jan 2023 16:13:13 -0800 Subject: [PATCH 032/172] Update altseq/upload_data.py --- scripts/altseq/upload_data.py | 613 ++++++++++++++++++++++------------ 1 file changed, 394 insertions(+), 219 deletions(-) diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index c6ae13df..cfd1a30b 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -1,5 +1,7 @@ -#pylint disable=invalid-whitespace, invalid-name - +#!/usr/bin/env python3 +""" +Uploads all the results of alt-seq processing to LIMS +""" import re import csv @@ -10,143 +12,247 @@ import 
logging import os import sys -import time +from functools import lru_cache from collections import defaultdict +# Make sure we can load our vendored stamlims_api dependency sys.path.insert( - 1, os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "..", - "lims", - "stamlims_api" -)) + 1, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", "lims", "stamlims_api" + ), +) -from stamlims_api.lims import aggregations, content_types -from stamlims_api import rest -lane_tags = None -flowcell_lane_cache = dict() -flowcell_contenttype = None +from stamlims_api import rest # pylint: disable=wrong-import-position,import-error -log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -log = logging.getLogger('upload_data.py') +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +LOG = logging.getLogger("upload_data.py") script_options = { "base_api_url": None, - "basedir": os.getcwd(), "quiet": False, "debug": False, - + "dry_run": False, } -def parser_setup(): - - parser = argparse.ArgumentParser() - - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token.") +class HashableDict(dict): + """ + A simple hashable dict + Helps cache our GET requests even w/ query params + """ - parser.add_argument("sample_config", - help="The sample_config.tsv file") - parser.add_argument("processing_json", - help="The processing.json file") - parser.add_argument("--output_file_directory", default=".") + def __hash__(self): + return hash(frozenset(self.items())) - parser.add_argument("--skip_md5", dest="skip_md5", action="store_true", - help="Don't calculate md5sum") +def parser_setup(): + """Command-line argument setup""" + parser = argparse.ArgumentParser() - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + run_opts = parser.add_argument_group("core params") + log_opts = parser.add_argument_group("logging options") + lims_opts = parser.add_argument_group("lims options") + + log_opts.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages (only WARN and higher).", + ) + log_opts.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages.", + ) + + lims_opts.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + lims_opts.add_argument( + "-t", "--token", dest="token", help="Your authentication token." + ) + + run_opts.add_argument("sample_config", help="The sample_config.tsv file") + run_opts.add_argument("processing_json", help="The processing.json file") + run_opts.add_argument( + "--output_file_directory", + default=".", + help="The output directory files are stored in. 
Defaults to cwd.", + ) + + run_opts.add_argument( + "--skip_md5", + dest="skip_md5", + action="store_true", + help="Don't calculate md5sum (debug/dev only)", + ) + + run_opts.add_argument( + "-n", + "--dry_run", + dest="dry_run", + action="store_true", + help="Do not upload anything to LIMS, instead print actions that would be taken", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser def md5sum_file(path): + """Calculates the md5sum of a file's contents""" md5sum = hashlib.md5() - with open(path, 'rb') as f: - for chunk in iter(lambda: f.read(1024*1024), b''): + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): md5sum.update(chunk) return md5sum.hexdigest() -def url_join(*args): - url = "/".join([ x.rstrip('/') for x in args ]) - return url -class UploadLIMS(object): - - def __init__(self, api_url, token): - self.count_types = {} - self.flowcelllane_contenttype = None - self.alignment_contenttype = None - self.aggregation_contenttype = None - self.flowcell_lane_cache = {} - self.api = rest.setup_api({rest.LIMS_URL_OPT_VAR: api_url, - rest.LIMS_TOKEN_OPT_VAR: token, - rest.RAISE_ON_ERROR_VAR: True}) - self.get_cache = {} +def parse_counts_file(counts_file: str): + """ + Given a file name, reads a stats file + format: one stat per line: `name value` (separated by whitespace) + returns a dict of str->int + """ + stats = {} + with open(counts_file, "r") as counts: + for line in counts: + values = line.split() + count_type_name = values[0] + if not count_type_name: + continue + count = int(values[1]) + stats[count_type_name] = count + return stats + + +def build_counts(alignment_id, counts_file): + """ + Convert stats into a form ready to be uploaded to LIMS with the + bulk-stat-create endpoint + """ + parsed_stats = parse_counts_file(counts_file) + return { + "object_id": alignment_id, + "content_type": "SequencingData.flowcelllanealignment", + "stats": parsed_stats, + } + + +class UploadLIMS: + """ + Contains the logic for uploading things to LIMS + Uses caching for most GET requests + """ + + def __init__(self, api_url, token, dry_run=False, skip_md5=False): + # self.count_types = {} + # self.flowcelllane_contenttype = None + # self.alignment_contenttype = None + # self.aggregation_contenttype = None + self.api = rest.setup_api( + { + rest.LIMS_URL_OPT_VAR: api_url, + rest.LIMS_TOKEN_OPT_VAR: token, + rest.RAISE_ON_ERROR_VAR: True, + } + ) + self.dry_run = dry_run + self.skip_md5 = skip_md5 + @lru_cache(maxsize=None) def get(self, url): - if url not in self.get_cache: - self.get_cache[url] = self.api.get_single_result(url) - return self.get_cache[url] + """Cached version of api.get_single_result""" + return self.api.get_single_result(url) - def get_by_full_url(self, url): - if url not in self.get_cache: - self.get_cache[url] = self.api.get_single_result(url=url) - return self.get_cache[url] - - def get_by_id(self, base_url, id, message=None): - url = "%s/%d/" % (base_url, id) + def get_by_id(self, base_url, object_id, err_message=None): + """Constructs url from ID and calls get""" + url = "%s/%d/" % (base_url, object_id) result = self.get(url) if not result: - if message is None: - message = "Failed to fetch %s" % url - log.critical(message) + if err_message is None: + err_message = "Failed to fetch %s" % url + LOG.critical(err_message) return result - def get_single_result(self, fetch_url, query=None, field=None): - """ - Using a list API url that should bring up a single item, retrieve that single 
item if it exists. - """ - result = self.api.get_single_list_result(url_addition=fetch_url, query_arguments=query) + @lru_cache(maxsize=None) + def _get_single_result(self, fetch_url, query=None, field=None): + """Internal memo-izable function, do not use directly""" + result = self.api.get_single_list_result( + url_addition=fetch_url, query_arguments=query + ) if result is None: return None if field is not None: return result[field] return result - def get_list_result(self, url, query=None): - return self.api.get_list_result( - url_addition=url, - query_arguments=query, - item_limit=1000000, - page_size=1000, - ) + def get_single_result(self, fetch_url, query=None, field=None): + """ + Using a list API url that should bring up a single item, retrieve that + single item if it exists. + """ + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + return self._get_single_result(fetch_url, query, field) + + # Not currently used + # @lru_cache(maxsize=None) + # def _get_list_result(self, url, query=None): + # return self.api.get_list_result( + # url_addition=url, + # query_arguments=query, + # item_limit=1000000, + # page_size=1000, + # ) + # + # def get_list_result(self, url, query=None): + # if isinstance(query, dict) and not isinstance(query, HashableDict): + # query = HashableDict(query) + # LOG.debug("Query is now: %s", query) + # return self._get_list_result(url, query) def put(self, *args, **kwargs): - # TODO: s/patch/put/ + """ + PUT data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have put %s, %s", args, kwargs) + return None + # FIXME: Should use PUT method once API lib supports it return self.api.patch_single_result(*args, **kwargs) def post(self, *args, **kwargs): + """ + POST data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have post %s, %s", args, kwargs) + return None return self.api.post_single_result(*args, **kwargs) - def patch(self, *args, **kwargs): - return self.api.patch_single_result(*args, **kwargs) + # def patch(self, *args, **kwargs): + # if self.dry_run: + # LOG.info("Dry run, would have patch %s, %s", args, kwargs) + # return None + # return self.api.patch_single_result(*args, **kwargs) - def get_flowcell_url_by_label(self, label): - return self.get_single_result('flowcell_run/', - field = 'url', - query={"label":label}) + # def get_flowcell_url_by_label(self, label): + # return self.get_single_result( + # "flowcell_run/", field="url", query={"label": label} + # ) def get_contenttype(self, contenttype_name): """ @@ -156,185 +262,232 @@ def get_contenttype(self, contenttype_name): (appname, modelname) = contenttype_name.split(".") query = { - 'app_label': appname, - 'model': modelname, + "app_label": appname, + "model": modelname, } - ct = self.get_single_result('content_type/', query=query) + ct = self.get_single_result("content_type/", query=query) if not ct: - log.critical("Could not fetch content type %s" % contenttype_name) + LOG.critical("Could not fetch content type %s", contenttype_name) return ct def get_file_purpose_url(self, slug): - return self.get_single_result('file_purpose/', - query={"slug": slug}, - field="url") - - def get_file_type(self, slug): - return self.get_single_result('file_type/', - field="url", - query={"slug":slug}) + """Get file purpose url from slug""" + return self.get_single_result( + "file_purpose/", query={"slug": slug}, field="url" + ) + def get_file_type_url(self, slug): + """Gets the file type URL for a slug""" + return 
self.get_single_result("file_type/", field="url", query={"slug": slug}) - def upload_directory_attachment(self, path, contenttype_name, object_id, file_purpose=None): + def upload_directory_attachment( + self, path, contenttype_name, object_id, file_purpose=None + ): + """Uploads a single directory to a LIMS object""" path = os.path.abspath(path) - if not (contenttype_name and object_id): - log.error("Cannot attach file %s without both content type and object_id" % path) + LOG.error( + "Cannot attach file %s without both content type and object_id", path + ) return False contenttype = self.get_contenttype(contenttype_name) - if not contenttype: - log.error("Cannot attach file %s without contenttype result" % path) + LOG.error("Cannot attach file %s without contenttype result", path) return False purpose = self.get_file_purpose_url(file_purpose) - if file_purpose and not purpose: - log.error("Could not find file purpose %s for uploading directory %s" % (file_purpose, path)) + LOG.error( + "Could not find file purpose %s for uploading directory %s", + file_purpose, + path, + ) return False - elif purpose: - log.debug("File purpose: %s" % purpose) - - exists = self.get_single_result('directory/', query={"path":path}) + LOG.debug("File purpose: %s", purpose) + + existing_data = self.get_single_result("directory/", query={"path": path}) + data = existing_data if existing_data else {} + + data.update( + { + "path": path, + "content_type": contenttype["url"], + "object_id": object_id, + "purpose": purpose, + } + ) - if exists: - data = exists - else: - data = {} - - data.update({ - 'path': path, - 'content_type': contenttype['url'], - 'object_id': object_id, - 'purpose': purpose - }) - - if exists: - log.info("Updating information for directory %s" % path) - result = self.put(url=data['url'], data=data) + if existing_data: + LOG.info("Updating information for directory %s", path) + result = self.put(url=data["url"], data=data) else: - log.info("Uploading information for directory %s" % path) + LOG.info("Uploading information for directory %s", path) result = self.post("directory/", data=data) if not result: - log.error("Could not upload directory %s" % path) - log.debug(data) + LOG.error("Could not upload directory %s", path) + LOG.debug(data) else: - log.debug(result) + LOG.debug(result) return True - def upload_file(self, path, contenttype_name, object_ids, file_purpose=None, file_type=None, skip_md5=False): - log.info("Gathering data...") - upload_data = self.get_file_upload_data(path, contenttype_name, file_purpose, file_type, skip_md5) - if skip_md5: - log.info("Skipping md5sum") - upload_data['md5sum'] = '0' + def upload_file( + self, path, contenttype_name, object_ids, file_purpose=None, file_type=None + ): + """ + Upload a file's metadata to LIMS + It will be attached to many objects. 
+ """ + # FIXME: This method makes a GET and PUT request for every single object + # Will require LIMS API updates to enable a more performant solution + + upload_data = self.get_file_upload_data( + path, contenttype_name, file_purpose, file_type + ) + if self.skip_md5: + LOG.info("Skipping md5sum") + upload_data["md5sum"] = "0" else: - log.info("Running md5sum...") - upload_data['md5sum'] = md5sum_file(path) + LOG.debug("Running md5sum...") + upload_data["md5sum"] = md5sum_file(path) - content_type_id = re.search("(\d+)/?$", upload_data['content_type']).group(1) - purpose_id = re.search("(\d+)/?$", upload_data['purpose']).group(1) + content_type_id = re.search(r"(\d+)/?$", upload_data["content_type"]).group(1) + purpose_id = re.search(r"(\d+)/?$", upload_data["purpose"]).group(1) for object_id in object_ids: - data = {"object_id": object_id, **upload_data} - exists = self.get_single_result("file/", - query={"object_id": object_id, - "purpose": purpose_id, - "content_type": content_type_id}) + data = upload_data.update({"object_id": object_id}) + exists = self.get_single_result( + "file/", + query={ + "object_id": object_id, + "purpose": purpose_id, + "content_type": content_type_id, + }, + ) if exists: - log.info("Updating information for file %s: lane %d" % (path, object_id)) - result = self.put(url=exists['url'], data=data) + if exists == data: + LOG.info( + "No change to information for file %s, lane %d, not updating", + path, + object_id, + ) + result = True + else: + LOG.info( + "Updating information for file %s: lane %d", path, object_id + ) + result = self.put(url=exists["url"], data=data) else: - log.info("Uploading information for file %s: lane %d" % (path, object_id)) + LOG.info("Uploading information for file %s: lane %d", path, object_id) result = self.post("file/", data=data) - + if not result: - log.error("Could not upload file %s for ID %d" % (path, object_id)) - log.debug(data) + LOG.error("Could not upload file %s for ID %d", path, object_id) + LOG.debug(data) else: - log.debug(result) - + LOG.debug(result) - - def get_file_upload_data(self, path, contenttype_name, file_purpose=None, file_type=None, skip_md5_check=False): + def get_file_upload_data( + self, path, contenttype_name, file_purpose=None, file_type=None + ): + """ + Gets the file upload data that is easy to query + (notable omission: md5sum, as it takes a long time to calculate) + """ path = os.path.abspath(path) - contenttype = self.get_contenttype(contenttype_name) - if not contenttype: - log.error("Cannot attach file %s without contenttype result" % path) + LOG.error("Cannot attach file %s without contenttype result", path) return False purpose = self.get_file_purpose_url(file_purpose) - if file_purpose and not purpose: - log.error("Could not find file purpose %s for uploading file %s" % (file_purpose, path)) + LOG.error( + "Could not find file purpose %s for uploading file %s", + file_purpose, + path, + ) return False - elif purpose: - log.debug("File Purpose: %s" % purpose) - - ftype = self.get_file_type(file_type) + if purpose: + LOG.debug("File Purpose: %s", purpose) + ftype = self.get_file_type_url(file_type) if file_type and not ftype: - log.error("Could not find file type %s for uploading file %s" % (file_type, path)) + LOG.error( + "Could not find file type %s for uploading file %s", file_type, path + ) return False - elif purpose: - log.debug("File Type: %s" % ftype) - + if file_type: + LOG.debug("File Type: %s", ftype) file_size = os.path.getsize(path) last_modified = 
datetime.datetime.fromtimestamp(os.path.getmtime(path)) - #if exists: - #recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) + # if exists: + # recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) # TODO: Make time-checking work! # Current issue: sub-second precision. data = { - 'path': path, - 'content_type': contenttype["url"], - 'purpose': purpose, - 'filetype': ftype, - 'file_last_modified': last_modified, - 'size_bytes': file_size, + "path": path, + "content_type": contenttype["url"], + "purpose": purpose, + "filetype": ftype, + "file_last_modified": last_modified, + "size_bytes": file_size, } - log.debug(data) + LOG.debug(data) return data - - def get_flowcelllane_contenttype(self): - if not self.flowcelllane_contenttype: - self.flowcelllane_contenttype = self.get_contenttype('SequencingData.flowcelllane') - return self.flowcelllane_contenttype - def get_flowcell_lane(self, flowcell_lane_id): - return self.get_by_id('flowcell_lane', flowcell_lane_id) + """Gets the flowcell lane by ID""" + return self.get_by_id("flowcell_lane", flowcell_lane_id) def get_library(self, library_id): - return self.get_by_id('library', library_id) - + """Gets the library by ID (NOT library number)""" + return self.get_by_id("library", library_id) def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): + """ + Main function for this script. + Given paths to the sample_config file, processing_dict, and outdir, + upload to LIMS: + 1) Paths for fastq files for each lane + 2) Stats for each alignment + """ # (Filepath, purpose) -> [lane_ids] files_to_upload = defaultdict(list) + + # Augment processing_dict with sample_config info + processing_info = [] + for row in sample_config: + barcode_index = row["barcode_index"] + lane = int(row["lane"]) + pool_name = row["pool_name"] + sample_name = row["sample_name"] + for idx, lib in enumerate(processing_dict["libraries"]): + if int(lib["lane"]) == lane and lib["barcode_index"] == barcode_index: + lib.update({"pool_name": pool_name, "sample_name": sample_name}) + processing_info.append(lib) + + # TODO: Doesn't yet make use of the above augmented info for row in sample_config: - idx = row['barcode_index'] - lane = int(row['lane']) - name = row['name'] + idx = row["barcode_index"] + lane = int(row["lane"]) + name = row["pool_name"] # Get lane IDs for each file lane_ids = [ - l['id'] - for l in processing_dict['libraries'] - if l['barcode1']['reverse_sequence'] == idx and int(l['lane']) == lane - ] - r1_file = os.path.join(outdir, "%s_R1.fq.gz" % name) - r2_file = os.path.join(outdir, "%s_R2.fq.gz" % name) + l["id"] + for l in processing_dict["libraries"] + if l["barcode1"]["reverse_sequence"] == idx and int(l["lane"]) == lane + ] + r1_file = os.path.join(outdir, name, "R1.fq.gz") + r2_file = os.path.join(outdir, name, "R2.fq.gz") if not os.path.exists(r1_file): raise Exception("No file %s" % r1_file) if not os.path.exists(r2_file): @@ -343,47 +496,65 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): files_to_upload[(r1_file, "r1-fastq")].extend(lane_ids) files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) + # Upload files. 
for ((path, purpose), lane_ids) in files_to_upload.items(): - print(path, purpose, len(lane_ids)) - - self.upload_file(path, - "SequencingData.flowcelllane", - lane_ids, - file_purpose=purpose, - file_type="fastq", - skip_md5=True) - - - -def main(args = sys.argv): + # print(path, purpose, len(lane_ids)) + self.upload_file( + path, + "SequencingData.flowcelllane", + lane_ids, + file_purpose=purpose, + file_type="fastq", + ) + + # Now upload counts. + # We can do this all as one call. + # (Assuming LIMS doesn't time out) + all_counts = [] + for lib in processing_info: + if not len(lib["alignments"]) == 1: + LOG.critical("Lib must have exactly 1 aligment %s", lib) + align_id = lib["alignments"][0]["id"] + counts_file = os.path.join( + outdir, + lib["pool_name"], + "analysis", + "Gene", + "%s.stats.txt" % lib["sample_name"], + ) + all_counts.append(build_counts(align_id, counts_file)) + # print(json.dumps(all_counts)) + self.post("stats/create/", all_counts) + + +def main(): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() if poptions.quiet: - logging.basicConfig(level=logging.WARNING, format=log_format) + logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT) elif poptions.debug: - logging.basicConfig(level=logging.DEBUG, format=log_format) + logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT) else: # Set up the default logging levels - logging.basicConfig(level=logging.INFO, format=log_format) + logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) # Make this a little less noisy by default requests_log = logging.getLogger("requests.packages.urllib3.connectionpool") requests_log.setLevel(logging.WARN) if not poptions.base_api_url and "LIMS_API_URL" in os.environ: api_url = os.environ["LIMS_API_URL"] - log.debug("Using LIMS API endpoint: %s from environment" % api_url) + LOG.debug("Using LIMS API endpoint: %s from environment", api_url) elif poptions.base_api_url: api_url = poptions.base_api_url - log.debug("Using LIMS API endpoint: %s from options" % api_url) + LOG.debug("Using LIMS API endpoint: %s from options", api_url) else: sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) - if not poptions.token and "LIMS_API_TOKEN" in os.environ: token = os.environ["LIMS_API_TOKEN"] elif poptions.token: @@ -392,17 +563,21 @@ def main(args = sys.argv): sys.stderr.write("Could not find LIMS API TOKEN.\n") sys.exit(1) - uploader = UploadLIMS(api_url, token) + uploader = UploadLIMS( + api_url, token, dry_run=poptions.dry_run, skip_md5=poptions.skip_md5 + ) with open(poptions.sample_config) as f: - sample_config = [row for row in csv.DictReader(f, delimiter="\t")] + sample_config = list(csv.DictReader(f, delimiter="\t")) with open(poptions.processing_json) as f: processing = json.loads(f.read()) - uploader.upload_altseq_flowcell(sample_config, processing, poptions.output_file_directory) + uploader.upload_altseq_flowcell( + sample_config, processing, poptions.output_file_directory + ) # This is the main body of the program that only runs when running this script -# doesn't run when imported, so you can use the functions above in the shell after importing -# without automatically running it +# doesn't run when imported, so you can use the functions above in the shell +# after importing without automatically running it if __name__ == "__main__": main() From 0ea0fa8eb04c6c53e13fb9546ea0320e5fa6030f Mon Sep 17 00:00:00 2001 From: Jemma Nelson 
Date: Sun, 8 Jan 2023 16:15:26 -0800 Subject: [PATCH 033/172] Commit some python tooling files --- pyproject.toml | 20 +++++++ scripts/pyproject.toml | 21 +++++++ scripts/requirements.pip.dev.txt | 4 ++ scripts/requirements.pip.txt.lock | 99 +++++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 pyproject.toml create mode 100644 scripts/pyproject.toml create mode 100644 scripts/requirements.pip.dev.txt create mode 100644 scripts/requirements.pip.txt.lock diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..254a3f88 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[tool.pyright] +include = ["scripts"] +exclude = ["**/__pycache__"] +#ignore = ["src/oldstuff"] +defineConstant = { STAMPIPES = "/home/nelsonjs/code/stampipes" } +#stubPath = "src/stubs" +#venv = "env367" + +reportMissingImports = true +reportMissingTypeStubs = false + +pythonVersion = "3.6" +pythonPlatform = "Linux" + +executionEnvironments = [ + #{ root = "src/web", pythonVersion = "3.5", pythonPlatform = "Windows", extraPaths = [ "src/service_libs" ] }, + #{ root = "src/sdk", pythonVersion = "3.0", extraPaths = [ "src/backend" ] }, + #{ root = "src/tests", extraPaths = ["src/tests/e2e", "src/sdk" ]}, + #{ root = "src" } +] diff --git a/scripts/pyproject.toml b/scripts/pyproject.toml new file mode 100644 index 00000000..4326ff6f --- /dev/null +++ b/scripts/pyproject.toml @@ -0,0 +1,21 @@ +[tool.black] +target-version = ['py35'] + + + +[tool.pylint.format] +max-line-length = 88 +[tool.pylint.basic] +# Allow 1 or 2 character names for variables +variable-rgx = "^[a-z][a-z0-9]*((_[a-z0-9]+)*)?$" +argument-rgx = "^[a-z][a-z0-9]*((_[a-z0-9]+)*)?$" + + +# This section requires https://pypi.org/project/Flake8-pyproject/ +[tool.flake8] +max-line-length = 88 +# Example config below +# ignore = ['E231', 'E241'] +# per-file-ignores = [ +# '__init__.py:F401', +# ] diff --git a/scripts/requirements.pip.dev.txt b/scripts/requirements.pip.dev.txt new file mode 100644 index 00000000..6005e575 --- /dev/null +++ b/scripts/requirements.pip.dev.txt @@ -0,0 +1,4 @@ +pytest # Our test runner +typing # This is only needed for python 3.5 and below + +pip<20.0.0 # Pip newer than 19.X doesn't work on python 3.5 diff --git a/scripts/requirements.pip.txt.lock b/scripts/requirements.pip.txt.lock new file mode 100644 index 00000000..2f98b6f3 --- /dev/null +++ b/scripts/requirements.pip.txt.lock @@ -0,0 +1,99 @@ +#afl-utils==1.34a0 +ansicolor==0.2.6 +appdirs==1.4.3 +astroid==2.4.2 +attrs==19.3.0 +autopep8==1.4.4 +backcall==0.1.0 +backoff==1.8.0 +#bio==0.1.0 +biopython==1.76 +bleach==3.1.0 +blessings==1.7 +bpython==0.18 +certifi==2019.3.9 +chardet==3.0.4 +Click==7.0 +colorama==0.4.1 +curtsies==0.3.0 +cutadapt==2.10 +cycler==0.10.0 +Cython==0.24 +decorator==4.4.1 +defusedxml==0.6.0 +distlib==0.3.0 +distro==1.4.0 +dnaio==0.4.2 +entrypoints==0.3 +#exploitable===1.32-rcor +filelock==3.0.12 +flake8==3.7.9 +Flask==1.0.2 +Flask-Cors==3.0.7 +gitdb2==2.0.5 +GitPython==2.1.11 +greenlet==0.4.15 +httpie==1.0.3 +idna==2.8 +importlib-metadata==0.23 +importlib-resources==1.0.2 +ipython==7.9.0 +ipython-genutils==0.2.0 +isort==4.3.21 +itsdangerous==1.1.0 +jedi==0.15.2 +Jinja2==2.10.1 +jsonschema==3.2.0 +jupyter-core==4.6.1 +lazy-object-proxy==1.4.3 +MarkupSafe==1.1.1 +mccabe==0.6.1 +mistune==0.8.4 +more-itertools==7.2.0 +nbconvert==5.6.1 +nbformat==4.4.0 +numpy==1.17.1 +pandoc-attributes==0.1.7 +pandocfilters==1.4.2 +parso==0.5.2 +pexpect==4.7.0 +pickleshare==0.7.5 +Pillow==6.0.0 +pipenv==2018.11.26 
+prompt-toolkit==2.0.10 +ptyprocess==0.6.0 +pybind11==2.2.4 +pycodestyle==2.5.0 +pyflakes==2.1.1 +Pygments==2.4.2 +pylint==2.4.4 +pyparsing==2.1.5 +PyQt5==5.12.2 +PyQt5-sip==4.19.17 +pyrsistent==0.15.5 +python-dateutil==2.6.0 +pytz==2016.6.1 +PyYAML==5.1.2 +requests==2.21.0 +responses==0.10.6 +scikit-learn==0.19.1 +six==1.13.0 +sklearn==0.0 +smmap2==2.0.5 +#stamlims-api==0.1 +testpath==0.4.4 +traitlets==4.3.3 +twitter==1.18.0 +typed-ast==1.4.0 +typing==3.7.4.1 +urllib3==1.24.3 +vim-vint==0.3.21 +virtualenv==20.0.2 +virtualenv-clone==0.5.3 +wcwidth==0.1.7 +webencodings==0.5.1 +Werkzeug==0.15.4 +wrapt==1.11.2 +xopen==0.8.4 +yapf==0.27.0 +zipp==0.6.0 From 67fad27ae83207b2783a3d469e5ce03f387da937 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 18 Jan 2023 16:37:11 -0800 Subject: [PATCH 034/172] Update altseq README with more starsolo info --- processes/altseq/README.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/processes/altseq/README.md b/processes/altseq/README.md index ecaeb6a8..ab76dda6 100644 --- a/processes/altseq/README.md +++ b/processes/altseq/README.md @@ -12,7 +12,23 @@ TODO ## Notes -We use StarSOLO, which is like CellRanger. Designed for single-cell analysis, we use it by telling the software that the barcode internal to the pool is a "cell barcode." All references by STAR and output files to "cell barcodes" actually refer to individual library barcodes. That is, each "cell barcode" corresponds to a DSnumber and LNnumber. +We use STARsolo, which is like CellRanger. Designed for single-cell analysis, we use it by telling the software that the barcode internal to the pool is a "cell barcode." All references by STAR and output files to "cell barcodes" actually refer to individual library barcodes. That is, each "cell barcode" corresponds to a DSnumber and LNnumber. + +### STARsolo output directories + +We get 4 output directories for STARSolo, each with their own way of counting results. + +`Gene/` +: counts reads that are fully concordant with gene transcript. + +`GeneFull/` +: counts all reads that overlap gene loci, including exonic and intronic reads. + +`GeneFull_Ex50pAS/` +: "Gene Full, Except 50% anti-sense." Excludes reads that map >50% in the antisense direction + +`GeneFull_ExonOverIntron` +: "GeneFull, prefer exon over intron." (This is important, for instance, for a read that overlap gene A exons and gene B introns (i.e. gene A exons are located within gene B introns). With GeneFull option, such a read will be ambiguous and not counted. With GeneFull_ExonOverIntron it will be counted towards gene A.) 
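+
+Each of these directories also contains a `CellReads.stats` file, whose columns are described in the next section. It is a plain tab-separated text table with a header row, so it can be loaded directly; a minimal sketch (the path is only an example):
+
+```python
+import csv
+
+# One CellReads.stats is written per requested --soloFeatures directory;
+# GeneFull_Ex50pAS is used here purely as an example path.
+with open("Solo.out/GeneFull_Ex50pAS/CellReads.stats") as f:
+    rows = list(csv.DictReader(f, delimiter="\t"))
+
+# Each row is keyed by the column names documented below, e.g.:
+for row in rows:
+    print(row["CB"], row["cbMatch"])
+```
+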
### CellReads.stats interpretation @@ -23,42 +39,61 @@ Column descriptions: `CB` : cell barcode + `cbMatch` : number of reads that matched the cell barcode + `cbPerfect` : number of perfect match on cell barcode + `exonic` : number of reads mapping on exonic (only for `GeneFull_Ex50pAS` and `GeneFull_ExonOverIntron`) + `intronic` : number of reads mapping on intronic (only for `GeneFull_Ex50pAS` and `GeneFull_ExonOverIntron`) + `mito` : number of reads mapping on mitochondrial genome + `genomeU` : number of reads mapping to one locus in the genome + `genomeM` : number of reads mapping to multiple loci in the genome + `featureU` : number of reads mapping to one feature (Gene, GeneFull, etc) + `featureM` : number of reads mapping to multiple features + `cbMMunique` : number of reads with cell barcodes that map with mismatches to one barcode in the passlist + `cbMMmultiple` : number of reads with cell barcodes that map with mismatches to multiple barcodes in the passlist + `exonicAS` : number of reads mapping antisense to annotated exons (only for `GeneFull_Ex50pAS`) + `intronicAS` : number of reads mapping antisense to annotated introns (only for `GeneFull_Ex50pAS`) + `countedU` : number of unique-gene reads that were used in counting UMIs (!= number of UMIs), i.e. reads with valid CB/UMI/gene + `countedM` : number of multi-gene reads that were used in counting UMIs (!= number of UMIs), i.e. reads with valid CB/UMI/gene + `nUMIunique` : total number of counted UMI + `nGenesUnique` : number of genes having non 0 counts + `nUMImulti` : number of UMI for multi-gene reads, if requested + `nGenesMulti` : number of genes supported by just multi-gene reads, if requested From 232d536c3dd5e55ba1b8060d7e317cbce6672dac Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 18 Jan 2023 16:38:51 -0800 Subject: [PATCH 035/172] altseq: drop filtering step (unnecessary) --- processes/altseq/altseq.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 84816942..a36d8f52 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -218,7 +218,6 @@ process align { --soloUMIstart 13 \ --soloUMIlen 16 \ --soloCBwhitelist "!{barcode_whitelist}" \ - --soloCellFilter EmptyDrops_CR 96 .99 10 45000 90000 100000 0.01 20000 0.01 10000 \ --quantMode "TranscriptomeSAM" \ --soloFeatures Gene GeneFull GeneFull_ExonOverIntron GeneFull_Ex50pAS \ --soloMultiMappers Unique PropUnique Uniform Rescue EM \ @@ -260,7 +259,7 @@ process analyze_solo_dir { scratch false input: - tuple val(meta), file(sample_config), path("Solo.out") + tuple val(meta), path(sample_config), path("Solo.out") output: tuple val(meta), file("output") @@ -272,7 +271,7 @@ process analyze_solo_dir { outdir=output/$dir allcountsfile=$outdir/allcounts.csv mkdir -p "$outdir" - bash matrix2csv.sh "Solo.out/$dir/filtered/" > "$allcountsfile" + bash matrix2csv.sh "Solo.out/$dir/raw/" > "$allcountsfile" cat barcode.config | while read name barcode ; do cat "$allcountsfile" \ | awk -F, -vbarcode=$barcode -vname=$name \ From d2dea5a430fc7e77ccbbfcb69ecafe9f3f389c8a Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 18 Jan 2023 16:39:48 -0800 Subject: [PATCH 036/172] third_party: update STAR to fix intronicAS count (See: https://github.com/alexdobin/STAR/issues/1733) --- third_party/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/third_party/Makefile b/third_party/Makefile index 6a166092..3e3bea51 100644 --- a/third_party/Makefile +++ 
b/third_party/Makefile @@ -1,6 +1,9 @@ all : STAR STAR: - wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220601/STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip - unzip STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip - rm STAR_2.7.10a_alpha_220601_Linux_x86_64_static.zip + wget https://github.com/alexdobin/STAR/releases/download/2.7.10b_alpha_220111/STAR_2.7.10b_alpha_230111_Linux_x86_64_static.zip + unzip STAR_2.7.10b_alpha_230111_Linux_x86_64_static.zip + rm STAR_2.7.10b_alpha_230111_Linux_x86_64_static.zip + +clean: + -rm STAR From b5f01f3b42d865041ea1ea985b041a34342f80a6 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 22 Jan 2023 13:02:02 -0800 Subject: [PATCH 037/172] fix: don't duplicate aggs for processing --- scripts/aggregateprocess.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index f235d734..94c3d1ad 100644 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -391,9 +391,12 @@ def setup_flowcell(self, flowcell_label): self.setup_aggregations([a['id'] for a in aggregations]) def setup_aggregations(self, aggregation_ids): + # Deduplicate aggregations so we don't write the same one out twice + aggregation_ids = sorted(set(aggregation_ids)) + # The pool will "eat" exceptions, banishing them to the hopeless void - # This will log them instead, while not stopping other aggregations - # from setting up successfully + # This helper function will log them instead, while not stopping other + # aggregations from setting up successfully def try_setup(agg_id): try: self.setup_aggregation(agg_id) From 3771197fb4445b4283b2be9ebabc0b9900df8f97 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 13 Feb 2023 11:45:22 -0800 Subject: [PATCH 038/172] Altseq: Generate per-flowcell and per-pool stats --- processes/altseq/altseq.nf | 32 ++++- processes/altseq/bin/generate_counts_json.py | 132 +++++++++++++++++++ processes/altseq/nextflow.config | 12 +- processes/altseq/process_altseq.bash | 4 +- 4 files changed, 172 insertions(+), 8 deletions(-) create mode 100755 processes/altseq/bin/generate_counts_json.py diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index a36d8f52..1ab1d4a4 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -82,7 +82,7 @@ workflow ALTSEQ { // We don't want to do further processing on Undetermined samples it[0][0] != "Undetermined" } - // Now we group it together by pool name, lane, and read + // Now we group it together by pool name, lane, and read | groupTuple | map { readname, files -> [ @@ -118,7 +118,7 @@ workflow ALTSEQ { merged_fq_files, ) - // "Analyze" the results + // Gather statistics and summarize the results // First, we pair up the analysis with the expected list of samples // (This key will help us decode pool/barcode -> sample) @@ -133,8 +133,15 @@ workflow ALTSEQ { | map { key, meta, solodir, config -> [meta, config, solodir] } | set {to_analyze} + // Create pool-level stats and files analyze_solo_dir(to_analyze) + // Create flowcell stats + analyze_solo_dir.out.json_counts + | map {meta, counts -> counts} + | toSortedList + | merge_stats + // Sort the cram files align.out.aligned_bam | map { [ @@ -151,9 +158,9 @@ workflow ALTSEQ { } // Debugging section - use `nextflow run -dump-channels` to write channel contents to terminal - merged_fq_files.dump(tag: "merged_fq_files", pretty: true) - per_pool_sample_configs.dump(tag: "per_pool", pretty: true) - to_analyze.dump(tag: 
"to_analyze", pretty: true) + //merged_fq_files.dump(tag: "merged_fq_files") + //per_pool_sample_configs.dump(tag: "per_pool") + //to_analyze.dump(tag: "to_analyze") } workflow { @@ -263,6 +270,7 @@ process analyze_solo_dir { output: tuple val(meta), file("output") + tuple val(meta), file("output/counts.json"), emit: "json_counts" shell: ''' @@ -280,6 +288,7 @@ process analyze_solo_dir { done analyze.py "Solo.out/$dir/CellReads.stats" "barcode.config" "$outdir" done + generate_counts_json.py "Solo.out/GeneFull_Ex50pAS" "!{sample_config}" "!{meta.name}" > output/counts.json ''' } @@ -300,3 +309,16 @@ process create_sample_configs { 'NR > 1 { print $2 "\t" $4 > "configs/" $1 "_lane" $3 ".config"}' ''' } + +process merge_stats { + scratch false + executor "local" + input: + path("*.input.json") + output: + path("flowcell_stats.json") + shell: + ''' + jq --slurp --sort-keys -c . *.input.json > flowcell_stats.json + ''' +} diff --git a/processes/altseq/bin/generate_counts_json.py b/processes/altseq/bin/generate_counts_json.py new file mode 100755 index 00000000..f92ff012 --- /dev/null +++ b/processes/altseq/bin/generate_counts_json.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import os +import pathlib +# import pprint +import json + +def parse_args(): + parser = argparse.ArgumentParser( + prog="generate_counts_json.py", + description="Parses CellRanger-style output and produces a JSON file we can upload to LIMS", + ) + parser.add_argument("cellranger_directory") + parser.add_argument("barcode_config_file") + parser.add_argument("pool_name") + return parser.parse_args() + +def parse_tsv(filename): + """Parses a TSV with header, return list of dicts""" + with open(filename) as f: + return [*csv.DictReader(f, delimiter="\t")] + +def parse_linewise_stats(filename): + """Parses a file with a name-value pair on each line, separated by whitespace""" + d = {} + with open(filename) as f: + for line in f.readlines(): + (key, value) = line.strip().split() + d[key] = value + return d + +def parse_linewise_csv_stats(filename): + """Parses a file with a name-value pair on each line, separated by comma""" + d = {} + with open(filename) as f: + for line in f.readlines(): + (key, value) = line.strip().split(",") + d[key] = value + return d + + +def parse_barcode_config(filename): + """Parse the special barcode config file we use""" + cfg = {} + with open(filename) as f: + for line in f.readlines(): + (sample_name, barcode) = line.strip().split("\t") + (_illumina_barcode, cell_barcode) = barcode.split("-") + cfg[cell_barcode] = sample_name + return cfg + +def modify_sample_info(info): + """ Rewrite the sample stats a bit """ + # Keys to delete from the table + deletes = [ + "CB", + ] + # Keys to rename + renames = [ + ("cbMatch", "total"), + ] + out = info.copy() + for to_delete in deletes: + del out[to_delete] + for old, new in renames: + out[new] = out[old] + del out[old] + return out + +def get_sample_stats(opts): + """ + Gets per-sample stats from the CellReads.stats file + """ + cfg = parse_barcode_config(opts.barcode_config_file) + cellreads_path = os.path.join(opts.cellranger_directory, "CellReads.stats") + sample_counts = parse_tsv(cellreads_path) + + sample_stats = { + #cfg.get(info['CB']): modify_sample_info(info) + info['CB']: modify_sample_info(info) + for info in sample_counts + if info['CB'] in cfg + } + #del sample_stats[None] + return sample_stats + +def get_barcode_stats(opts): + """ Gets the stats about barcode mapping """ + barcode_path = 
os.path.join(opts.cellranger_directory, "..", "Barcodes.stats") + return parse_linewise_stats(barcode_path) + +def get_summary_stats(opts): + """ Gets the Summary stats produced by StarSOLO """ + barcode_path = os.path.join(opts.cellranger_directory, "Summary.csv") + return parse_linewise_csv_stats(barcode_path) + +def get_library_pool_info(opts): + """ Gets the metadata about the library and pool """ + (flowcell, pool) = opts.pool_name.split("_") + return {"flowcell_label": flowcell, "pool": pool} + +def get_barcode_mapping(opts): + """ Returns the mapping of barcodes to sample names """ + cfg = parse_barcode_config(opts.barcode_config_file) + return cfg + +def get_all_stats(opts): + """ + Return all the stats and metadata that this script gathers + Packaged as a single dict + """ + pool_info = get_library_pool_info(opts) + return { + "barcode_mapping": get_barcode_mapping(opts), + "barcode_stats": get_barcode_stats(opts), + "summary_stats": get_summary_stats(opts), + "samples": get_sample_stats(opts), + "pool": pool_info["pool"], + "flowcell_label": pool_info["flowcell_label"], + } + + +def main(): + """ Run it all and write to stdout """ + opts = parse_args() + data = get_all_stats(opts) + print(json.dumps(data)) + +if __name__ == "__main__": + main() diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index 400cea9a..53b0c45d 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -22,6 +22,7 @@ process { mode: "link", saveAs: { f -> "${meta.name}/sorted.cram" }, ] + scratch = false } // StarSOLO @@ -31,6 +32,7 @@ process { mode: "link", saveAs: { f -> f == "Solo.out" ? "${meta.name}/Solo.out" : null } ] + scratch = false } // StarSOLO analysis @@ -42,6 +44,14 @@ process { ] module = "openssl-dev/1.0.1t" } + + // Flowcell stats + withName: "merge_stats" { + publishDir = [ + path: { params.outdir }, + mode: "link", + ] + } } profiles { @@ -59,6 +69,6 @@ profiles { // Bind in /net/seq/data2/sequencers as readonly // This is necessary for the bcl2fastq step. 
- singularity.runOptions = "--bind /net/seq/data2/sequencers/:/net/seq/data2/sequencers:ro" + singularity.runOptions = "--bind /net/seq/data2/sequencers/:/net/seq/data2/sequencers:ro,/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/:/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/:ro" } } diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 4cb9cf3a..3f696e03 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,7 +10,7 @@ done set -eo pipefail -version=1.0.0 +version=1.1.0-alpha1 cd "$(dirname "$0")" @@ -77,7 +77,7 @@ python "$STAMPIPES/scripts/altseq/upload_data.py" \ # Create sentinel/status file if [[ -e "$status_file" ]] ; then - old_date=$(jq .completed_on << "$status_file") + old_date=$(jq .completed_on <<< "$status_file") old_status_file=${status_file/json/$old_date}.json mv "$status_file" "$old_status_file" fi From 4af94097f34d7aeb852a38a43e087b668f07a981 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 13 Feb 2023 15:20:50 -0800 Subject: [PATCH 039/172] altseq/upload_data.py uploads flowcell_counts.json --- scripts/altseq/upload_data.py | 105 +++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 26 deletions(-) diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index cfd1a30b..1f1062f3 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -3,6 +3,7 @@ Uploads all the results of alt-seq processing to LIMS """ +import pprint import re import csv import argparse @@ -26,6 +27,8 @@ from stamlims_api import rest # pylint: disable=wrong-import-position,import-error +JSON_REPORT_CLASS_SLUG = "altseq-flowcell-report-starsolo" + LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" LOG = logging.getLogger("upload_data.py") @@ -243,11 +246,11 @@ def post(self, *args, **kwargs): return None return self.api.post_single_result(*args, **kwargs) - # def patch(self, *args, **kwargs): - # if self.dry_run: - # LOG.info("Dry run, would have patch %s, %s", args, kwargs) - # return None - # return self.api.patch_single_result(*args, **kwargs) + def patch(self, *args, **kwargs): + if self.dry_run: + LOG.info("Dry run, would have patch %s, %s", args, kwargs) + return None + return self.api.patch_single_result(*args, **kwargs) # def get_flowcell_url_by_label(self, label): # return self.get_single_result( @@ -452,13 +455,56 @@ def get_library(self, library_id): """Gets the library by ID (NOT library number)""" return self.get_by_id("library", library_id) + def upload_flowcell_report(self, data): + flowcell_labels = set(pool["flowcell_label"] for pool in data) + assert len(flowcell_labels) == 1 + flowcell_label = flowcell_labels.pop() + + report_name = "Alt-seq stats: FC%s" % flowcell_label + + flowcell_lims_info = self.get_single_result( + "flowcell_run/?label=%s" % flowcell_label) + content_type = flowcell_lims_info['object_content_type'] + object_id = flowcell_lims_info['id'] + json_report_class = self.get_single_result( + "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) + + # See if report already exists + existing_reports = self.get("json_report/", query={ + "object_id": object_id, + "content_type": content_type, + "report_class": json_report_class["id"], + "page_size": 2, + })["results"] + + data_to_send = { + "object_id": object_id, + "content_type": content_type, + "report_class": json_report_class["id"], + "name": report_name, + 
"json_content": data, + } + if len(existing_reports) == 0: + self.post("json_report/", data=data_to_send) + # No report exists yet, upload a new one + elif len(existing_reports) == 1: + # Exactly one report, update it + data_to_send["id"] = existing_reports[0]["id"] + self.patch("json_report/", data=data_to_send) + else: + # Error! too many reports + LOG.critical("Too many JSON reports exist") + raise "Too many JSON reports exist, exiting" + + def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): """ Main function for this script. Given paths to the sample_config file, processing_dict, and outdir, upload to LIMS: 1) Paths for fastq files for each lane - 2) Stats for each alignment + # 2) Stats for each alignment + 3) Flowcell-level pool stats """ # (Filepath, purpose) -> [lane_ids] files_to_upload = defaultdict(list) @@ -507,29 +553,36 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): file_type="fastq", ) - # Now upload counts. - # We can do this all as one call. - # (Assuming LIMS doesn't time out) - all_counts = [] - for lib in processing_info: - if not len(lib["alignments"]) == 1: - LOG.critical("Lib must have exactly 1 aligment %s", lib) - align_id = lib["alignments"][0]["id"] - counts_file = os.path.join( - outdir, - lib["pool_name"], - "analysis", - "Gene", - "%s.stats.txt" % lib["sample_name"], - ) - all_counts.append(build_counts(align_id, counts_file)) - # print(json.dumps(all_counts)) - self.post("stats/create/", all_counts) + # Commented out because we aren't making alignments for these... + # # Now upload counts. + # # We can do this all as one call. + # # (Assuming LIMS doesn't time out) + # all_counts = [] + # for lib in processing_info: + # if not len(lib["alignments"]) == 1: + # LOG.critical("Lib must have exactly 1 aligment %s", lib) + # align_id = lib["alignments"][0]["id"] + # counts_file = os.path.join( + # outdir, + # lib["pool_name"], + # "analysis", + # "Gene", + # "%s.stats.txt" % lib["sample_name"], + # ) + # all_counts.append(build_counts(align_id, counts_file)) + # # print(json.dumps(all_counts)) + # self.post("stats/create/", all_counts) + + with open(os.path.join(outdir, "flowcell_stats.json")) as json_file: + flowcell_data = json.loads(json_file.read()) + self.upload_flowcell_report(flowcell_data) def main(): - """This is the main body of the program that by default uses the arguments - from the command line.""" + """ + This is the main body of the program that uses the arguments from the + command line. 
+ """ parser = parser_setup() poptions = parser.parse_args() From 44b87eb02a1336b0b46f87969317d2c1a2aefeef Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 27 Feb 2023 11:35:52 -0800 Subject: [PATCH 040/172] Altseq - update stats-upload logic --- processes/altseq/altseq.nf | 2 +- scripts/altseq/upload_data.py | 51 +++++++++++++++++------------------ 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 1ab1d4a4..4ee7c9b3 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -122,7 +122,7 @@ workflow ALTSEQ { // First, we pair up the analysis with the expected list of samples // (This key will help us decode pool/barcode -> sample) - create_sample_configs(params.sample_config_tsv) + create_sample_configs(file(params.sample_config_tsv)) | flatten() | map { fn -> [fn.baseName, fn] } | set { per_pool_sample_configs } diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index 1f1062f3..c5238381 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -212,20 +212,20 @@ def get_single_result(self, fetch_url, query=None, field=None): return self._get_single_result(fetch_url, query, field) # Not currently used - # @lru_cache(maxsize=None) - # def _get_list_result(self, url, query=None): - # return self.api.get_list_result( - # url_addition=url, - # query_arguments=query, - # item_limit=1000000, - # page_size=1000, - # ) - # - # def get_list_result(self, url, query=None): - # if isinstance(query, dict) and not isinstance(query, HashableDict): - # query = HashableDict(query) - # LOG.debug("Query is now: %s", query) - # return self._get_list_result(url, query) + @lru_cache(maxsize=None) + def _get_list_result(self, url, query=None): + return self.api.get_list_result( + url_addition=url, + query_arguments=query, + item_limit=1000000, + page_size=1000, + ) + + def get_list_result(self, url, query=None): + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + LOG.debug("Query is now: %s", query) + return self._get_list_result(url, query) def put(self, *args, **kwargs): """ @@ -430,10 +430,6 @@ def get_file_upload_data( file_size = os.path.getsize(path) last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) - # if exists: - # recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) - - # TODO: Make time-checking work! # Current issue: sub-second precision. 
data = { "path": path, @@ -464,33 +460,34 @@ def upload_flowcell_report(self, data): flowcell_lims_info = self.get_single_result( "flowcell_run/?label=%s" % flowcell_label) - content_type = flowcell_lims_info['object_content_type'] + content_type_id = flowcell_lims_info['object_content_type'] + content_type = self.get_by_id("content_type", content_type_id) object_id = flowcell_lims_info['id'] json_report_class = self.get_single_result( "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) # See if report already exists - existing_reports = self.get("json_report/", query={ + existing_reports = self.get_list_result("json_report/", query={ "object_id": object_id, - "content_type": content_type, + "content_type": content_type["id"], "report_class": json_report_class["id"], "page_size": 2, - })["results"] + }) data_to_send = { "object_id": object_id, - "content_type": content_type, - "report_class": json_report_class["id"], + "content_type": content_type["url"], + "report_class": json_report_class["url"], "name": report_name, - "json_content": data, + "json_content": json.dumps(data), } if len(existing_reports) == 0: self.post("json_report/", data=data_to_send) # No report exists yet, upload a new one elif len(existing_reports) == 1: # Exactly one report, update it - data_to_send["id"] = existing_reports[0]["id"] - self.patch("json_report/", data=data_to_send) + url_to_patch = "json_report/%d/" % existing_reports[0]["id"] + self.patch(url_to_patch, data=data_to_send) else: # Error! too many reports LOG.critical("Too many JSON reports exist") From 3909a155c5f0b14b827c68139a6257df3ac3e249 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 27 Feb 2023 11:40:53 -0800 Subject: [PATCH 041/172] Altseq - use /net/seq/scratch for workdirs This will save us on storage/backup costs. --- processes/altseq/nextflow.config | 21 +++++++++++++++------ processes/altseq/process_altseq.bash | 14 ++++++++++---- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index 53b0c45d..0777afac 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -1,15 +1,20 @@ includeConfig "../../nextflow.config" process { + // temporary until we get /scratch/ available on these nodes + clusterOptions = "--exclude hpcz-0025,hpcz-0026,hpcz-0027,hpcz-0028" + // Configure publishing directives // Convention: All files are saved under ${params.outdir}/${meta.name} + cache = 'lenient' + // merged fastq is published (for now) withName : "merge_fq" { publishDir = [ path: { params.outdir }, - mode: "link", + mode: "copy", saveAs: { f -> f.replace("_R","/R") } // okay this feels really fragile. ] } @@ -19,8 +24,12 @@ process { withName: "sort_and_encode_cram" { publishDir = [ path: { params.outdir }, - mode: "link", - saveAs: { f -> "${meta.name}/sorted.cram" }, + mode: "copy", + saveAs: { f -> switch(f.tokenize('.').last()) { + case "cram": return "${meta.name}/sorted.cram" + case "crai": return "${meta.name}/sorted.cram.crai" + default: return null + }}, ] scratch = false } @@ -29,7 +38,7 @@ process { withName: "align" { publishDir = [ path: { params.outdir }, - mode: "link", + mode: "copy", saveAs: { f -> f == "Solo.out" ? "${meta.name}/Solo.out" : null } ] scratch = false @@ -39,7 +48,7 @@ process { withName: "analyze_solo_dir" { publishDir = [ path: { params.outdir }, - mode: "link", + mode: "copy", saveAs: { f -> f == "output" ? 
"${meta.name}/analysis" : null } ] module = "openssl-dev/1.0.1t" @@ -49,7 +58,7 @@ process { withName: "merge_stats" { publishDir = [ path: { params.outdir }, - mode: "link", + mode: "copy", ] } } diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 3f696e03..bee95683 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,7 +10,7 @@ done set -eo pipefail -version=1.1.0-alpha1 +version=1.1.0-alpha2 cd "$(dirname "$0")" @@ -45,13 +45,19 @@ sample_config=sample_config.tsv python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json --output "$sample_config" - SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt +WORKROOT=${WORKROOT:-/net/seq/scratch} +if ! [[ -d "$WORKROOT" ]] ; then + echo "WORKROOT '$WORKROOT' does not exist, using '$PWD'" + WORKROOT=$PWD +fi +WORKDIR=$WORKROOT/$USER/altseq/FC$FLOWCELL/work/ + # Run the pipeline NXF_VER=21.10.6 nextflow \ -c $STAMPIPES/nextflow.config \ @@ -60,13 +66,13 @@ NXF_VER=21.10.6 nextflow \ -ansi-log false \ -profile docker,cluster \ -resume \ + -work-dir "$WORKDIR" \ --input_directory "$SEQ_DIR" \ --sample_config_tsv "$sample_config" \ --genome_dir "$GENOME_DIR" \ --genome_fa "$GENOME_FA" \ --barcode_whitelist "$BARCODE_WHITELIST" \ - --outdir "$outdir" \ - --skip_alignment + --outdir "$outdir" # Upload fastq metadata From a4d6a4a8dafe13b2bad4142964c4204560ea52ad Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 19 Mar 2023 14:51:56 -0700 Subject: [PATCH 042/172] Fix altseq file upload --- scripts/altseq/upload_data.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index c5238381..a29e64a3 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -350,6 +350,7 @@ def upload_file( upload_data = self.get_file_upload_data( path, contenttype_name, file_purpose, file_type ) + LOG.debug("Uploading file %s, to %d objects", path, len(object_ids)) if self.skip_md5: LOG.info("Skipping md5sum") upload_data["md5sum"] = "0" @@ -360,7 +361,7 @@ def upload_file( content_type_id = re.search(r"(\d+)/?$", upload_data["content_type"]).group(1) purpose_id = re.search(r"(\d+)/?$", upload_data["purpose"]).group(1) for object_id in object_ids: - data = upload_data.update({"object_id": object_id}) + upload_data.update({"object_id": object_id}) exists = self.get_single_result( "file/", query={ @@ -371,7 +372,7 @@ def upload_file( ) if exists: - if exists == data: + if exists == upload_data: LOG.info( "No change to information for file %s, lane %d, not updating", path, @@ -382,14 +383,14 @@ def upload_file( LOG.info( "Updating information for file %s: lane %d", path, object_id ) - result = self.put(url=exists["url"], data=data) + result = self.put(url=exists["url"], data=upload_data) else: - LOG.info("Uploading information for file %s: lane %d", path, object_id) - result = self.post("file/", data=data) + LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + result = self.post("file/", data=upload_data) if not result: LOG.error("Could not upload file %s for ID %d", path, 
object_id) - LOG.debug(data) + LOG.debug(upload_data) else: LOG.debug(result) @@ -520,9 +521,10 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): # TODO: Doesn't yet make use of the above augmented info for row in sample_config: - idx = row["barcode_index"] + (idx, _otheridx) = row["barcode_index"].split("-") lane = int(row["lane"]) name = row["pool_name"] + LOG.debug("idx=%s, lane=%d, name=%s", idx, lane, name) # Get lane IDs for each file lane_ids = [ l["id"] @@ -545,7 +547,7 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): self.upload_file( path, "SequencingData.flowcelllane", - lane_ids, + list(set(lane_ids)), file_purpose=purpose, file_type="fastq", ) From 1469a04d5ba2bb266a3d984cb2d2572b3c2b2df8 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 21 Mar 2023 10:32:56 -0700 Subject: [PATCH 043/172] WIP - altseq uses CRAM compression in 1 step Still remaining TODO: Either compress or drop the Aligned.toTranscriptome.bam file Double check memory/tmp implications. --- processes/altseq/altseq.nf | 56 +++++++++++++++++++--------- processes/altseq/nextflow.config | 7 +++- processes/altseq/process_altseq.bash | 2 +- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 4ee7c9b3..dccb9e55 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -113,6 +113,7 @@ workflow ALTSEQ { // Invoke STAR Solo align( genome_dir, + genome_fa, params.star_exe, barcode_whitelist, merged_fq_files, @@ -142,19 +143,6 @@ workflow ALTSEQ { | toSortedList | merge_stats - // Sort the cram files - align.out.aligned_bam - | map { [ - [ - name: it[0].name, - id: it[0].name, - barcode_index: it[0].barcode_index, - lane: it[0].lane - ], - it[1], - genome_fa, - ] } - | sort_and_encode_cram } // Debugging section - use `nextflow run -dump-channels` to write channel contents to terminal @@ -200,21 +188,32 @@ process align { input: path genome_dir + path reference_fa path star_exe path barcode_whitelist tuple val(meta), path(fq1), path(fq2) output: - tuple val(meta), file("Aligned.out.bam"), emit: aligned_bam + tuple val(meta), file("Aligned.out.cram"), emit: aligned_cram tuple val(meta), file("Solo.out"), emit: solo_directory shell: cpus = 5 + cram_fmt_options = [ + "version=3.0", + "level=7", + "lossy_names=0", + ].join(",") + bam_fifo = "Aligned.out.bam" ''' + set -e tmpdir=$(mktemp -d) + echo "Tmpdir is: ${tmpdir}" + mkfifo "!{bam_fifo}" "./!{star_exe}" \ --genomeDir "!{genome_dir}" \ + --genomeLoad LoadAndRemove \ --readFilesIn "!{fq2}" "!{fq1}" \ --soloType CB_UMI_Simple \ --soloCellReadStats Standard \ @@ -226,7 +225,7 @@ process align { --soloUMIlen 16 \ --soloCBwhitelist "!{barcode_whitelist}" \ --quantMode "TranscriptomeSAM" \ - --soloFeatures Gene GeneFull GeneFull_ExonOverIntron GeneFull_Ex50pAS \ + --soloFeatures GeneFull_Ex50pAS \ --soloMultiMappers Unique PropUnique Uniform Rescue EM \ --readFilesCommand zcat \ --runThreadN "!{cpus}" \ @@ -234,7 +233,30 @@ process align { --outSAMattributes NH HI AS NM MD CR CY UR UY GX GN \ --outSAMunmapped Within \ --limitOutSJcollapsed 5000000 \ - --outTmpDir "$tmpdir/STARSolo" + --outTmpDir "$tmpdir/STARSolo" \ + & # Launch in background, so we can convert to cram from pipe + + samtools sort \ + --reference "!{reference_fa}" \ + --output-fmt-option "!{cram_fmt_options}" \ + --threads "!{cpus}" \ + -@ "!{cpus}" \ + -o Aligned.out.cram \ + --write-index \ + -m 2G \ + -M \ + "!{bam_fifo}" + + + wait # probably not necessary + 
rm -rf "$tmpdir" # FIXME: This doesn't get called if STAR crashes or if NF cancels + if [[ $(wc -l Solo.out/GeneFull_Ex50pAS/CellReads.stats) -le 2 ]] ; then + echo -e "CellReads.stats does not contain enough output.\n" \ + "This may be caused by running out of /tmp space.\n" \ + "Alternately, it could be that no matching barcodes were found.\n" \ + >&2 + exit 1 + fi ''' } @@ -275,7 +297,7 @@ process analyze_solo_dir { shell: ''' sed 's/[ACTGN]*-//' < '!{sample_config}' > barcode.config - for dir in Gene GeneFull GeneFull_Ex50pAS GeneFull_ExonOverIntron ; do + for dir in GeneFull_Ex50pAS ; do outdir=output/$dir allcountsfile=$outdir/allcounts.csv mkdir -p "$outdir" diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index 0777afac..1ccef2c1 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -32,6 +32,7 @@ process { }}, ] scratch = false + memory = "32G" } // StarSOLO @@ -42,6 +43,7 @@ process { saveAs: { f -> f == "Solo.out" ? "${meta.name}/Solo.out" : null } ] scratch = false + module = "samtools/1.14" } // StarSOLO analysis @@ -69,8 +71,9 @@ profiles { withName: ".*:BCL2DEMUX:bcl2fastq.*" { module = "bcl2fastq2/2.20.0.422" } - //withName: "analyze_solo_dir" { - //} + withName: ".*ALTSEQ:align.*" { + module = "samtools/1.12" + } } } singularity { diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index bee95683..7fbc0040 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,7 +10,7 @@ done set -eo pipefail -version=1.1.0-alpha2 +version=1.1.0-alpha3 cd "$(dirname "$0")" From 21d17f8b1ecb5cc46bf400cfc7ce6c1bd32c8ebf Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 21 Mar 2023 10:47:26 -0700 Subject: [PATCH 044/172] STAR v1.2: Remove anaquin from pipeline --- README.md | 1 - .../aggregation/cufflinks_featurecounts.nf | 38 +++++++++---------- .../aggregation/cufflinks_featurecounts.sh | 2 +- scripts/rna-star/aggregate/concat_metrics.sh | 19 ---------- scripts/rna-star/aggregate/reset.bash | 5 --- scripts/versions.bash | 7 +--- 6 files changed, 20 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index e60ee441..ea517519 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ Modules The current modules to load for working the pipeline are: -* `anaquin/2.0.1` * `bcl2fastq/1.8.4` * `bcl2fastq2/2.15.0.4` * `bedops/2.4.35-typical` diff --git a/processes/rna-star/aggregation/cufflinks_featurecounts.nf b/processes/rna-star/aggregation/cufflinks_featurecounts.nf index a7506ae5..0f82e94a 100644 --- a/processes/rna-star/aggregation/cufflinks_featurecounts.nf +++ b/processes/rna-star/aggregation/cufflinks_featurecounts.nf @@ -67,7 +67,7 @@ workflow RNA_AGG { kallisto(fastq, kallisto_index, sequins_iso_mix) kallisto_advanced(fastq, kallisto_index, sequins_iso_mix) - anaquin(bam_to_use, sequins_ref, kallisto_index, neat_mix_A, sequins_iso_mix) + // anaquin(bam_to_use, sequins_ref, kallisto_index, neat_mix_A, sequins_iso_mix) // QC Metrics insert_sizes(bam_to_use) @@ -276,7 +276,7 @@ process density { process cufflinks { publishDir params.outdir, mode: params.publishmode - module "cufflinks/2.2.1", "R/3.2.5", "anaquin/2.0.1" + module "cufflinks/2.2.1", "R/3.2.5" input: path input_bam @@ -285,7 +285,7 @@ process cufflinks { output: - tuple path("genes.fpkm_tracking"), path("isoforms.fpkm_tracking"), path("anaquin_cufflinks/*") + tuple path("genes.fpkm_tracking"), path("isoforms.fpkm_tracking") path "skipped.gtf" path "transcripts.gtf" 
@@ -304,11 +304,11 @@ process cufflinks { mv genes.fpkm_tracking.sort genes.fpkm_tracking mv isoforms.fpkm_tracking.sort isoforms.fpkm_tracking - # quantification with anaquin Rna Expression - anaquin RnaExpression -o anaquin_cufflinks -rmix "$sequins_iso_mix" -usequin transcripts.gtf -mix A \ - || (echo "NA" > anaquin_cufflinks/RnaExpression_genes.tsv \ - && echo "NA" > anaquin_cufflinks/RnaExpression_isoforms.tsv \ - && echo "NA" > anaquin_cufflinks/RnaExpression_summary.stats) + # # quantification with anaquin Rna Expression + # anaquin RnaExpression -o anaquin_cufflinks -rmix "$sequins_iso_mix" -usequin transcripts.gtf -mix A \ + # || (echo "NA" > anaquin_cufflinks/RnaExpression_genes.tsv \ + # && echo "NA" > anaquin_cufflinks/RnaExpression_isoforms.tsv \ + # && echo "NA" > anaquin_cufflinks/RnaExpression_summary.stats) """ } @@ -359,7 +359,7 @@ process feature_counts { process kallisto { publishDir params.outdir, mode: params.publishmode - module "kallisto/0.43.1", "anaquin/2.0.1" + module "kallisto/0.43.1" input: tuple path(r1_fq), path(r2_fq) @@ -367,7 +367,6 @@ process kallisto { path sequins_iso_mix output: - path "anaquin_kallisto/*" path "kallisto_output/*" path "kallisto.log" @@ -375,17 +374,17 @@ process kallisto { """ kallisto quant -i "${kallisto_index}" -o kallisto_output "${r1_fq}" "${r2_fq}" 2> kallisto.log - anaquin RnaExpression -o anaquin_kallisto -rmix "${sequins_iso_mix}" -usequin kallisto_output/abundance.tsv -mix A \ - || (echo "NA" > anaquin_kallisto/RnaExpression_genes.tsv \ - && echo "NA" > anaquin_kallisto/RnaExpression_isoforms.tsv \ - && echo "NA" > anaquin_kallisto/RnaExpression_summary.stats) + #anaquin RnaExpression -o anaquin_kallisto -rmix "${sequins_iso_mix}" -usequin kallisto_output/abundance.tsv -mix A \ + #|| (echo "NA" > anaquin_kallisto/RnaExpression_genes.tsv \ + # && echo "NA" > anaquin_kallisto/RnaExpression_isoforms.tsv \ + # && echo "NA" > anaquin_kallisto/RnaExpression_summary.stats) """ } process kallisto_advanced { publishDir params.outdir, mode: params.publishmode - module "kallisto/0.43.1", "anaquin/2.0.1" + module "kallisto/0.43.1" input: tuple path(r1_fq), path(r2_fq) @@ -393,7 +392,6 @@ process kallisto_advanced { path sequins_iso_mix output: - path "anaquin_kallisto_adv/*" path "kallisto_output_adv/*" path "kallisto_adv.log" @@ -401,10 +399,10 @@ process kallisto_advanced { """ kallisto quant --bias -b 100 --rf-stranded -i "${kallisto_index}" -o kallisto_output_adv "${r1_fq}" "${r2_fq}" 2> kallisto_adv.log - anaquin RnaExpression -o anaquin_kallisto_adv -rmix "${sequins_iso_mix}" -usequin kallisto_output_adv/abundance.tsv -mix A \ - || (echo "NA" > anaquin_kallisto_adv/RnaExpression_genes.tsv \ - && echo "NA" > anaquin_kallisto_adv/RnaExpression_isoforms.tsv \ - && echo "NA" > anaquin_kallisto_adv/RnaExpression_summary.stats) + # anaquin RnaExpression -o anaquin_kallisto_adv -rmix "${sequins_iso_mix}" -usequin kallisto_output_adv/abundance.tsv -mix A \ + # || (echo "NA" > anaquin_kallisto_adv/RnaExpression_genes.tsv \ + # && echo "NA" > anaquin_kallisto_adv/RnaExpression_isoforms.tsv \ + # && echo "NA" > anaquin_kallisto_adv/RnaExpression_summary.stats) """ } diff --git a/processes/rna-star/aggregation/cufflinks_featurecounts.sh b/processes/rna-star/aggregation/cufflinks_featurecounts.sh index d5fcb22d..d104dd16 100644 --- a/processes/rna-star/aggregation/cufflinks_featurecounts.sh +++ b/processes/rna-star/aggregation/cufflinks_featurecounts.sh @@ -1,4 +1,4 @@ -VERSION=1.1 +VERSION=1.2 OUT_DIR=output_$VERSION mkdir -p "$OUT_DIR" diff 
--git a/scripts/rna-star/aggregate/concat_metrics.sh b/scripts/rna-star/aggregate/concat_metrics.sh index f6ce9d90..97c74a33 100644 --- a/scripts/rna-star/aggregate/concat_metrics.sh +++ b/scripts/rna-star/aggregate/concat_metrics.sh @@ -18,23 +18,4 @@ rm -f metrics.info cat ribosomal_counts.info | grep 'ribosomal' cat adapter_counts.info | grep 'adapter' - if [ -s anaquin_star/RnaAlign_summary.stats.info ]; then - cat anaquin_star/RnaAlign_summary.stats.info | grep 'sequins-dilution' - cat anaquin_star/RnaAlign_summary.stats.info | grep 'sequins-base-level-sensitivity' - cat anaquin_star/RnaAlign_summary.stats.info | grep 'sequins-base-level-precision' - fi - - if [ -s anaquin_subsample/anaquin_kallisto/RnaExpression_isoforms.neatmix.tsv.info ]; then - cat anaquin_subsample/anaquin_kallisto/RnaExpression_isoforms.neatmix.tsv.info | grep 'neat-mixA-mean-spearman' - fi - - if [ -s anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info ]; then - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-isoforms-log2-pearson-cor' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-genes-slope' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-genes-log2-pearson-cor' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-percent-isoforms-found' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-percent-genes-found' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-detection-sensitivity-isoforms' - cat anaquin_subsample/anaquin_kallisto/RnaExpression_summary.stats.info | grep 'sequins-detection-sensitivity-genes' - fi } > metrics.info diff --git a/scripts/rna-star/aggregate/reset.bash b/scripts/rna-star/aggregate/reset.bash index 94e2d6bb..43393614 100644 --- a/scripts/rna-star/aggregate/reset.bash +++ b/scripts/rna-star/aggregate/reset.bash @@ -44,12 +44,7 @@ files=( \ ) dirs=( \ - "anaquin_cufflinks" \ - "anaquin_kallisto" \ - "anaquin_kallisto_adv" \ - "anaquin_star" \ "kallisto_output" \ - "anaquin_subsample" \ "kallisto_output_adv" \ ) diff --git a/scripts/versions.bash b/scripts/versions.bash index 8de23c6e..1c70a2e4 100644 --- a/scripts/versions.bash +++ b/scripts/versions.bash @@ -88,12 +88,7 @@ echo "R:" R --version | grep "R version" fi -if [[ `command -v anaquin` ]]; then -echo "Anaquin:" -anaquin | grep 'Version' -fi - if [[ `command -v tabix` ]]; then echo "Tabix/BGZIP:" which tabix | sed -e 's/.*htslib\///g' | sed -e 's/\/bin\/tabix//g' -fi \ No newline at end of file +fi From 995a276877eb77cb27f2ce30c37d3490bee31035 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 28 Mar 2023 11:04:40 -0700 Subject: [PATCH 045/172] Fix up cram sorting, add starsolo Log publishing --- processes/altseq/altseq.nf | 9 +++++---- processes/altseq/nextflow.config | 10 +++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index dccb9e55..4248f2fc 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -195,8 +195,9 @@ process align { output: - tuple val(meta), file("Aligned.out.cram"), emit: aligned_cram + tuple val(meta), file("Aligned.out.cram"), file("Aligned.out.cram.crai"), emit: aligned_cram tuple val(meta), file("Solo.out"), emit: solo_directory + tuple val(meta), file("Log.out"), file("Log.final.out"), emit: logs shell: cpus = 5 @@ -224,13 +225,13 @@ process align { 
--soloUMIstart 13 \ --soloUMIlen 16 \ --soloCBwhitelist "!{barcode_whitelist}" \ - --quantMode "TranscriptomeSAM" \ --soloFeatures GeneFull_Ex50pAS \ - --soloMultiMappers Unique PropUnique Uniform Rescue EM \ + --soloMultiMappers Unique PropUnique Uniform Rescue EM \ --readFilesCommand zcat \ --runThreadN "!{cpus}" \ --outSAMtype BAM Unsorted \ - --outSAMattributes NH HI AS NM MD CR CY UR UY GX GN \ + --outBAMcompression 0 \ + --outSAMattributes NH HI AS NM MD CR CY UR UY GX GN \ --outSAMunmapped Within \ --limitOutSJcollapsed 5000000 \ --outTmpDir "$tmpdir/STARSolo" \ diff --git a/processes/altseq/nextflow.config b/processes/altseq/nextflow.config index 1ccef2c1..7644b9ab 100644 --- a/processes/altseq/nextflow.config +++ b/processes/altseq/nextflow.config @@ -40,7 +40,15 @@ process { publishDir = [ path: { params.outdir }, mode: "copy", - saveAs: { f -> f == "Solo.out" ? "${meta.name}/Solo.out" : null } + saveAs: { f -> + // Nested ternary is ugly, think of it like a switch statement. + f == "Solo.out" ? "${meta.name}/Solo.out" : ( + f.startsWith("Log") ? "${meta.name}/logs/STARsolo/${f}" : ( + f.endsWith("cram") ? "${meta.name}/sorted.cram" : ( + f.endsWith("crai") ? "${meta.name}/sorted.cram.crai" : + null + ))) + } ] scratch = false module = "samtools/1.14" From 1dd04f161bc84e5b722d366402fc612db197da31 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 29 Mar 2023 16:31:02 -0700 Subject: [PATCH 046/172] STAR uploads the right all-alignments-bam file --- scripts/rna-star/aggregate/attachfiles.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rna-star/aggregate/attachfiles.sh b/scripts/rna-star/aggregate/attachfiles.sh index 4e6aa25a..1f137788 100644 --- a/scripts/rna-star/aggregate/attachfiles.sh +++ b/scripts/rna-star/aggregate/attachfiles.sh @@ -25,7 +25,7 @@ function attach_agg_file () { attach_aggregation --attach_directory "$PWD/.." 
--attach_file_purpose aggregation-directory # alignments -attach_agg_file merged.genome.cram all-alignments-bam cram +attach_agg_file dupsmarked.cram all-alignments-bam cram attach_agg_file merged.transcriptome.cram transcriptome-alignments cram # cufflinks From c58d0ccb09b172a96ca550d71ddfe8d6a7f4a2fb Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 3 Apr 2023 11:31:44 -0700 Subject: [PATCH 047/172] Altseq 1.1.0 alpha4 - fix ref file, publish `Gene` Use slightly different reference file calculate counts two ways: `Gene` for downstream scripts `GeneFull_Exp50AS` for stats --- processes/altseq/altseq.nf | 2 +- processes/altseq/process_altseq.bash | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 4248f2fc..e4f75895 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -225,7 +225,7 @@ process align { --soloUMIstart 13 \ --soloUMIlen 16 \ --soloCBwhitelist "!{barcode_whitelist}" \ - --soloFeatures GeneFull_Ex50pAS \ + --soloFeatures Gene GeneFull_Ex50pAS \ --soloMultiMappers Unique PropUnique Uniform Rescue EM \ --readFilesCommand zcat \ --runThreadN "!{cpus}" \ diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 7fbc0040..d463e020 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,7 +10,7 @@ done set -eo pipefail -version=1.1.0-alpha3 +version=1.1.0-alpha4 cd "$(dirname "$0")" @@ -47,8 +47,10 @@ python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) -GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ -GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified +#GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ +#GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified +GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome/ +GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/refdata-gex-GRCh38-2020-A/fasta/genome.fa BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt WORKROOT=${WORKROOT:-/net/seq/scratch} @@ -64,7 +66,7 @@ NXF_VER=21.10.6 nextflow \ run "$STAMPIPES"/processes/altseq/altseq.nf \ -with-trace \ -ansi-log false \ - -profile docker,cluster \ + -profile cluster \ -resume \ -work-dir "$WORKDIR" \ --input_directory "$SEQ_DIR" \ From 173ee8bfc39f6bdbeefcaed77e920d31eefffd00 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 20 Apr 2023 16:41:22 -0700 Subject: [PATCH 048/172] Alt-seq: fix for flowcells with 1 single pool --- processes/altseq/altseq.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index 4248f2fc..46fa429f 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -337,11 +337,11 @@ process merge_stats { scratch false executor "local" input: - path("*.input.json") + path("input.???.json") output: path("flowcell_stats.json") shell: ''' - jq --slurp --sort-keys -c . *.input.json > flowcell_stats.json + jq --slurp --sort-keys -c . 
input.*.json > flowcell_stats.json ''' } From a15e3ce7da52ef4726de21fa92ebd81a7301d039 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 May 2023 11:24:29 -0700 Subject: [PATCH 049/172] Update docker container path for dnase alns This will let it pull the deps down, rather than require a local build. --- processes/bwa/nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/bwa/nextflow.config b/processes/bwa/nextflow.config index b3f701c4..24526f6c 100644 --- a/processes/bwa/nextflow.config +++ b/processes/bwa/nextflow.config @@ -2,7 +2,7 @@ includeConfig "../../nextflow.config" profiles { docker { - process.container = 'stampipes:latest' + process.container = 'fwip/stampipes:latest' } modules { From 1794ab4ca3ab0c50b3216a2a6b5c3c7c7eef5eeb Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 May 2023 15:06:01 -0700 Subject: [PATCH 050/172] Fix docker config for dnase agg tests Bunch of little changes: Update samtools to 1.12 (mirroring the modules change made a while ago) Fix preseq checkout Require gawk Standardize \$STAMPIPES reference More tightly specify pip deps Add partial hotspot index --- Dockerfile | 13 +++++++------ processes/bwa/aggregate/basic.nf | 4 ++-- processes/bwa/aggregate/nextflow.config | 3 ++- requirements.pip.txt | 22 ++++++++++++++++++---- test_data/ref/chr22.fa.fa | 1 + test_data/ref/chr22.hotspot_index.starch | 3 +++ 6 files changed, 33 insertions(+), 13 deletions(-) create mode 120000 test_data/ref/chr22.fa.fa create mode 100644 test_data/ref/chr22.hotspot_index.starch diff --git a/Dockerfile b/Dockerfile index 50b77608..97df0c5a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,14 +65,15 @@ RUN apt-get install -y \ g++ \ git \ libbz2-dev \ + libcurl4-openssl-dev \ liblzma-dev \ make \ ncurses-dev \ wget \ zlib1g-dev -RUN wget --quiet https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 \ - && tar xf samtools-1.7.tar.bz2 \ - && cd samtools-1.7 \ +RUN wget --quiet https://github.com/samtools/samtools/releases/download/1.12/samtools-1.12.tar.bz2 \ + && tar xf samtools-1.12.tar.bz2 \ + && cd samtools-1.12 \ && make install ##################### @@ -151,9 +152,8 @@ RUN wget --quiet https://github.com/arq5x/bedtools2/releases/download/v2.25.0/be from build-base as build-preseq RUN apt-get install -y \ libgsl-dev -RUN git clone --recurse-submodules https://github.com/smithlabcode/preseq.git \ +RUN git clone --recurse-submodules -b v2.0.1 --single-branch https://github.com/smithlabcode/preseq.git \ && cd preseq \ - && git checkout v2.0.1 \ && make @@ -168,8 +168,9 @@ RUN apt-get install -y \ bc \ bowtie \ build-essential \ - libboost-dev \ coreutils \ + gawk \ + libboost-dev \ libgsl-dev \ littler \ openjdk-8-jre \ diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index b91538e5..391dbaba 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -472,7 +472,7 @@ process multimapping_density { > mm_density.starch # Bigwig - "/home/solexa/stampipes/scripts/bwa/starch_to_bigwig.bash" \ + "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ mm_density.starch \ mm_density.bw \ "!{fai}" \ @@ -495,7 +495,7 @@ process multimapping_density { print $1 "\t" $2 "\t" $3 "\t" $4 "\t" n }' \ | starch - > normalized.mm_density.starch - "$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ + "\$STAMPIPES/scripts/bwa/starch_to_bigwig.bash" \ normalized.mm_density.starch \ normalized.mm_density.bw \ "!{fai}" \ diff --git a/processes/bwa/aggregate/nextflow.config 
b/processes/bwa/aggregate/nextflow.config index 5b77a372..0a5986a1 100644 --- a/processes/bwa/aggregate/nextflow.config +++ b/processes/bwa/aggregate/nextflow.config @@ -14,6 +14,7 @@ profiles { enabled = true fixOwnership = true temp = 'auto' + runOptions = '-e "STAMPIPES"' } process { container = 'fwip/stampipes:latest' @@ -43,7 +44,7 @@ profiles { chrom_sizes = "$baseDir/../../../data/hotspot2/chr22.chrom_sizes.bed" centers = "$baseDir/../../../data/hotspot2/chr22.K36.center_sites.n100.starch" chrom_bucket = "$baseDir/../../../data/densities/chrom-buckets.chr22.75_20.bed.starch" - hotspot_index = "/net/seq/data/dhs_indexes/WM20180608b/master_index.bed.starch" + hotspot_index = "$baseDir/../../../test_data/ref/chr22.hotspot_index.starch" bias = "$baseDir/../../../data/footprints/vierstra_et_al.txt" chunksize = 5000 domotifs = true diff --git a/requirements.pip.txt b/requirements.pip.txt index 9abfb4d0..7279160f 100644 --- a/requirements.pip.txt +++ b/requirements.pip.txt @@ -1,4 +1,18 @@ -numpy -pysam -scipy -sklearn +asn1crypto==0.24.0 +cryptography==2.1.4 +enum34==1.1.6 +idna==2.6 +ipaddress==1.0.17 +keyring==10.6.0 +keyrings.alt==3.0 +numpy==1.14.5 +pycrypto==2.6.1 +pygobject==3.26.1 +pysam==0.14.1 +pyxdg==0.25 +scikit-learn==0.19.1 +scipy==1.1.0 +SecretStorage==2.3.1 +six==1.11.0 +sklearn==0.0 + diff --git a/test_data/ref/chr22.fa.fa b/test_data/ref/chr22.fa.fa new file mode 120000 index 00000000..2bb8f06f --- /dev/null +++ b/test_data/ref/chr22.fa.fa @@ -0,0 +1 @@ +chr22.fa \ No newline at end of file diff --git a/test_data/ref/chr22.hotspot_index.starch b/test_data/ref/chr22.hotspot_index.starch new file mode 100644 index 00000000..e3a93ce6 --- /dev/null +++ b/test_data/ref/chr22.hotspot_index.starch @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfb34afebde02e4378a9ff8cdafc0c72be41c6d556e1d73936079009c543a6e +size 2490765 From 08e2c366ecdc00e3aed8fa6b3ed3118c439ce8f6 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 22 May 2023 10:14:56 -0700 Subject: [PATCH 051/172] fix: add preseq to modules for bwa aggregation --- processes/bwa/aggregate/nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/bwa/aggregate/nextflow.config b/processes/bwa/aggregate/nextflow.config index 0a5986a1..c5694d45 100644 --- a/processes/bwa/aggregate/nextflow.config +++ b/processes/bwa/aggregate/nextflow.config @@ -27,7 +27,7 @@ profiles { modules { process { withLabel: "modules" { - module = "bedops/2.4.35-typical:samtools/1.3:modwt/1.0:kentutil/302:hotspot2/2.1.1:jdk/1.8.0_92:gcc/4.7.2:R/3.2.5:picard/2.8.1:git/2.3.3:coreutils/8.25:bedtools/2.25.0:python/3.5.1:pysam/0.9.0:htslib/1.6.0:numpy/1.11.0:atlas-lapack/3.10.2:scipy/1.0.0:scikit-learn/0.18.1" + module = "bedops/2.4.35-typical:samtools/1.3:modwt/1.0:kentutil/302:hotspot2/2.1.1:jdk/1.8.0_92:gcc/4.7.2:R/3.2.5:picard/2.8.1:git/2.3.3:coreutils/8.25:bedtools/2.25.0:python/3.5.1:pysam/0.9.0:htslib/1.6.0:numpy/1.11.0:atlas-lapack/3.10.2:scipy/1.0.0:scikit-learn/0.18.1:preseq/2.0.3:gsl/2.4" } withLabel: "macs2" { module = "python/2.7.11:numpy:MACS" From f4b438dbead8c5fc6f854a622dc76f9d4110bb92 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 31 May 2023 10:28:58 -0700 Subject: [PATCH 052/172] script to create barcode report from demux stats This isn't a great solution as-is, but gives us an alternative when bcl_barcode_count doesn't work. 
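A rough usage sketch (the run folder and bases mask below are placeholders taken
from the script's own help text; substitute the real values for the flowcell):
the script reads a bcl2fastq Stats.json and writes a barcode report as JSON on
stdout, for example

    python3 scripts/flowcells/barcode_count_from_stats_file.py \
        --stats fastq/Stats/Stats.json \
        --basedir /net/seq/data/sequencers/DATE_A#####_####_FLOWCELL_LABEL \
        --mask y151,i8,i8,y151 \
        > barcodes.json

The intent is for that report to stand in for bcl_barcode_count output, e.g. for
upload with scripts/lims/upload_data.py --barcode_report, though that hand-off is
not wired up in this commit.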
--- .../barcode_count_from_stats_file.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 scripts/flowcells/barcode_count_from_stats_file.py diff --git a/scripts/flowcells/barcode_count_from_stats_file.py b/scripts/flowcells/barcode_count_from_stats_file.py new file mode 100644 index 00000000..00e2933c --- /dev/null +++ b/scripts/flowcells/barcode_count_from_stats_file.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +import os, sys, re +import json +import argparse +import logging + +default_options = { + "min_count": 1000000, +} + +def parser_setup(): + parser = argparse.ArgumentParser() + # Optional + parser.add_argument("-c", "--min-count", type=int, dest="min_count", + help="The minimum number of reads to report") + # Mandatory + parser.add_argument("-s", "--stats", dest="stats_file", + required=True, + help="The JSON file to read stats from. Generally fastq/Stats/Stats.json") + parser.add_argument("-b", "--basedir", dest="base_dir", + required=True, + help="The base directory, like /net/seq/data/sequencers/DATE_A#####_####_FLOWCELL_LABEL") + parser.add_argument("-m", "--mask", dest="mask", + required=True, + help="The barcode mask, like y151,i8,i8,y151") + + parser.set_defaults( **default_options ) + return parser + +def main(): + parser = parser_setup() + poptions = parser.parse_args() + odata = { + "Lanes": [], + "Mask": poptions.mask, + "Sequencer": "NovaSeq", + "BaseDir": poptions.base_dir, + } + with open(poptions.stats_file) as f: + idata = json.load(f) + + for lane in idata["UnknownBarcodes"]: + olane = { + "LaneIndex": lane["Lane"], + "Total": None, + "Pass": None, + "Counts": { + bc.replace("+",""): { "Total": count, "Pass": count } + for (bc, count) in lane["Barcodes"].items() + if count > poptions.min_count + }, + } + olane["Total"] = sum(lane["Barcodes"].values()) + olane["Pass"] = olane["Total"] + + odata["Lanes"].append(olane) + + for conversion_result in idata["ConversionResults"]: + lane_num = conversion_result["LaneNumber"] + for sample_info in conversion_result["DemuxResults"]: + for metric_info in sample_info["IndexMetrics"]: + # Get matching count + barcode = metric_info["IndexSequence"].replace("+","") + count = metric_info["MismatchCounts"]["0"] + # Update out_data + odata["Lanes"][lane_num-1]["Counts"][barcode] = {"Total": count, "Pass": count} + odata["Lanes"][lane_num-1]["Total"] += count + odata["Lanes"][lane_num-1]["Pass"] += count + + + print(json.dumps(odata)) + +if __name__ == "__main__": + main() From 3ed32eeeb4e5acacbf813e9ca812aefffe935c2f Mon Sep 17 00:00:00 2001 From: Mark Frerker Date: Thu, 1 Jun 2023 17:06:25 -0700 Subject: [PATCH 053/172] (add) process for creating bigbeds for peak point1per files --- nextflow.config | 1 + processes/bwa/aggregate/basic.nf | 20 +++++++++++++++++++ .../aggregate/basic/attachfiles_nextflow.bash | 15 +++++++------- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/nextflow.config b/nextflow.config index 72e0de28..be94ea39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -99,4 +99,5 @@ timeline { dag { enabled = true file = "dag.html" + overwrite = true } diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index 391dbaba..50c4646a 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -910,3 +910,23 @@ process cram { -o "${cramfile}" """ } + +process starch_to_bigbed { + publishDir "${params.outdir}" + + input: + file starch_in from onepercent_peaks + + output: + file('peaks/nuclear.peaks.fdr0.001.bb') + + script: + 
outfile = starch_in.name.replace("starch", "bb") + chrom_sizes="/net/seq/data/genomes/human/GRCh38/noalts-sequins/GRCh38_no_alts.chrom_sizes" + """ + mkdir -p peaks + unstarch "${starch_in}" | cut -f1-4 > temp.bed + bedToBigBed temp.bed "$chrom_sizes" "peaks/$outfile" + rm temp.bed + """ +} diff --git a/scripts/bwa/aggregate/basic/attachfiles_nextflow.bash b/scripts/bwa/aggregate/basic/attachfiles_nextflow.bash index 190f4f3f..cd2944b3 100644 --- a/scripts/bwa/aggregate/basic/attachfiles_nextflow.bash +++ b/scripts/bwa/aggregate/basic/attachfiles_nextflow.bash @@ -47,13 +47,14 @@ attach_file cutcounts.bed.bgz cutcounts-tabix-bgz bgz if [[ "$PEAK_CALLER" == hotspot2 ]] ; then # hotspot2 output - attach_file $PEAKS_PREFIX.allcalls.starch hotspot-per-base starch - attach_file $PEAKS_PREFIX.hotspots.fdr0.05.starch hotspot-calls starch - attach_file $PEAKS_PREFIX.hotspots.fdr0.01.starch hotspot-calls-1per starch - attach_file $PEAKS_PREFIX.hotspots.fdr0.001.starch hotspot-calls-point1per starch - attach_file $PEAKS_PREFIX.peaks.fdr0.05.starch hotspot-peaks starch - attach_file $PEAKS_PREFIX.peaks.fdr0.01.starch hotspot-peaks-1per starch - attach_file $PEAKS_PREFIX.peaks.fdr0.001.starch hotspot-peaks-point1per starch + attach_file $PEAKS_PREFIX.allcalls.starch hotspot-per-base starch + attach_file $PEAKS_PREFIX.hotspots.fdr0.05.starch hotspot-calls starch + attach_file $PEAKS_PREFIX.hotspots.fdr0.01.starch hotspot-calls-1per starch + attach_file $PEAKS_PREFIX.hotspots.fdr0.001.starch hotspot-calls-point1per starch + attach_file $PEAKS_PREFIX.peaks.fdr0.05.starch hotspot-peaks starch + attach_file $PEAKS_PREFIX.peaks.fdr0.01.starch hotspot-peaks-1per starch + attach_file $PEAKS_PREFIX.peaks.fdr0.001.starch hotspot-peaks-point1per starch + attach_file $PEAKS_PREFIX.peaks.fdr0.001.bb hotspot-peaks-point1per-bigbed bigbed fi #TODO: We're effectively generating these twice, simplify From ad8f4d512199936d9bd08048e93d4acc892965ba Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 4 Jun 2023 12:51:30 -0700 Subject: [PATCH 054/172] starch_to_bigbed - add 'modules' label This makes sure we pull in the modules at runtime, rather than relying on having it accessible in environment. --- processes/bwa/aggregate/basic.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index 50c4646a..ba2b55c6 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -913,6 +913,7 @@ process cram { process starch_to_bigbed { publishDir "${params.outdir}" + label "modules" input: file starch_in from onepercent_peaks From 8adb188004243cdd254f338f4884f805d59f5bab Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 4 Jun 2023 12:52:41 -0700 Subject: [PATCH 055/172] starch_to_bigbed - create chrom_sizes from param This file is genome-dependent, so we need a mechanism for specifying it at runtime. We already supply the chrom_sizes.bed file with params.chrom_sizes, so we simply strip the middle '0' column from that file to create the chrom_sizes file in the format that kentUtils expects. 
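To make the conversion concrete (the chr22 numbers below are illustrative only),
a BED-style line from the params.chrom_sizes input such as

    chr22    0    50818468

is reduced to the two-column chrom.sizes layout that bedToBigBed expects,

    chr22    50818468

by the plain `cut -f1,3` now used inside the starch_to_bigbed process.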
--- processes/bwa/aggregate/basic.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/processes/bwa/aggregate/basic.nf b/processes/bwa/aggregate/basic.nf index ba2b55c6..7003fbbf 100644 --- a/processes/bwa/aggregate/basic.nf +++ b/processes/bwa/aggregate/basic.nf @@ -917,17 +917,18 @@ process starch_to_bigbed { input: file starch_in from onepercent_peaks + file chrom_sizes_bed from file(params.chrom_sizes) output: file('peaks/nuclear.peaks.fdr0.001.bb') script: outfile = starch_in.name.replace("starch", "bb") - chrom_sizes="/net/seq/data/genomes/human/GRCh38/noalts-sequins/GRCh38_no_alts.chrom_sizes" """ + cut -f1,3 "$chrom_sizes_bed" > chrom_sizes mkdir -p peaks unstarch "${starch_in}" | cut -f1-4 > temp.bed - bedToBigBed temp.bed "$chrom_sizes" "peaks/$outfile" + bedToBigBed temp.bed chrom_sizes "peaks/$outfile" rm temp.bed """ } From 78a5c2ead70cf05fcc52c4d03f4dd530dd2d1d53 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 4 Jun 2023 12:55:20 -0700 Subject: [PATCH 056/172] v2.6.0 for DNase aggregation New minor release. Feature: The fdr=one-percent peaks file is now available in bigbed format. This feature is motivated by use by trackhubs in the genome browser. --- processes/bwa/aggregate/basic.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/bwa/aggregate/basic.bash b/processes/bwa/aggregate/basic.bash index 87a543d0..63b865cc 100644 --- a/processes/bwa/aggregate/basic.bash +++ b/processes/bwa/aggregate/basic.bash @@ -1,6 +1,6 @@ #!/bin/bash -version=2.5.1 +version=2.6.0 export NXF_VER=18.10.1 # The version of nextflow to run. 18.10.1 includes conda cd "$(dirname "$0")" From 9bd3384fd5b8cf6c1b5cf922e1e3b39f1a46ec73 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 8 Jun 2023 16:19:49 -0700 Subject: [PATCH 057/172] Add altcode process for alignment & quantification --- processes/altcode/altcode.nf | 112 +++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 processes/altcode/altcode.nf diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf new file mode 100644 index 00000000..20b935d3 --- /dev/null +++ b/processes/altcode/altcode.nf @@ -0,0 +1,112 @@ +nextflow.enable.dsl=2 + +// Workflows +workflow { + + STAR_solo( + [ + tokenize_read_files(params.r1), tokenize_read_files(params.r2), + params.barcodes_r1, params.barcodes_r2, params.barcodes_r3, + [78, 8], + [48, 8], + [10, 8], + ], + file("${params.genome_dir}/*") + ) + +} + +// Helper functions +def tokenize_read_files(input) { + if (input in String) { + return input.tokenize(",") + } + return input +} + +def join_list_commas(input) { + // TODO: Do we need to handle different parameter passing styles differently? + return input.join(",") +} + +def pos_to_str(start, length) { + return "0_${start}_0_${start+length-1}" +} + +/// Processing +/// This process creates the Aligned.out.cram file and STARsolo analysis results +process STAR_solo { + + module 'STAR/2.7.9a' + publishDir "test-out" + + input: + tuple( + path(r1), path(r2), // r1 and r2 may each receive multiple files + path(r1_barcodes), path(r2_barcodes), path(r3_barcodes), + val(r1_barcode_pos), val(r2_barcode_pos), val(r3_barcode_pos) + ) + path("ref/*") + + output: + path "output/Aligned.out.cram*", emit: cram + path "output/Solo.out", emit: solo_analysis + + + script: + // TODO: How do we dynamically determine this? 
+ // barcode_positions = "0_10_0_17 0_48_0_55 0_78_0_85" + bc1_position = pos_to_str(*r1_barcode_pos) + bc2_position = pos_to_str(*r2_barcode_pos) + bc3_position = pos_to_str(*r3_barcode_pos) + umi_position = pos_to_str(0, 10) + + //bc1_position = pos_to_str(78, 8) + //bc2_position = pos_to_str(48, 8) + //bc3_position = pos_to_str(10, 8) + //umi_position = pos_to_str(0, 10) + + // TODO: Determine from environment? + bam_sort_RAM = 32_000_000_000 + + r1_files = join_list_commas(r1) + r2_files = join_list_commas(r2) + + num_threads = 10 + + """ + mkdir -p output + mkfifo output/Aligned.out.bam + STAR \ + --genomeDir "ref" \ + --readFilesIn "${r1_files}" "${r2_files}" \ + --soloType CB_UMI_Complex \ + --soloCBposition "${bc3_position}" "${bc2_position}" "${bc1_position}" \ + --soloCBwhitelist "${r3_barcodes}" "${r2_barcodes}" "${r1_barcodes}" \ + --soloUMIposition "${umi_position}" \ + --soloCBmatchWLtype 1MM \ + --soloUMIdedup 1MM_All \ + --soloFeatures Gene GeneFull SJ \ + --runThreadN "${num_threads}" \ + --limitBAMsortRAM "${bam_sort_RAM}" \ + --outSAMtype BAM Unsorted \ + --outSAMattributes NH HI AS nM CR CY UR UY sM \ + --outBAMcompression -1 \ + - outBAMsortingThreadN "${num_threads}" \ + --readFilesCommand zcat \ + --outFileNamePrefix output/ \ + --limitOutSJcollapsed 5000000 & + + samtools sort \ + --reference /net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified \ + -o output/Aligned.out.cram \ + --output-fmt-option "version=3.0,level=7" \ + --threads "${num_threads}" \ + --write-index \ + -T "tmpsort" \ + output/Aligned.out.bam & + + wait + rm output/Aligned.out.bam + """ +} From d2181d44c1d018bfc43ec52546631bd17c848b54 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 11 Jun 2023 14:10:22 -0700 Subject: [PATCH 058/172] Altcode: Wire in rest of params --- processes/altcode/altcode.nf | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 20b935d3..b31bad57 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -3,15 +3,21 @@ nextflow.enable.dsl=2 // Workflows workflow { + def meta = [:] + def ref_files = file("${params.genome_dir}/*") + STAR_solo( [ + meta, tokenize_read_files(params.r1), tokenize_read_files(params.r2), - params.barcodes_r1, params.barcodes_r2, params.barcodes_r3, - [78, 8], - [48, 8], - [10, 8], + params.barcode_r1_list, params.barcode_r2_list, params.barcode_r3_list, + [params.barcode_r1_pos, params.barcode_r1_len], + [params.barcode_r2_pos, params.barcode_r2_len], + [params.barcode_r3_pos, params.barcode_r3_len], + [params.umi_pos, params.umi_len], + ref_files, + params.genome_fasta, ], - file("${params.genome_dir}/*") ) } @@ -42,15 +48,18 @@ process STAR_solo { input: tuple( + val(meta), path(r1), path(r2), // r1 and r2 may each receive multiple files path(r1_barcodes), path(r2_barcodes), path(r3_barcodes), - val(r1_barcode_pos), val(r2_barcode_pos), val(r3_barcode_pos) + val(r1_barcode_pos), val(r2_barcode_pos), val(r3_barcode_pos), + val(umi_barcode_pos), + path("ref/*"), + path(genome_fasta), ) - path("ref/*") output: - path "output/Aligned.out.cram*", emit: cram - path "output/Solo.out", emit: solo_analysis + tuple(val(meta), path("output/Aligned.out.cram*"), emit: cram) + tuple(val(meta), path("output/Solo.out"), emit: solo_analysis) script: @@ -59,12 +68,7 @@ process STAR_solo { bc1_position = pos_to_str(*r1_barcode_pos) 
bc2_position = pos_to_str(*r2_barcode_pos) bc3_position = pos_to_str(*r3_barcode_pos) - umi_position = pos_to_str(0, 10) - - //bc1_position = pos_to_str(78, 8) - //bc2_position = pos_to_str(48, 8) - //bc3_position = pos_to_str(10, 8) - //umi_position = pos_to_str(0, 10) + umi_position = pos_to_str(*umi_barcode_pos) // TODO: Determine from environment? bam_sort_RAM = 32_000_000_000 @@ -98,7 +102,7 @@ process STAR_solo { --limitOutSJcollapsed 5000000 & samtools sort \ - --reference /net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified \ + --reference "${genome_fasta}" \ -o output/Aligned.out.cram \ --output-fmt-option "version=3.0,level=7" \ --threads "${num_threads}" \ From ae7087424c4fdc8c8ab6f9e80cf37a765a687954 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 19 Jul 2023 15:05:05 -0700 Subject: [PATCH 059/172] Add script to create multiple samplesheets This will help alleviate the manual process of creating several samplesheets when several types of barcode strategies are used on the same flowcell. --- scripts/flowcells/make_samplesheets.py | 252 +++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100755 scripts/flowcells/make_samplesheets.py diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py new file mode 100755 index 00000000..980c2420 --- /dev/null +++ b/scripts/flowcells/make_samplesheets.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 + +import argparse +import datetime +import json +import re +import sys +import textwrap + +from collections import defaultdict + + +# Usage: $0 -p processing.json + +SCRIPT_OPTIONS = { + "processing": "processing.json", + "reverse_barcode1": False, + "reverse_barcode2": False, +} + +def parser_setup(): + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--processing", dest="processing", + help="The JSON file to read barcodes from (default: processing.json)") + parser.add_argument("--reverse_barcode1", dest="reverse_barcode1", action="store_true", + help="Use reverse sequence for barcode1") + parser.add_argument("--reverse_barcode2", dest="reverse_barcode2", action="store_true", + help="Use reverse sequence for barcode2") + parser.set_defaults(**SCRIPT_OPTIONS) + return parser + + +def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2: bool) -> "[dict]": + assignments = [] + # Initialize our library pool lookup tables + pools = data["library_pools"] + libs_to_pools = dict() + for (pool_name, pool_data) in pools.items(): + for lib_str in pool_data["libraries"]: + lib_num = int(lib_str.replace("LN", "")) + if lib_num in libs_to_pools: + raise Exception("library {} in more than one pool".format(lib_str)) + libs_to_pools[lib_num] = (pool_name, + pool_data.get("barcode1"), + pool_data.get("barcode2")) + + # This will store our pool samplesheet lines + pool_assignment_set = set() + + for libdata in data["libraries"]: + # Skip libraries in pools + lib_num = libdata.get('library') + pool_data = libs_to_pools.get(lib_num) + if pool_data is None: + assignment = { + "lane": libdata.get("lane"), + "sample": libdata.get("samplesheet_name"), + "barcode1": "", + "barcode2": "", + } + if libdata.get("barcode1") is not None: + assignment["barcode1"] = libdata["barcode1"]["reverse_sequence"] if reverse_barcode1 else libdata["barcode1"]["sequence"] + if libdata.get("barcode2") is not None: + assignment["barcode2"] = libdata["barcode2"]["reverse_sequence"] if reverse_barcode2 else 
libdata["barcode2"]["sequence"] + + assignments.append(assignment) + else: + pool_assignment_set.add( + (libdata.get("lane"), *libs_to_pools[lib_num]) + ) + # Turn set of tuples into list of dicts + pool_assignments = [{ + "lane": a[0], + "sample": a[1], + "barcode1": a[2], + "barcode2": a[3], + } for a in pool_assignment_set] + + return assignments + pool_assignments + + +def make_samplesheet_header(name: str, date: str) -> str: + template = textwrap.dedent("""\ + [Header] + Investigator Name,{name} + Project Name,{name} + Experiment Name,{name} + Date,{date} + Workflow,GenerateFASTQ + + [Settings] + + [Data] + Lane,SampleID,SampleName,index,index2 + """) + return template.format(date=date, name=name) + + +def group_assignments(assignments: "[dict]") -> "[[dict]]": + """ Groups the barcode assignments by length """ + barcode_length_combinations = defaultdict(list) + def get_len(d): + return 0 if d is None else len(d) + + for assignment in assignments: + key = ( + get_len(assignment["barcode1"]), + get_len(assignment["barcode2"]), + ) + barcode_length_combinations[key].append(assignment) + return barcode_length_combinations + +def parse_mask(mask: str) -> "[[(str, int)]]": + parts = [] + str_parts = mask.split(",") + regex = r'(?P[yni])(?P[0-9]*)' + for part in str_parts: + pieces = [] + for match in re.finditer(regex, part, flags=re.I): + letter = match.group("letter").lower() + num_str = match.group("num") + num = 1 if len(num_str) == 0 else int(num_str) + if pieces and pieces[-1][0] == letter: + # Collapse same-letter adjacent pieces + pieces[-1][1] += num + else: + pieces.append((letter, num)) + parts.append(pieces) + return parts + +def mask_to_str(mask: "[[(str, int)]]") -> str: + """ Convert a mask in parts back into a string """ + def format_piece(letter, num): + if num == 0: + return "" + elif num == 1: + return letter + else: + return letter + str(num) + + return ",".join([ + "".join([ + format_piece(*piece) + for piece in part + ]) + for part in mask + ]) + + +def adjust_mask_for_lengths(mask_parts, len1, len2): + """ + Takes in a barcode-mask (in parts) and the barcode length, and adjusts the + values of 'i' to match. 
+ """ + new_mask = [] + index_reads_seen = 0 + for read in mask_parts: + read_len = sum(piece[1] for piece in read) + is_index_read = any(piece[0] == "i" for piece in read) + if is_index_read: + if any(piece[0] == "y" for piece in read): + raise Exception("Mixed read/index in barcode mask '{}', don't know how to deal with this".format(mask_to_str(mask_parts))) + index_reads_seen += 1 + if index_reads_seen == 1: + # first barcode + if len1 == read_len: + new_mask.append([("i", len1)]) + elif len1 > read_len: + new_mask.append([("i", read_len)]) + elif len1 < read_len: + new_mask.append([("i", len1), ("n", read_len - len1)]) + elif index_reads_seen == 2: + # second barcode + if len2 == read_len: + new_mask.append([("i", len2)]) + elif len2 > read_len: + new_mask.append([("i", read_len)]) + elif len2 < read_len: + new_mask.append([("i", len2), ("n", read_len - len2)]) + else: + new_mask.append(read) + return new_mask + + +def write_samplesheets(name, date, root_mask, assignments): + """ Write out the sample sheets """ + mask_parts = parse_mask(root_mask) + max_bclen1 = 0 + max_bclen2 = 0 + index_reads_seen = 0 + for read in mask_parts: + read_len = sum(piece[1] for piece in read) + is_index_read = any(piece[0] == "i" for piece in read) + if is_index_read: + index_reads_seen += 1 + if index_reads_seen == 1: + max_bclen1 = read_len + if index_reads_seen == 2: + max_bclen2 = read_len + + for assign in assignments: + assign['barcode1'] = assign['barcode1'][:max_bclen1] + assign['barcode2'] = assign['barcode2'][:max_bclen2] + # Trim barcodes to make sure they fit in the read + + groups = group_assignments(assignments) + + for (barcode_lengths, assigns) in groups.items(): + new_mask = adjust_mask_for_lengths(mask_parts, *barcode_lengths) + header = make_samplesheet_header(name, date) + body = make_samplesheet_body(assigns) + samplesheet_contents = header + body + filename = "SampleSheet.withmask.{}.csv".format(mask_to_str(new_mask)) + with open(filename, "w") as f: + f.write(samplesheet_contents) + +def make_samplesheet_body(barcode_assignments: "[dict]") -> str: + """ Create samplesheet text from assignments """ + lines = [] + for ba in barcode_assignments: + line = ",".join([ + str(ba["lane"]), + ba["sample"], + ba["sample"], + str(ba["barcode1"]), + str(ba["barcode2"]), + "", + ]) + lines.append(line) + return "\n".join(sorted(lines)) + +def main(args=sys.argv): + parser = parser_setup() + poptions = parser.parse_args(args) + + process_json = open(poptions.processing) + data = json.load(process_json) + process_json.close() + + assignments = get_barcode_assignments(data, + poptions.reverse_barcode1, + poptions.reverse_barcode2, + ) + mask = data["alignment_group"]["bases_mask"] + write_samplesheets(name="Altius", + date=str(datetime.date.today()), + root_mask=mask, + assignments=assignments) + + +if __name__ == "__main__": + main() From 530da1cdcaecf35661852ac7410d101e33172a21 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 20 Jul 2023 13:09:42 -0700 Subject: [PATCH 060/172] altcode -minor output publishing changes --- processes/altcode/altcode.nf | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index b31bad57..77ac9182 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -1,6 +1,11 @@ nextflow.enable.dsl=2 -// Workflows +params.outdir = "output" + +/// Workflows + +/// Default workflow +/// Processes a single sample workflow { def meta = [:] @@ -44,7 +49,8 
@@ def pos_to_str(start, length) { process STAR_solo { module 'STAR/2.7.9a' - publishDir "test-out" + publishDir params.outdir + cpus 10 input: tuple( @@ -58,8 +64,8 @@ process STAR_solo { ) output: - tuple(val(meta), path("output/Aligned.out.cram*"), emit: cram) - tuple(val(meta), path("output/Solo.out"), emit: solo_analysis) + tuple(val(meta), path("Aligned.out.cram*"), emit: cram) + tuple(val(meta), path("Solo.out"), emit: solo_analysis) script: @@ -79,8 +85,8 @@ process STAR_solo { num_threads = 10 """ - mkdir -p output - mkfifo output/Aligned.out.bam + set -e + mkfifo Aligned.out.bam STAR \ --genomeDir "ref" \ --readFilesIn "${r1_files}" "${r2_files}" \ @@ -95,22 +101,22 @@ process STAR_solo { --limitBAMsortRAM "${bam_sort_RAM}" \ --outSAMtype BAM Unsorted \ --outSAMattributes NH HI AS nM CR CY UR UY sM \ - --outBAMcompression -1 \ + --outBAMcompression 0 \ - outBAMsortingThreadN "${num_threads}" \ --readFilesCommand zcat \ - --outFileNamePrefix output/ \ + --outFileNamePrefix ./ \ --limitOutSJcollapsed 5000000 & samtools sort \ --reference "${genome_fasta}" \ - -o output/Aligned.out.cram \ + -o Aligned.out.cram \ --output-fmt-option "version=3.0,level=7" \ --threads "${num_threads}" \ --write-index \ -T "tmpsort" \ - output/Aligned.out.bam & + Aligned.out.bam & wait - rm output/Aligned.out.bam + rm Aligned.out.bam """ } From 4a064654f0dc9dd1fab82494c5b1ced47bec58df Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 09:50:18 -0700 Subject: [PATCH 061/172] make_samplesheets.py fixups --- scripts/flowcells/make_samplesheets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index 980c2420..045f8501 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -37,7 +37,7 @@ def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2 libs_to_pools = dict() for (pool_name, pool_data) in pools.items(): for lib_str in pool_data["libraries"]: - lib_num = int(lib_str.replace("LN", "")) + lib_num = int(re.sub("[A-Z]", "", lib_str)) if lib_num in libs_to_pools: raise Exception("library {} in more than one pool".format(lib_str)) libs_to_pools[lib_num] = (pool_name, @@ -231,7 +231,7 @@ def make_samplesheet_body(barcode_assignments: "[dict]") -> str: def main(args=sys.argv): parser = parser_setup() - poptions = parser.parse_args(args) + poptions = parser.parse_args() process_json = open(poptions.processing) data = json.load(process_json) From 10132f5155e0d85d23378ac02150400ad8016791 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 09:53:47 -0700 Subject: [PATCH 062/172] workaround: --allow_collisions for max_mismatch.py Adds a new option to max_mismatch.py to proceed even if barcode collisions appear to exist. This script should probably be refactored to take library pool information into account. 
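For reference, the flag is consumed by the setup.sh change in the following
commit, which computes the demultiplexing mismatch level like this (quoted from
that patch):

    dmx_mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions | cut -c1 )

so an apparent collision, presumably coming from pool barcodes that this script
does not yet understand, no longer aborts flowcell setup outright.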
--- scripts/flowcells/max_mismatch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index baf1afb3..a5c3b4b6 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -25,7 +25,8 @@ def parser_setup(): parser.add_argument("-p", "--processing", dest="processing", help="The JSON file to read barcodes from") parser.add_argument("--ignore_failed_lanes", dest="ignore_failed_lanes", action="store_true", default=False, - help="Ignore failed lanes when calculating max mismatch.") + parser.add_argument("--allow_collisions", dest="allow_collisions", action="store_true", default=False, + help="Don't exit with error even if collisions are found (workaround)") parser.set_defaults( **script_options ) return parser @@ -129,7 +130,7 @@ def main(args = sys.argv): mismatch_level = get_max_mismatch_level( lanes, len(mask) ) - if not mismatch_level: + if not mismatch_level and not poptions.allow_collisions: sys.stderr.write("No allowable mismatch levels found, barcode collision?\n") sys.exit(1) From 23da4577c88d1885ee522b9e1b3aa9b9f43f2c95 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 11:28:40 -0700 Subject: [PATCH 063/172] flowcells/setup.sh: Support alt-code novaseq runs Initial support, may require further revision. --- scripts/flowcells/setup.sh | 282 ++++++++++--------------------------- 1 file changed, 77 insertions(+), 205 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 10aaf44f..960ee331 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -1,5 +1,6 @@ #!/bin/bash # shellcheck disable=SC1090 +# shellcheck disable=SC2162 set -o errexit set -o pipefail @@ -63,6 +64,50 @@ while getopts ":hvdxf:" opt ; do esac done +# Long command definitions +# The quadruple-backslash syntax on this is messy and gross. +# It works, though, and the output is readable. +# read -d '' always exits with status 1, so we ignore error +# We split threads equally between processing and loading+writing. 
+set +e +read -d '' regular_bcl_command << _REG_BCL_CMD_ + PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH + bcl2fastq \\\\ + --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ + --use-bases-mask "$bcl_mask" \\\\ + --output-dir "$fastq_dir" \\\\ + --barcode-mismatches "$mismatches" \\\\ + --writing-threads 0 \\\\ + --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ + --processing-threads \\\$SLURM_CPUS_PER_TASK +_REG_BCL_CMD_ + +read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ + PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH + for samplesheet in \$PWD/SampleSheet.withmask*csv ; do + bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) + fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask") + bcl2fastq \\\\ + --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ + --use-bases-mask "$bcl_mask" \\\\ + --output-dir "${illumina_dir}/\$fastq_dir" \\\\ + --barcode-mismatches "$mismatches" \\\\ + --output-dir "${illumina_dir}" \\\\ + --sample-sheet "${illumina_dir}/\$samplesheet" \\\\ + --writing-threads 0 \\\\ + --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ + --processing-threads \\\$SLURM_CPUS_PER_TASK + done +_NOVA_BCL_CMD_ + +read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' +for fq_dir in fastq* ; + [[ -d $fq_dir ]] || continue + python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_folder" -o Demultiplexed -p processing.json +done +_NOVA_LINK_CMD_ +set -e + if [ -z "$flowcell" ] ; then echo "No flowcell label specified" flowcell=$(basename "$PWD" | cut -f4 -d_ | cut -c2-10) @@ -110,32 +155,6 @@ make_hiseq_samplesheet(){ } -make_novaseq_samplesheet(){ - lanecount=$1 - name=Stamlab - date=$(date '+%m/%d/%Y') - cat <<__SHEET__ -[Header] -Investigator Name,$name -Project Name,$name -Experiment Name,$name -Date,$date -Workflow,GenerateFASTQ - -[Settings] - -[Data] -Lane,SampleID,SampleName,index,index2 -__SHEET__ -for i in $(seq $lanecount) ; do - echo "$i,none,none,GGGGGGGG,GGGGGGGG" -done - -if [ -z "$demux" ] ; then - # This bit of cryptic magic generates the samplesheet part. - jq -r '.libraries[] | select(.failed == false) | [(.lane|tostring), .samplesheet_name,.samplesheet_name,.barcode1.reverse_sequence, .barcode2.reverse_sequence,""] | join(",") ' "$json" -fi -} make_nextseq_samplesheet(){ name=Stamlab @@ -247,152 +266,65 @@ if [ -z "$demux" ] ; then else # Set some options for manual demultiplexing bcl_mask=$(tr Nn Ii <<< $mask) mismatches="0,0" - dmx_mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes | cut -c1 ) + dmx_mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions | cut -c1 ) fi case $run_type in "Novaseq 6000 S1") echo "Novaseq 6000: S1 (non-pooled)" + unset demux parallel_env="-pe threads 6" - link_command="" + link_command=$novaseq_link_command samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--novaseq" queue="hpcz-2" - make_novaseq_samplesheet 2 > SampleSheet.csv + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - - set +e - read -d '' unaligned_command << _U_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$novaseq_bcl_command ;; "Novaseq 6000 S2") echo "Novaseq 6000: S2 (non-pooled)" + unset demux parallel_env="-pe threads 6" - link_command="" + link_command=$novaseq_link_command samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="hpcz-2" - make_novaseq_samplesheet 2 > SampleSheet.csv + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - - # The quadruple-backslash syntax on this is messy and gross. - # It works, though, and the output is readable. - # read -d '' always exits with status 1, so we ignore error - - # The NSLOTS lines are for scaling the various threads (2 per slot). - # WARNING: Does not work for threads < 4 - # Table: - # NSLOTS l w d p total - # 4 1 1 2 4 = 8 - # 5 1 1 2 5 = 9 - # 6 2 2 3 6 = 13 - # 7 2 2 3 7 = 14 - # 8 2 2 4 8 = 16 - set +e - read -d '' unaligned_command << _U_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$novaseq_bcl_command ;; "Novaseq 6000 S4") echo "Novaseq 6000: S4 (non-pooled)" + unset demux parallel_env="-pe threads 6" - link_command="" + link_command=$novaseq_link_command samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="hpcz-2" - make_novaseq_samplesheet 4 > SampleSheet.csv + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - - # The quadruple-backslash syntax on this is messy and gross. - # It works, though, and the output is readable. - # read -d '' always exits with status 1, so we ignore error - - # The NSLOTS lines are for scaling the various threads (2 per slot). 
- # WARNING: Does not work for threads < 4 - # Table: - # NSLOTS l w d p total - # 4 1 1 2 4 = 8 - # 5 1 1 2 5 = 9 - # 6 2 2 3 6 = 13 - # 7 2 2 3 7 = 14 - # 8 2 2 4 8 = 16 - set +e - read -d '' unaligned_command << _U_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$novaseq_bcl_command ;; "Novaseq 6000 SP") echo "Novaseq 6000: SP (non-pooled)" + unset demux parallel_env="-pe threads 6" - link_command="" + link_command=$novaseq_link_command samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="hpcz-2" - make_novaseq_samplesheet 4 > SampleSheet.csv + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - - # The quadruple-backslash syntax on this is messy and gross. - # It works, though, and the output is readable. - # read -d '' always exits with status 1, so we ignore error - - # The NSLOTS lines are for scaling the various threads (2 per slot). - # WARNING: Does not work for threads < 4 - # Table: - # NSLOTS l w d p total - # 4 1 1 2 4 = 8 - # 5 1 1 2 5 = 9 - # 6 2 2 3 6 = 13 - # 7 2 2 3 7 = 14 - # 8 2 2 4 8 = 16 - set +e - read -d '' unaligned_command << _U_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$novaseq_bcl_command ;; "NextSeq 500") @@ -406,33 +338,7 @@ _U_ queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 - - # The quadruple-backslash syntax on this is messy and gross. - # It works, though, and the output is readable. - # read -d '' always exits with status 1, so we ignore error - - # The NSLOTS lines are for scaling the various threads (2 per slot). - # WARNING: Does not work for threads < 4 - # Table: - # NSLOTS l w d p total - # 4 1 1 2 4 = 8 - # 5 1 1 2 5 = 9 - # 6 2 2 3 6 = 13 - # 7 2 2 3 7 = 14 - # 8 2 2 4 8 = 16 - set +e - read -d '' unaligned_command << _U_ - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --demultiplexing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$regular_bcl_command ;; "HiSeq 4000") echo "Hiseq 4000 run detected" @@ -444,32 +350,7 @@ _U_ queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1-8 - - # The quadruple-backslash syntax on this is messy and gross. - # It works, though, and the output is readable. - # read -d '' always exits with status 1, so we ignore error - - # The NSLOTS lines are for scaling the various threads (2 per slot). 
- # WARNING: Does not work for threads < 4 - # Table: - # NSLOTS l w d p total - # 4 1 1 2 4 = 8 - # 5 1 1 2 5 = 9 - # 6 2 2 3 6 = 13 - # 7 2 2 3 7 = 14 - # 8 2 2 4 8 = 16 - set +e - read -d '' unaligned_command << _U_ - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --demultiplexing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ + unaligned_command=$regular_bcl_command ;; "MiniSeq High Output Kit DNase") # Identical to nextseq processing @@ -482,19 +363,7 @@ _U_ queue="queue0" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 - set +e - read -d '' unaligned_command << _U_ - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --loading-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --writing-threads \\\$(( SLURM_CPUS_PER_TASK / 4 )) \\\\ - --demultiplexing-threads \\\$(( SLURM_CPUS_PER_TASK / 2 )) \\\\ - --processing-threads \\\$(( SLURM_CPUS_PER_TASK )) -_U_ - set -e + unaligned_command=$regular_bcl_command ;; "MiniSeq Mid Output Kit GUIDEseq") # Identical to nextseq processing @@ -605,6 +474,8 @@ if [ -n "$demux" ] ; then # obsolete now? demux_cmd="$STAMPIPES/scripts/flowcells/demux_flowcell.sh -i $fastq_dir -o $copy_from_dir -p $json -q $queue -m $dmx_mismatches" link_command="#Demuxing happened, no linking to do" +elif [[ "$bc_flag" == "--novaseq" ]] ; then + copy_from_dir="$(pwd)/Demultiplexed/" fi flowcell_id=$( curl \ @@ -665,12 +536,12 @@ __FASTQ__ __BCL2FASTQ__ else - +# Not miniseq cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash source $MODULELOAD -module load bcl2fastq2/2.17.1.14 +module load bcl2fastq2/2.20.0.422 source $PYTHON3_ACTIVATE source $STAMPIPES/scripts/lims/api_functions.sh @@ -734,14 +605,14 @@ __PART2__ __BCL2FASTQ__ -cat > run_bcl2fastq_2.sh <<__BCL2FASTQ__ +cat > run_bcl2fastq_2.sh <<__BCL2FASTQ2__ # !/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -source $MODULELOAD -module load bcl2fastq2/2.17.1.14 -source $PYTHON3_ACTIVATE -source $STAMPIPES/scripts/lims/api_functions.sh +source "$MODULELOAD" +source "$PYTHON3_ACTIVATE" +source "$STAMPIPES/scripts/lims/api_functions.sh" +if [[ -n "$demux" ]] ; then # demultiplex if [ -d "$fastq_dir.L001" ] ; then inputfiles=(\$(find $fastq_dir.L00[1-9] -name "*Undetermined_*fastq.gz" -size +0 )) @@ -773,6 +644,7 @@ __DEMUX__ ) DEMUX_JOBIDS="\$DEMUX_JOBIDS,\$jobid" done +fi if [[ -n \$DEMUX_JOBIDS ]]; then dmx_dependency=\$(echo \$DEMUX_JOBIDS | sed -e 's/,/,afterok:/g' | sed -e 's/^,afterok/--dependency=afterok/g') @@ -782,6 +654,7 @@ fi copy_jobid=\$(sbatch --export=ALL -J "c-$flowcell" \$dmx_dependency -o "c-$flowcell.o%A" -e "c-$flowcell.e%A" --partition=$queue --cpus-per-task=1 --ntasks=1 --mem-per-cpu=1000 --parsable --oversubscribe <<'__COPY__' #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" +$link_command # copy files mkdir -p "$analysis_dir" @@ -807,14 +680,13 @@ rsync -avP "$samplesheet" "$analysis_dir" fi destination=\$destination/\$dir mkdir -p "\$destination" - rsync -a "\$dir/" "\$destination/" + rsync -aL "\$dir/" "\$destination/" done ) # create fastqc and collation scripts cd "$analysis_dir" -$link_command # Remove 
existing scripts if they exist (to avoid appending) rm -f fastqc.bash collate.bash run.bash From 01b7be3bbcd1d5b80cbb3add18bea2cf13c37758 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 11:29:58 -0700 Subject: [PATCH 064/172] setup.sh - fixups for some long-running issues increase bcl threads when run without -x, only wait if prepare_for_processing times out some various comments/clarity --- scripts/flowcells/setup.sh | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 960ee331..613bbd74 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -202,13 +202,18 @@ illumina_dir=$(pwd) link_command="#no linking to do" source "$STAMPIPES/scripts/lims/api_functions.sh" -lims_put_by_url "$(lims_get_all "flowcell_run/?label=$flowcell" | jq -r .url)prepare_for_processing/" - -# Make sure that "Prepare for Processing" has completed. -if [[ -z "$nosleep" ]] ; then - echo "sleeping for 5 minutes to wait for LIMS to set up... (skip with -x if you're sure it's ready)" - sleep 300 -fi +( + set -e + url=$(lims_get_all "flowcell_run/?label=$flowcell" | jq -r .url) + lims_put_by_url "${url}prepare_for_processing/" +) || ( + # Make sure that "Prepare for Processing" has completed. + if [[ -z "$nosleep" ]] ; then + echo "Prepare for processing is taking a while to complete..." + echo "sleeping for 5 minutes to wait for LIMS to set up... (skip with -x if you're sure it's ready)" + sleep 300 + fi +) # Get and read the processing script python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" @@ -249,7 +254,7 @@ bash "$runscript" EOF __BCL2FASTQ__ echo "Run $runscript to start analysis!" - + exit 0 fi @@ -565,7 +570,7 @@ lims_patch "flowcell_run/$flowcell_id/" "folder_name=${PWD##*/}" # Submit a barcode job for each mask for bcmask in $(python $STAMPIPES/scripts/flowcells/barcode_masks.py | xargs) ; do export bcmask - bcjobid=\$(sbatch --export=ALL -J "bc-$flowcell" -o "bc-$flowcell.o%A" -e "bc-$flowcell.e%A" --partition=$queue --cpus-per-task=1 --ntasks=1 --mem-per-cpu=64000 --parsable --oversubscribe --mail-type=FAIL --mail-user=sequencing@altius.org <<'__BARCODES__' + bcjobid=\$(sbatch --export=ALL -J "bc-$flowcell" -o "bc-$flowcell.o%A" -e "bc-$flowcell.e%A" --partition=$queue --cpus-per-task=10 --ntasks=1 --mem-per-cpu=6400 --parsable --oversubscribe --mail-type=FAIL --mail-user=sequencing@altius.org <<'__BARCODES__' #!/bin/bash bcl_barcode_count --mask=\$bcmask $bc_flag > barcodes.\$bcmask.json python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json @@ -580,10 +585,8 @@ __BARCODES__ PROCESSING="\$PROCESSING,\$bcjobid" done -dependencies_barcodes=\$(echo \$PROCESSING | sed -e 's/,/,afterok:/g' | sed -e 's/^,afterok/--dependency=afterok/g') - # bcl2fastq -bcl_jobid=\$(sbatch --export=ALL -J "u-$flowcell" -o "u-$flowcell.o%A" -e "u-$flowcell.e%A" \$dependencies_barcodes --partition=$queue --ntasks=1 --cpus-per-task=4 --mem-per-cpu=8000 --parsable --oversubscribe <<'__FASTQ__' +bcl_jobid=\$(sbatch --export=ALL -J "u-$flowcell" -o "u-$flowcell.o%A" -e "u-$flowcell.e%A" --partition=$queue --ntasks=1 --cpus-per-task=20 --mem-per-cpu=8000 --parsable --oversubscribe <<'__FASTQ__' #!/bin/bash set -x -e -o pipefail @@ -761,14 +764,14 @@ python3 "$STAMPIPES/scripts/alignprocess.py" \ --outfile run_alignments.bash # Set up of flowcell aggregations -curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H 
"Authorization: Token $LIMS_API_TOKEN" +curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" # Run alignments bash run_alignments.bash __COLLATE__ -__BCL2FASTQ__ +__BCL2FASTQ2__ fi From e7ec83dc1d262f5e796374d1f2c971ff1816bcfd Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 11:45:02 -0700 Subject: [PATCH 065/172] link_nextseq.py: Python3 compat and formatting --- scripts/flowcells/link_nextseq.py | 144 +++++++++++++++++++----------- 1 file changed, 92 insertions(+), 52 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 57d21afb..d88ed23a 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -1,15 +1,16 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals -import os, sys, logging, re -#import requests -import json -import fileinput import argparse import glob +import json +import logging +import os +import re -log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -script_options = { +SCRIPT_OPTIONS = { "quiet": False, "debug": False, "base_dir": os.getcwd(), @@ -17,55 +18,89 @@ "dry_run": False, } + def parser_setup(): + """ Sets up parser """ parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-i", "--input-dir", dest="input_dir", - help="The input directory to use.") - parser.add_argument("-o", "--output-dir", dest="output_dir", - help="The output directory to use.") - parser.add_argument("-p", "--processing_file", dest="processing_file", - help="The processing_file to use as a guide.") - - parser.add_argument("--dry-run", dest="dry_run", action="store_true", - help="Only print out planned symlinks.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-i", "--input-dir", dest="input_dir", help="The input directory to use." + ) + parser.add_argument( + "-o", "--output-dir", dest="output_dir", help="The output directory to use." 
+ ) + parser.add_argument( + "-p", + "--processing_file", + dest="processing_file", + help="The processing_file to use as a guide.", + ) + + parser.add_argument( + "--dry-run", + dest="dry_run", + action="store_true", + help="Only print out planned symlinks.", + ) + + parser.set_defaults(**SCRIPT_OPTIONS) + parser.set_defaults(quiet=False, debug=False) return parser -def create_links(lane, read, input_basedir, output_basedir, dry_run = False, undetermined = False): + +def create_links( + lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False + ): + """ + Create the links between the input directories and output dir + If dry_run is passed, will print them instead of creating them + """ sample_name = lane["alignments"][0]["sample_name"] short_name = lane["samplesheet_name"] if undetermined: - output_dir = os.path.join( output_basedir, "Undetermined_indices", "Sample_lane1") + output_dir = os.path.join( + output_basedir, "Undetermined_indices", "Sample_lane1" + ) else: - output_dir = os.path.join( output_basedir, "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"] ) - - # if nextseq - if True: - short_name = re.sub(r"_", '-', short_name) - input_dir = input_basedir - input_wildcard = os.path.join(input_dir, "%s_S*_L00?_%s_???.fastq.gz" % (short_name, read)) - else: # eventually could be highseq rapid run linking... have to make some changes - input_dir = os.path.join( input_basedir, "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"] ) - input_wildcard = os.path.join(input_dir, "%s_%s_???.fastq.gz" % (sample_name, read)) + output_dir = os.path.join( + output_basedir, + "Project_%s" % lane["project"], + "Sample_%s" % lane["samplesheet_name"], + ) + + short_name = re.sub(r"_", "-", short_name) + input_dir = input_basedir + input_wildcard = os.path.join( + input_dir, "%s_S*_L00?_%s_???.fastq.gz" % (short_name, read) + ) if not dry_run and not os.path.isdir(output_dir): os.makedirs(output_dir) - + # This will fail if we have the same sample listed multiple times in the - # samplesheet (run with different barcodes). - # But I've never seen that happen. + # samplesheet (run with different barcodes). 
+ # (Since we use the library letter in the sample name, this would imply + # that the library has multiple barcodes, which isn't a thing that makes + # sense in our system) input_fastq = sorted(glob.glob(input_wildcard)) for idx, input_file in enumerate(input_fastq, start=1): @@ -74,41 +109,46 @@ def create_links(lane, read, input_basedir, output_basedir, dry_run = False, und rel_path = os.path.relpath(input_file, output_dir) - print "Linking %s => %s" % (rel_path, output_file) + print("Linking %s => %s" % (rel_path, output_file)) if not dry_run and not os.path.exists(output_file): os.symlink(rel_path, output_file) - -def main(args = sys.argv): + + +def main(): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() if poptions.quiet: - logging.basicConfig(level=logging.WARNING, format=log_format) + logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT) elif poptions.debug: - logging.basicConfig(level=logging.DEBUG, format=log_format) + logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT) else: # Set up the logging levels - logging.basicConfig(level=logging.INFO, format=log_format) + logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) input_dir = poptions.input_dir - p = json.loads(open(poptions.processing_file, 'r').read()) + data = json.loads(open(poptions.processing_file, "r").read()) - for lane in p['libraries']: + for lane in data["libraries"]: create_links(lane, "R1", input_dir, poptions.output_dir, poptions.dry_run) create_links(lane, "R2", input_dir, poptions.output_dir, poptions.dry_run) - undet_lane = {"alignments":[{"sample_name": "lane1_Undetermined_L001"}], "samplesheet_name": "Undetermined" } - for read in ['R1', 'R2']: - create_links(undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, True) + undet_lane = { + "alignments": [{"sample_name": "lane1_Undetermined_L001"}], + "samplesheet_name": "Undetermined", + } + for read in ["R1", "R2"]: + create_links( + undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, True + ) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it if __name__ == "__main__": main() - - From f7fc9c3cc40ff548a60ade261e0d4a334bf926f2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Jul 2023 13:58:17 -0700 Subject: [PATCH 066/172] fixups: extra fixes from testing --- scripts/flowcells/barcode_masks.py | 2 +- scripts/flowcells/max_mismatch.py | 10 +-- scripts/flowcells/setup.sh | 105 ++++++++++++++++------------- 3 files changed, 64 insertions(+), 53 deletions(-) diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py index bc76a9dd..f49bc422 100644 --- a/scripts/flowcells/barcode_masks.py +++ b/scripts/flowcells/barcode_masks.py @@ -115,7 +115,7 @@ def main(): json_data = json.load(process_json) process_json.close() - detect_collisions(json_data) + #detect_collisions(json_data) print(" ".join(get_barcode_masks(json_data))) diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index a5c3b4b6..3bda907e 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -25,6 +25,7 @@ def parser_setup(): parser.add_argument("-p", "--processing", dest="processing", help="The JSON file to read barcodes from") 
parser.add_argument("--ignore_failed_lanes", dest="ignore_failed_lanes", action="store_true", default=False, + help="Ignore failed lanes when calculating max mismatch.") parser.add_argument("--allow_collisions", dest="allow_collisions", action="store_true", default=False, help="Don't exit with error even if collisions are found (workaround)") @@ -84,7 +85,7 @@ def apply_mask(mask, barcode_string): barcodes = [ orig_barcodes[i][:l] for (i, l) in enumerate(mask) ] return barcodes -def create_lane_set(libraries, mask, ignore_failed_lanes): +def create_lane_set(libraries, mask, ignore_failed_lanes, allow_collision=False): lanes = {} for library in libraries: lane = library['lane'] @@ -97,8 +98,9 @@ def create_lane_set(libraries, mask, ignore_failed_lanes): if lane not in lanes: lanes[lane] = set() if barcodes in lanes[lane]: - sys.stderr.write("Collision on lane %d, barcode %s\n" % ( lane, ','.join(barcodes))) - sys.exit(1) + if not allow_collision: + sys.stderr.write("Collision on lane %d, barcode %s\n" % ( lane, ','.join(barcodes))) + sys.exit(1) lanes[lane].add(barcodes) return lanes @@ -126,7 +128,7 @@ def main(args = sys.argv): print("1") sys.exit(0) - lanes = create_lane_set(data['libraries'], mask, poptions.ignore_failed_lanes) + lanes = create_lane_set(data['libraries'], mask, poptions.ignore_failed_lanes, poptions.allow_collisions) mismatch_level = get_max_mismatch_level( lanes, len(mask) ) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 613bbd74..cc07289b 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -64,50 +64,6 @@ while getopts ":hvdxf:" opt ; do esac done -# Long command definitions -# The quadruple-backslash syntax on this is messy and gross. -# It works, though, and the output is readable. -# read -d '' always exits with status 1, so we ignore error -# We split threads equally between processing and loading+writing. 
-set +e -read -d '' regular_bcl_command << _REG_BCL_CMD_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --writing-threads 0 \\\\ - --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ - --processing-threads \\\$SLURM_CPUS_PER_TASK -_REG_BCL_CMD_ - -read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ - PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - for samplesheet in \$PWD/SampleSheet.withmask*csv ; do - bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) - fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask") - bcl2fastq \\\\ - --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ - --use-bases-mask "$bcl_mask" \\\\ - --output-dir "${illumina_dir}/\$fastq_dir" \\\\ - --barcode-mismatches "$mismatches" \\\\ - --output-dir "${illumina_dir}" \\\\ - --sample-sheet "${illumina_dir}/\$samplesheet" \\\\ - --writing-threads 0 \\\\ - --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ - --processing-threads \\\$SLURM_CPUS_PER_TASK - done -_NOVA_BCL_CMD_ - -read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' -for fq_dir in fastq* ; - [[ -d $fq_dir ]] || continue - python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_folder" -o Demultiplexed -p processing.json -done -_NOVA_LINK_CMD_ -set -e - if [ -z "$flowcell" ] ; then echo "No flowcell label specified" flowcell=$(basename "$PWD" | cut -f4 -d_ | cut -c2-10) @@ -223,8 +179,12 @@ mask=$( jq -r '.alignment_group.bases_mask' "$json" ) run_type=$( jq -r '.flowcell.run_type' "$json" ) has_umi=$( jq -r '.libraries | map(.barcode1.umi) | any' "$json") +# Novaseq runs always use native bcl2fastq demuxing +if [[ $run_type =~ Novaseq ]] ; then + unset demux +fi -# Check if read1length=0 -> that means alteseq +# Check if read1length=0 -> that means altseq # Handle specially # TODO: Check this from processing.json flowcell_data=$(lims_get_all "flowcell_run/?label=$flowcell") @@ -260,7 +220,7 @@ fi if [ -z "$demux" ] ; then bcl_mask=$mask - mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes) + mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions) if [ "$has_umi" == "true" ] ; then echo "---WARNING---" echo "Flowcell contains UMI samples, but -d param was not specified" @@ -271,9 +231,59 @@ if [ -z "$demux" ] ; then else # Set some options for manual demultiplexing bcl_mask=$(tr Nn Ii <<< $mask) mismatches="0,0" - dmx_mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions | cut -c1 ) + dmx_mismatches=$(python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes | cut -c1 ) fi +# Long command definitions +# The quadruple-backslash syntax on this is messy and gross. +# It works, though, and the output is readable. +# read -d '' always exits with status 1, so we ignore error +# We split threads equally between processing and loading+writing. 
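# (Illustrative note on the escaping above, using made-up names: the command
#  text goes through two rounds of backslash processing -- the unquoted
#  here-document turns \\\\ into \\, then "read" without -r turns \\ into \ --
#  so by the time it is pasted into the generated run_bcl2fastq.sh and handed
#  to sbatch, each \\\\ has become a single backslash line-continuation.
#  A minimal reproduction of the idea:
#
#      set +e                      # read -d '' always returns non-zero
#      read -d '' demo << _DEMO_
#      echo one \\\\
#           two
#      _DEMO_
#      set -e
#      printf '%s' "$demo"         # prints:  echo one \
#                                  #               two
# )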
+set +e +read -d '' regular_bcl_command << _REG_BCL_CMD_ + PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH + bcl2fastq \\\\ + --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ + --use-bases-mask "$bcl_mask" \\\\ + --output-dir "$fastq_dir" \\\\ + --barcode-mismatches "$mismatches" \\\\ + --writing-threads 0 \\\\ + --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ + --processing-threads \\\$SLURM_CPUS_PER_TASK +_REG_BCL_CMD_ + +read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ + PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH + for samplesheet in \$PWD/SampleSheet.withmask*csv ; do + bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) + fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask") + bcl2fastq \\\\ + --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ + --output-dir "${illumina_dir}/\$fastq_dir" \\\\ + --use-bases-mask "$bcl_mask" \\\\ + --barcode-mismatches "$mismatches" \\\\ + --sample-sheet "${illumina_dir}/\$samplesheet" \\\\ + --writing-threads 0 \\\\ + --loading-threads \\\$SLURM_CPUS_PER_TASK \\\\ + --processing-threads \\\$SLURM_CPUS_PER_TASK + done +_NOVA_BCL_CMD_ + +read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' +for fq_dir in fastq-withmask-* ; + [[ -d $fq_dir ]] || continue + python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_dir" -o Demultiplexed -p processing.json +done +_NOVA_LINK_CMD_ +set -e + +if [ -z "$flowcell" ] ; then + echo "No flowcell label specified" + flowcell=$(basename "$PWD" | cut -f4 -d_ | cut -c2-10) + echo "Guessing $flowcell..." +fi + + case $run_type in "Novaseq 6000 S1") @@ -724,7 +734,6 @@ sbatch --export=ALL -J "collate-$flowcell" \$copy_dependency -o "collate-$flowce source "$STAMPIPES/scripts/sentry/sentry-lib.bash" cd "$analysis_dir" -$link_command # Remove existing scripts if they exist (to avoid appending) rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash From 0c0abc7414fe5be184d65eda324a2c006c187d37 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 26 Jul 2023 16:24:42 -0700 Subject: [PATCH 067/172] Altcode - add scanpy file gen. 
Uses apptainer --- processes/altcode/Makefile | 8 ++ processes/altcode/altcode.nf | 42 ++++-- processes/altcode/bin/STAR | 1 + processes/altcode/bin/compress_mtx_files.sh | 33 +++++ processes/altcode/bin/mtx_to_h5.py | 32 +++++ processes/altcode/nextflow.config | 16 +++ processes/altcode/scanpy.apptainer.def | 14 ++ processes/altcode/scanpy.environment.yaml | 135 ++++++++++++++++++++ 8 files changed, 273 insertions(+), 8 deletions(-) create mode 100644 processes/altcode/Makefile create mode 120000 processes/altcode/bin/STAR create mode 100755 processes/altcode/bin/compress_mtx_files.sh create mode 100755 processes/altcode/bin/mtx_to_h5.py create mode 100644 processes/altcode/nextflow.config create mode 100644 processes/altcode/scanpy.apptainer.def create mode 100644 processes/altcode/scanpy.environment.yaml diff --git a/processes/altcode/Makefile b/processes/altcode/Makefile new file mode 100644 index 00000000..1f4a7856 --- /dev/null +++ b/processes/altcode/Makefile @@ -0,0 +1,8 @@ +.PHONY: all +all: containers + +.PHONY: containers +containers: scanpy.sif + +%.sif: %.apptainer.def %.environment.yaml + apptainer build --force "$@" "$<" diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 77ac9182..548ecf02 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -25,6 +25,9 @@ workflow { ], ) + STAR_solo.out.solo_analysis + | convert_to_hda5 + } // Helper functions @@ -48,9 +51,9 @@ def pos_to_str(start, length) { /// This process creates the Aligned.out.cram file and STARsolo analysis results process STAR_solo { - module 'STAR/2.7.9a' publishDir params.outdir cpus 10 + memory "50 GB" input: tuple( @@ -69,7 +72,6 @@ process STAR_solo { script: - // TODO: How do we dynamically determine this? // barcode_positions = "0_10_0_17 0_48_0_55 0_78_0_85" bc1_position = pos_to_str(*r1_barcode_pos) bc2_position = pos_to_str(*r2_barcode_pos) @@ -85,9 +87,10 @@ process STAR_solo { num_threads = 10 """ - set -e + set -o monitor mkfifo Aligned.out.bam - STAR \ + + (STAR \ --genomeDir "ref" \ --readFilesIn "${r1_files}" "${r2_files}" \ --soloType CB_UMI_Complex \ @@ -96,16 +99,16 @@ process STAR_solo { --soloUMIposition "${umi_position}" \ --soloCBmatchWLtype 1MM \ --soloUMIdedup 1MM_All \ - --soloFeatures Gene GeneFull SJ \ + --soloFeatures Gene GeneFull SJ GeneFull_Ex50pAS GeneFull_ExonOverIntron \ --runThreadN "${num_threads}" \ --limitBAMsortRAM "${bam_sort_RAM}" \ --outSAMtype BAM Unsorted \ --outSAMattributes NH HI AS nM CR CY UR UY sM \ --outBAMcompression 0 \ - - outBAMsortingThreadN "${num_threads}" \ + --outBAMsortingThreadN "${num_threads}" \ --readFilesCommand zcat \ --outFileNamePrefix ./ \ - --limitOutSJcollapsed 5000000 & + --limitOutSJcollapsed 5000000 || kill 0) & samtools sort \ --reference "${genome_fasta}" \ @@ -114,9 +117,32 @@ process STAR_solo { --threads "${num_threads}" \ --write-index \ -T "tmpsort" \ - Aligned.out.bam & + Aligned.out.bam wait rm Aligned.out.bam + compress_mtx_files.sh ./Solo.out "${num_threads}" """ } + +process convert_to_hda5 { + cpus 10 + memory "10 GB" + publishDir params.outdir + + input: + tuple(val(meta), path(directory)) + + output: + tuple(val(meta), path(directory)) + + shell: + ''' + for dir_name in $(find -L "!{directory}" -name matrix.mtx.gz \ + | grep -v "SJ/raw" \ + | xargs --no-run-if-empty dirname) ; do + mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" & + done + wait + ''' +} diff --git a/processes/altcode/bin/STAR b/processes/altcode/bin/STAR new file mode 120000 index 00000000..812cae62 --- 
/dev/null +++ b/processes/altcode/bin/STAR @@ -0,0 +1 @@ +../../../third_party/STAR \ No newline at end of file diff --git a/processes/altcode/bin/compress_mtx_files.sh b/processes/altcode/bin/compress_mtx_files.sh new file mode 100755 index 00000000..d71b194f --- /dev/null +++ b/processes/altcode/bin/compress_mtx_files.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# This script finds and gzips all StarSOLO matrix/barcodes/features files. +root_directory=${1:-.} +threads=${2:-10} + +#shellcheck disable=SC2037 +name_query=( '(' -name matrix.mtx -o -name barcodes.tsv -o -name features.tsv ')' ) + +# Gzip regular files +find "$root_directory" -type f \ + "${name_query[@]}" \ + -print0 \ + | xargs --no-run-if-empty -0 -n 1 -P "$threads" gzip + +# Gzip any targets pointed to by a symlink +find "$root_directory" -type l \ + "${name_query[@]}" \ + -print0 \ + | xargs -0 -n1 readlink -f \ + | sort -u \ + | xargs --no-run-if-empty -n 1 -P "$threads" gzip + +# Create new symlinks that point to the new target +find "$root_directory" -type l \ + "${name_query[@]}" \ + | while read -r symlink ; do + target=$(readlink "$symlink") + realtarget=$(readlink -f "$symlink") + if [[ -e "$realtarget.gz" ]] ; then + ln -s "$target.gz" "$symlink.gz" + rm "$symlink" + fi +done diff --git a/processes/altcode/bin/mtx_to_h5.py b/processes/altcode/bin/mtx_to_h5.py new file mode 100755 index 00000000..728b4c06 --- /dev/null +++ b/processes/altcode/bin/mtx_to_h5.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import argparse +import logging + +import scanpy as sc + + +def parser_setup(): + parser = argparse.ArgumentParser() + parser.add_argument("mtx_directory", help="the directory containing the mtx files") + parser.add_argument("output", help="the name of the output file") + parser.add_argument("--compress", action="store_true", + help="Compress output with gzip") + return parser + + +def convert(input_dir, output_file, compress=False): + data = sc.read_10x_mtx(input_dir, cache=False) + comp_method = "gzip" if compress else None + data.write(filename=output_file, compression=comp_method) + +def main(): + poptions = parser_setup().parse_args() + if not poptions.output.endswith("h5ad"): + logging.warning( + "output file extension is not '.h5ad', some programs may fail to read it" + ) + convert(poptions.mtx_directory, poptions.output, poptions.compress) + +if __name__ == "__main__": + main() diff --git a/processes/altcode/nextflow.config b/processes/altcode/nextflow.config new file mode 100644 index 00000000..e94004d6 --- /dev/null +++ b/processes/altcode/nextflow.config @@ -0,0 +1,16 @@ +includeConfig "../../nextflow.config" + +process { + withName: STAR_solo { + module = [ 'samtools/1.14' ] + } + withName: convert_to_hda5 { + container = "file://${baseDir}/scanpy.sif" + } +} + +apptainer { + enabled = true + cacheDir = "$HOME/.apptainer_nextflow_cache" + runOptions = "--env PREPEND_PATH=/opt/conda/bin:/opt/conda/condabin" +} diff --git a/processes/altcode/scanpy.apptainer.def b/processes/altcode/scanpy.apptainer.def new file mode 100644 index 00000000..1fa5aaa4 --- /dev/null +++ b/processes/altcode/scanpy.apptainer.def @@ -0,0 +1,14 @@ +Bootstrap: docker +From: mambaorg/micromamba:1.4.9 + +# %setup +# cat /home/nelsonjs/code/stampipes-altcode-dev/processes/altcode/scanpy.environment.yml > ${APPTAINER_ROOTFS}/env.yaml + +%files + $STAMPIPES/processes/altcode/scanpy.environment.yaml /env.yaml + +%post + micromamba install --quiet -y -n base -f /env.yaml && micromamba clean --all --yes + eval "$(micromamba shell hook --shell dash)" + 
micromamba activate + echo "export PATH=$PATH" >> $APPTAINER_ENVIRONMENT diff --git a/processes/altcode/scanpy.environment.yaml b/processes/altcode/scanpy.environment.yaml new file mode 100644 index 00000000..e054dd2e --- /dev/null +++ b/processes/altcode/scanpy.environment.yaml @@ -0,0 +1,135 @@ +name: base +channels: +- conda-forge +dependencies: +- _libgcc_mutex=0.1=conda_forge +- _openmp_mutex=4.5=2_gnu +- anndata=0.9.2=pyhd8ed1ab_0 +- arpack=3.7.0=hdefa2d7_2 +- brotli=1.0.9=h166bdaf_9 +- brotli-bin=1.0.9=h166bdaf_9 +- brotli-python=1.0.9=py311ha362b79_9 +- bzip2=1.0.8=h7f98852_4 +- c-ares=1.19.1=hd590300_0 +- ca-certificates=2023.7.22=hbcca054_0 +- cached-property=1.5.2=hd8ed1ab_1 +- cached_property=1.5.2=pyha770c72_1 +- certifi=2023.7.22=pyhd8ed1ab_0 +- charset-normalizer=3.2.0=pyhd8ed1ab_0 +- colorama=0.4.6=pyhd8ed1ab_0 +- contourpy=1.1.0=py311h9547e67_0 +- cycler=0.11.0=pyhd8ed1ab_0 +- fonttools=4.41.1=py311h459d7ec_0 +- freetype=2.12.1=hca18f0e_1 +- glpk=5.0=h445213a_0 +- gmp=6.2.1=h58526e2_0 +- h5py=3.9.0=nompi_py311he78b9b8_101 +- hdf5=1.14.1=nompi_h4f84152_100 +- icu=72.1=hcb278e6_0 +- idna=3.4=pyhd8ed1ab_0 +- igraph=0.10.6=h97b68dd_0 +- importlib-metadata=6.8.0=pyha770c72_0 +- importlib_metadata=6.8.0=hd8ed1ab_0 +- joblib=1.3.0=pyhd8ed1ab_1 +- keyutils=1.6.1=h166bdaf_0 +- kiwisolver=1.4.4=py311h4dd048b_1 +- krb5=1.21.1=h659d440_0 +- lcms2=2.15=haa2dc70_1 +- ld_impl_linux-64=2.40=h41732ed_0 +- leidenalg=0.10.1=py311hb755f60_0 +- lerc=4.0.0=h27087fc_0 +- libaec=1.0.6=hcb278e6_1 +- libblas=3.9.0=17_linux64_openblas +- libbrotlicommon=1.0.9=h166bdaf_9 +- libbrotlidec=1.0.9=h166bdaf_9 +- libbrotlienc=1.0.9=h166bdaf_9 +- libcblas=3.9.0=17_linux64_openblas +- libcurl=8.2.0=hca28451_0 +- libdeflate=1.18=h0b41bf4_0 +- libedit=3.1.20191231=he28a2e2_2 +- libev=4.33=h516909a_1 +- libexpat=2.5.0=hcb278e6_1 +- libffi=3.4.2=h7f98852_5 +- libgcc-ng=13.1.0=he5830b7_0 +- libgfortran-ng=13.1.0=h69a702a_0 +- libgfortran5=13.1.0=h15d22d2_0 +- libgomp=13.1.0=he5830b7_0 +- libhwloc=2.9.1=nocuda_h7313eea_6 +- libiconv=1.17=h166bdaf_0 +- libjpeg-turbo=2.1.5.1=h0b41bf4_0 +- liblapack=3.9.0=17_linux64_openblas +- libleidenalg=0.11.1=h00ab1b0_0 +- libllvm14=14.0.6=hcd5def8_3 +- libnghttp2=1.52.0=h61bc06f_0 +- libnsl=2.0.0=h7f98852_0 +- libopenblas=0.3.23=pthreads_h80387f5_0 +- libpng=1.6.39=h753d276_0 +- libsqlite=3.42.0=h2797004_0 +- libssh2=1.11.0=h0841786_0 +- libstdcxx-ng=13.1.0=hfd8a6a1_0 +- libtiff=4.5.1=h8b53f26_0 +- libuuid=2.38.1=h0b41bf4_0 +- libwebp-base=1.3.1=hd590300_0 +- libxcb=1.15=h0b41bf4_0 +- libxml2=2.11.4=h0d562d8_0 +- libzlib=1.2.13=hd590300_5 +- llvmlite=0.40.1=py311ha6695c7_0 +- matplotlib-base=3.7.2=py311h54ef318_0 +- metis=5.1.1=h59595ed_0 +- mpfr=4.2.0=hb012696_0 +- munkres=1.1.4=pyh9f0ad1d_0 +- natsort=8.4.0=pyhd8ed1ab_0 +- ncurses=6.4=hcb278e6_0 +- networkx=3.1=pyhd8ed1ab_0 +- numba=0.57.1=py311h96b013e_0 +- numpy=1.24.4=py311h64a7726_0 +- openjpeg=2.5.0=hfec8fc6_2 +- openssl=3.1.1=hd590300_1 +- packaging=23.1=pyhd8ed1ab_0 +- pandas=2.0.3=py311h320fe9a_1 +- patsy=0.5.3=pyhd8ed1ab_0 +- pillow=10.0.0=py311h0b84326_0 +- pip=23.2.1=pyhd8ed1ab_0 +- platformdirs=3.9.1=pyhd8ed1ab_0 +- pooch=1.7.0=pyha770c72_3 +- procps-ng=4.0.3=h8228510_0 +- pthread-stubs=0.4=h36c2ea0_1001 +- pynndescent=0.5.10=pyh1a96a4e_0 +- pyparsing=3.0.9=pyhd8ed1ab_0 +- pysocks=1.7.1=pyha2e5f31_6 +- python=3.11.4=hab00c5b_0_cpython +- python-dateutil=2.8.2=pyhd8ed1ab_0 +- python-igraph=0.10.6=py311h4b1723a_0 +- python-tzdata=2023.3=pyhd8ed1ab_0 +- python_abi=3.11=3_cp311 +- pytz=2023.3=pyhd8ed1ab_0 +- 
readline=8.2=h8228510_1 +- requests=2.31.0=pyhd8ed1ab_0 +- scanpy=1.9.3=pyhd8ed1ab_0 +- scikit-learn=1.3.0=py311hc009520_0 +- scipy=1.11.1=py311h64a7726_0 +- seaborn=0.12.2=hd8ed1ab_0 +- seaborn-base=0.12.2=pyhd8ed1ab_0 +- session-info=1.0.0=pyhd8ed1ab_0 +- setuptools=68.0.0=pyhd8ed1ab_0 +- six=1.16.0=pyh6c4a22f_0 +- statsmodels=0.14.0=py311h1f0f07a_1 +- stdlib-list=0.8.0=pyhd8ed1ab_0 +- suitesparse=5.10.1=h9e50725_1 +- tbb=2021.9.0=hf52228f_0 +- texttable=1.6.7=pyhd8ed1ab_0 +- threadpoolctl=3.2.0=pyha21a80b_0 +- tk=8.6.12=h27826a3_0 +- tqdm=4.65.0=pyhd8ed1ab_1 +- typing-extensions=4.7.1=hd8ed1ab_0 +- typing_extensions=4.7.1=pyha770c72_0 +- tzdata=2023c=h71feb2d_0 +- umap-learn=0.5.3=py311h38be061_1 +- urllib3=2.0.4=pyhd8ed1ab_0 +- wheel=0.41.0=pyhd8ed1ab_0 +- xorg-libxau=1.0.11=hd590300_0 +- xorg-libxdmcp=1.1.3=h7f98852_0 +- xz=5.2.6=h166bdaf_0 +- zipp=3.16.2=pyhd8ed1ab_0 +- zstd=1.5.2=hfc55251_7 + From 519aa894179fda8722a36e1f9ea0a3d5a155e7f4 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 27 Jul 2023 16:31:57 -0700 Subject: [PATCH 068/172] Add process_altcode.bash --- processes/altcode/process_altcode.bash | 78 ++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100755 processes/altcode/process_altcode.bash diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash new file mode 100755 index 00000000..48373c57 --- /dev/null +++ b/processes/altcode/process_altcode.bash @@ -0,0 +1,78 @@ +#!/bin/bash +set -euo pipefail + +version=1.0.0-alpha1 +cd "$(dirname "$0")" + +# Temporarily hardcoded! +R1_BARCODE_POS=78 +R2_BARCODE_POS=48 +R3_BARCODE_POS=10 +R1_BARCODE_LEN=8 +R2_BARCODE_LEN=8 +R3_BARCODE_LEN=8 + +outdir="output_$version" +status_file="$outdir/status.json" + +# TODO: improve REDO_ALIGNMENT handling - should we be manually removing the work dir? +if [[ -e "$status_file" && -z "$REDO_ALIGNMENT" ]] ; then + # Check to see if the alignment is complete + if jq -e '.completed_on' "$status_file" ; then + echo "Processing already completed, exiting." + echo "To force re-run, set the env var 'REDO_ALIGNMENT=True' or remove $status_file" + exit 0 + fi +fi + +# Dependencies +source "$MODULELOAD" +module purge +module load jdk/11.0.16 +module load nextflow/22.04.3 +module load python/3.5.1 +module load apptainer/1.1.2 + +export NXF_VER=23.04.2 + +source "$PYTHON3_ACTIVATE" +source "$STAMPIPES/scripts/sentry/sentry-lib.bash" + +WORKROOT=${WORKROOT:-/net/seq/scratch} +if ! 
[[ -d "$WORKROOT" ]] ; then + echo "WORKROOT '$WORKROOT' does not exist, using '$PWD'" + WORKROOT=$PWD +fi +WORKDIR=$WORKROOT/$USER/altseq/FC$FLOWCELL/work/ + + +# Write parameter file +params=params.yaml +cat >$params < Date: Thu, 27 Jul 2023 17:02:31 -0700 Subject: [PATCH 069/172] altcode config: remove unneeded runOptions --- processes/altcode/nextflow.config | 1 - 1 file changed, 1 deletion(-) diff --git a/processes/altcode/nextflow.config b/processes/altcode/nextflow.config index e94004d6..32d6ada9 100644 --- a/processes/altcode/nextflow.config +++ b/processes/altcode/nextflow.config @@ -12,5 +12,4 @@ process { apptainer { enabled = true cacheDir = "$HOME/.apptainer_nextflow_cache" - runOptions = "--env PREPEND_PATH=/opt/conda/bin:/opt/conda/condabin" } From 41f26ff3b01aeea640036e6bc502931c1cb2b13a Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 27 Jul 2023 17:03:05 -0700 Subject: [PATCH 070/172] improve convert_to_hda5 error detection --- processes/altcode/altcode.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 548ecf02..6387430a 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -51,7 +51,7 @@ def pos_to_str(start, length) { /// This process creates the Aligned.out.cram file and STARsolo analysis results process STAR_solo { - publishDir params.outdir + publishDir params.outdir, mode: "copy" cpus 10 memory "50 GB" @@ -128,7 +128,7 @@ process STAR_solo { process convert_to_hda5 { cpus 10 memory "10 GB" - publishDir params.outdir + publishDir params.outdir, mode: "copy" input: tuple(val(meta), path(directory)) @@ -138,10 +138,11 @@ process convert_to_hda5 { shell: ''' + set -m for dir_name in $(find -L "!{directory}" -name matrix.mtx.gz \ | grep -v "SJ/raw" \ | xargs --no-run-if-empty dirname) ; do - mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" & + (mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" || kill 0 ) & done wait ''' From dbd0fad0bbf88b0b2246b1f3a29b95ebda65a353 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 30 Jul 2023 12:34:24 -0700 Subject: [PATCH 071/172] Add 'scratch false' for jobs that don't need /tmp/ --- modules/adapter_trimming.nf | 3 +++ modules/cram.nf | 2 ++ modules/umt.nf | 2 ++ processes/bwa/process_bwa_paired_trimmed.nf | 14 +++++++++++++- .../aggregation/cufflinks_featurecounts.nf | 8 ++++++++ processes/rna-star/nextflow.config | 12 ++++++++++++ 6 files changed, 40 insertions(+), 1 deletion(-) diff --git a/modules/adapter_trimming.nf b/modules/adapter_trimming.nf index e0cef1c9..d85c3aae 100644 --- a/modules/adapter_trimming.nf +++ b/modules/adapter_trimming.nf @@ -6,6 +6,7 @@ process fastp_adapter_trim { // 0.23.0 enables deterministic results, which is crucial module 'fastp/0.21.0' cpus 3 + scratch false input: tuple path(r1), path(r2), val(adapterP5), val(adapterP7) @@ -33,6 +34,8 @@ process fastp_adapter_trim { /// Our custom in-house adapter-trimming script process adapter_trim { + cpus 3 + scratch false input: tuple path(r1), path(r2), val(adapterP5), val(adapterP7) diff --git a/modules/cram.nf b/modules/cram.nf index cacadf9d..2fcdffc7 100644 --- a/modules/cram.nf +++ b/modules/cram.nf @@ -25,6 +25,7 @@ process encode_cram { container "quay.io/biocontainers/samtools:1.12--h9aed4be_1" cpus Math.ceil(params.cram_compression_threads / 2) + scratch false input: tuple val(meta), path(input_bam), path(reference) @@ -134,6 +135,7 @@ process encode_cram_no_ref { container "quay.io/biocontainers/samtools:1.12--h9aed4be_1" 
cpus Math.ceil(params.cram_compression_threads / 2) + scratch false input: tuple val(meta), path(input_bam) diff --git a/modules/umt.nf b/modules/umt.nf index c3a8ba58..53a9d64f 100644 --- a/modules/umt.nf +++ b/modules/umt.nf @@ -2,6 +2,7 @@ /// A generic script to move the UMT from a read name to the RX tag in a BAM file process move_umt { + scratch false input: path(input_bam) output: @@ -18,6 +19,7 @@ process move_umt { /// UMT-trimming for Takara Pico v3 kits process takara_trim_umt { + scratch false input: tuple path("in.r1.fq.gz"), path("in.r2.fq.gz") val readlength diff --git a/processes/bwa/process_bwa_paired_trimmed.nf b/processes/bwa/process_bwa_paired_trimmed.nf index 6cf3a4dd..2dbf2271 100755 --- a/processes/bwa/process_bwa_paired_trimmed.nf +++ b/processes/bwa/process_bwa_paired_trimmed.nf @@ -92,6 +92,7 @@ process split_r2_fastq { process trim_adapters { cpus params.threads + scratch false input: file split_r1 @@ -125,6 +126,7 @@ process trim_adapters { */ process trim_to_length { + scratch false input: set file(r1), file(r2) from trimmed @@ -148,6 +150,7 @@ process trim_to_length { process add_umi_info { + scratch false input: set file(r1), file(r2) from trimmed_fastq @@ -183,6 +186,7 @@ process add_umi_info { */ process fastq_counts { + scratch false input: file(r1) from file(params.r1) file(r2) from file(params.r2) @@ -253,6 +257,8 @@ process align { */ process filter_bam { + scratch false + input: file unfiltered_bam file nuclear_chroms from file(nuclear_chroms) @@ -275,6 +281,7 @@ process filter_bam { process sort_bam { cpus params.threads + scratch false input: file filtered_bam @@ -285,7 +292,7 @@ process sort_bam { script: """ samtools sort \ - -l 0 -m 1G -@ "${params.threads}" "$filtered_bam" \ + -l 0 -m 2G -@ "${params.threads}" "$filtered_bam" \ > sorted.bam """ } @@ -294,6 +301,8 @@ process sort_bam { * Step 3: Merge alignments into one big ol' file */ process merge_bam { + scratch false + input: file 'sorted_bam_*' from sorted_bam.collect() @@ -359,6 +368,7 @@ if (params.UMI) process filter_bam_to_unique { + scratch false input: file marked_bam @@ -381,6 +391,7 @@ uniquely_mapping_bam.into { bam_for_insert; bam_for_spot; bam_for_density } */ process bam_counts { + scratch false input: file(sorted_bam) from marked_bam_for_counts @@ -561,6 +572,7 @@ process total_counts { process cram { publishDir params.outdir cpus params.cramthreads / 2 + scratch false // TODO: put in config module "samtools/1.12" diff --git a/processes/rna-star/aggregation/cufflinks_featurecounts.nf b/processes/rna-star/aggregation/cufflinks_featurecounts.nf index 0f82e94a..6902e561 100644 --- a/processes/rna-star/aggregation/cufflinks_featurecounts.nf +++ b/processes/rna-star/aggregation/cufflinks_featurecounts.nf @@ -91,6 +91,7 @@ workflow RNA_AGG { process merge_transcriptome_bam { module "samtools/1.12" + scratch false input: // Assume sorted by coord file("in*.bam") @@ -111,6 +112,7 @@ process merge_transcriptome_bam { process merge_genome_bam { module "samtools/1.12" + scratch false input: // Assume sorted by coord file("in*.bam") @@ -169,6 +171,7 @@ process mark_duplicate_reads { label 'high_mem' module "jdk/2.8.1", "picard/2.8.1", "samtools/1.12" + scratch false input: path genomebam @@ -194,6 +197,9 @@ process mark_duplicate_reads { process bam_to_fastq { publishDir params.outdir, mode: params.publishmode module "samtools/1.12" + + cpus 3 + scratch false input: path input_bam @@ -412,6 +418,8 @@ process anaquin { module "samtools/1.12", "anaquin/2.0.1", "kallisto/0.43.1", "R/3.2.5" 
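    // A note on the errorStrategy directive below: exit status 143 is 128 + 15,
    // i.e. the task was killed by SIGTERM (typically the scheduler enforcing a
    // time or memory limit), so that case is retried; any other failure of the
    // Anaquin step is ignored rather than failing the whole aggregation run.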
publishDir params.outdir, mode: params.publishmode + errorStrategy { task.exitStatus == 143 ? 'retry' : 'ignore' } + input: path input_bam path sequins_ref diff --git a/processes/rna-star/nextflow.config b/processes/rna-star/nextflow.config index 3e36005a..e375febf 100644 --- a/processes/rna-star/nextflow.config +++ b/processes/rna-star/nextflow.config @@ -8,4 +8,16 @@ profiles { starIndexDir = "/net/seq/data/genomes/human/GRCh38/noalts/STARgenome-gencode-v25/" } } + + + // Uncomment this to send STAR alignments to the 'bigmem' queue and give them extra memory + // cluster { + // process { + // withName: star { + // queue = 'bigmem' + // memory = { 96.GB * (2**(task.attempt - 1)) } + // scratch = false + // } + // } + // } } From 331479f533a0818fdc71f74b3ae64854d2db2518 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 30 Jul 2023 12:57:10 -0700 Subject: [PATCH 072/172] Make sure altseq runs on hpcz-2 --- scripts/flowcells/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index cc07289b..d5db7ab5 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -205,8 +205,8 @@ if [[ "$read1length" = "0" ]] ; then cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash sbatch --cpus 1 \ - --mem '2G' \ - --partition queue0 \ + --mem '4G' \ + --partition hpcz-2 \ --job-name "altseq-$flowcell-supervisor" < Date: Sun, 30 Jul 2023 13:24:15 -0700 Subject: [PATCH 073/172] Fix ref for altseq - was live but not committed --- processes/altseq/process_altseq.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 7fbc0040..04620672 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -47,8 +47,8 @@ python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) -GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ -GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified +GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome/ +GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/refdata-gex-GRCh38-2020-A/fasta/genome.fa BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt WORKROOT=${WORKROOT:-/net/seq/scratch} From 4680d91ad7c134fd1aba4128a51a3708477fbc65 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 30 Jul 2023 14:13:47 -0700 Subject: [PATCH 074/172] Various accumulated fixes from production --- modules/utility.nf | 5 +++- processes/bwa/aggregate/nextflow.config | 26 +++++++++++++++++++ processes/bwa/nextflow.config | 6 ++--- .../rna-star/aggregation/nextflow.config | 13 ++++++++++ processes/rna-star/modules/star.nf | 10 ++++--- scripts/flowcells/link_nextseq.py | 3 +++ scripts/lims/upload_data.py | 14 +++++----- 7 files changed, 62 insertions(+), 15 deletions(-) diff --git a/modules/utility.nf b/modules/utility.nf index b2398097..2fda9c31 100644 --- a/modules/utility.nf +++ b/modules/utility.nf @@ -1,12 +1,13 @@ /// This file is only for "utility" processes that are extremely generic. 
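// The publish helpers below only move finished files into place, so they run
// on the local executor and, with "container null", opt out of any globally
// configured container. Publishing with mode "link" hard-links results rather
// than copying them, which is fast and saves space but assumes the output
// directory sits on the same filesystem as the Nextflow work directory.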
params.outdir = "output" -params.publishmode = "copy" +params.publishmode = "link" process publish_and_rename { publishDir params.outdir, mode: params.publishmode executor "local" + container null input: tuple val(filename), path("__infile__") @@ -21,6 +22,7 @@ process publish_and_rename { process publish { publishDir params.outdir, mode: params.publishmode + container null executor "local" @@ -38,6 +40,7 @@ process publish { process publish_with_meta { publishDir params.outdir, mode: params.publishmode executor "local" + container null input: tuple val(meta), path(filename) diff --git a/processes/bwa/aggregate/nextflow.config b/processes/bwa/aggregate/nextflow.config index c5694d45..c4bdca3e 100644 --- a/processes/bwa/aggregate/nextflow.config +++ b/processes/bwa/aggregate/nextflow.config @@ -1,6 +1,8 @@ includeConfig '../../../nextflow.config' conda.cacheDir = "${baseDir}/../../../environments" + + process { withLabel: "footprints" { process.container = "fwip/ftd" @@ -35,6 +37,29 @@ profiles { } } + cluster { + process { + // Super temporary! + // errorStrategy = 'retry' + + withName: 'density|multimapping_density|cutcounts|insert_sizes' { + // Density process sometimes reports OOM with a 255 exit code + errorStrategy = { task.exitStatus in [1, 137, 143, 255] ? 'retry' : 'terminate' } + + // Temporary, for some specific aggs + memory = { 96.GB * (2**(task.attempt - 1)) } + //queue = 'bigmem' + } + // process { + // withName: insert_sizes { + // scratch = false + // } + // } + + + } + } + test { params { genome = "$baseDir/../../../test_data/ref/chr22.fa" @@ -64,5 +89,6 @@ profiles { file = "dag.html" } } + } diff --git a/processes/bwa/nextflow.config b/processes/bwa/nextflow.config index 24526f6c..cdef24e2 100644 --- a/processes/bwa/nextflow.config +++ b/processes/bwa/nextflow.config @@ -13,9 +13,9 @@ profiles { withName: 'filter_bam' { module = 'samtools/1.3:python/3.5.1:pysam/0.9.0' } - withName: 'sort_bam' { module = 'samtools/1.3' } - withName: 'merge_bam' { module = 'samtools/1.3' } - withName: 'filter_bam_to_unique' { module = 'samtools/1.3' } + withName: 'sort_bam' { module = 'samtools/1.12' } + withName: 'merge_bam' { module = 'samtools/1.12' } + withName: 'filter_bam_to_unique' { module = 'samtools/1.12' } withName: 'mark_duplicates' { module = 'jdk/1.8.0_92:picard/2.8.1:samtools/1.3' } diff --git a/processes/rna-star/aggregation/nextflow.config b/processes/rna-star/aggregation/nextflow.config index da39d61b..e7280e18 100644 --- a/processes/rna-star/aggregation/nextflow.config +++ b/processes/rna-star/aggregation/nextflow.config @@ -25,4 +25,17 @@ profiles { outdir = "test-cufflinks-agg" } } + + + // Uncomment this for giant aggregations which would overflow /tmp + // cluster { + // process { + // withName: ribosomal_count { + // scratch = false + // } + // withName: density { + // scratch = false + // } + // } + // } } diff --git a/processes/rna-star/modules/star.nf b/processes/rna-star/modules/star.nf index 2f285b7b..17025cb8 100644 --- a/processes/rna-star/modules/star.nf +++ b/processes/rna-star/modules/star.nf @@ -10,6 +10,7 @@ process star { publishDir params.outdir, enabled: params.publish label 'high_mem' + //memory "256 GB" input: tuple path(r1_fq), path(r2_fq) @@ -21,9 +22,10 @@ process star { script: mode = "str_PE" threads = params.star_threads - // Ten gigabyte fastq files will need more RAM to sort - is_giant = r1_fq.size() > 10_000_000_000 - sort_ram = is_giant ? 
60_000_000_000 : 30_000_000_000 + // Four-gigabyte fastq files will need more RAM to sort + is_giant = r1_fq.size() > 3_000_000_000 + sort_ram = is_giant ? 200_000_000_000 : 30_000_000_000 + sj_limit = is_giant ? "--limitOutSJcollapsed 5000000" : "" """ # TODO: Update this?? echo -e '@CO\tANNID:gencode.basic.tRNA.annotation.gtf.gz' > commentslong.txt @@ -47,7 +49,7 @@ process star { --limitBAMsortRAM ${sort_ram} \ --outSAMtype BAM SortedByCoordinate \ --quantMode TranscriptomeSAM \ - --outSAMheaderCommentFile commentslong.txt \ + --outSAMheaderCommentFile commentslong.txt ${sj_limit}\ --outSAMheaderHD '@HD' 'VN:1.4' 'SO:coordinate' # TODO: add stranded options """ diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index d88ed23a..2792e519 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -73,6 +73,9 @@ def create_links( If dry_run is passed, will print them instead of creating them """ + # Skip processing the lane if it's not getting aligned + if not lane.get("alignments"): + return False sample_name = lane["alignments"][0]["sample_name"] short_name = lane["samplesheet_name"] diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 0945b98a..cfcdf21e 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -479,13 +479,13 @@ def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose file_size = os.path.getsize(path) last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) - #if exists: - #recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) - - # TODO: Make time-checking work! - # Current issue: sub-second precision. - if skip_md5_check and exists and exists["size_bytes"] == file_size :#and last_modified == recorded_mtime: - log.info("File exists and matches recorded size, skipping %s" % path) + if skip_md5_check and exists and exists["size_bytes"] == file_size: + recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( + exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) + # Allow for sloppiness in NFS timestamps + difference = recorded_mtime - last_modified + if timedelta(minutes=-1) <= difference <= timedelta(minutes=1): + log.info("File exists and matches recorded size, skipping %s" % path) return md5sum = md5sum_file(path) From 36d35fb54d8bc3a99180173da3bacb8e97d9dc1f Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 30 Jul 2023 16:33:13 -0700 Subject: [PATCH 075/172] Add upload script --- processes/altcode/process_altcode.bash | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 48373c57..bd3b296c 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -76,3 +76,26 @@ nextflow run \ -workdir "$WORKDIR" \ -profile cluster \ -resume + +## Upload fastq metadata +#python "$STAMPIPES/scripts/altseq/upload_data.py" \ + #"$sample_config" \ + #processing.json \ + #--output_file_directory "$outdir" + +# Create sentinel/status file +if [[ -e "$status_file" ]] ; then + old_date=$(jq .completed_on <<< "$status_file") + old_status_file=${status_file/json/$old_date}.json + mv "$status_file" "$old_status_file" +fi + +# TODO: What else do we want to capture here? It would be nice to at least +# capture the command used and relevant env vars +echo | jq . 
> "$status_file" < Date: Sun, 30 Jul 2023 16:55:34 -0700 Subject: [PATCH 076/172] setup.sh fixes --- scripts/flowcells/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index d5db7ab5..b2d78dd3 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -254,13 +254,13 @@ _REG_BCL_CMD_ read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - for samplesheet in \$PWD/SampleSheet.withmask*csv ; do + for samplesheet in SampleSheet.withmask*csv ; do bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask") bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "${illumina_dir}/\$fastq_dir" \\\\ - --use-bases-mask "$bcl_mask" \\\\ + --use-bases-mask "\$bcl_mask" \\\\ --barcode-mismatches "$mismatches" \\\\ --sample-sheet "${illumina_dir}/\$samplesheet" \\\\ --writing-threads 0 \\\\ From b1f1e65dc987519004777177fc9794d472f2fdee Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Aug 2023 16:13:58 -0700 Subject: [PATCH 077/172] Bunch of changes to support LibraryPool dirs --- scripts/apilaneprocess.py | 22 +++++++++++++++++++++- scripts/flowcells/link_nextseq.py | 18 +++++++++++++++--- scripts/flowcells/setup.sh | 8 +++++++- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index c6a955ca..0abee608 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -3,6 +3,7 @@ import sys import argparse import logging +import re import requests import collections @@ -149,6 +150,19 @@ def setup_lane(self, lane_id): processing_info = self.get_lane_process_info(lane_id) + pool_name = None + try: + lib_number = processing_info["libraries"][0]["library"] + library_info = self.api.get_single_result(url_addition="library/?number=%d/" % lib_number) + pools = library_info["librarypools"] + if pools: + pool_name = pools[0]["object_name"] + logging.debug("Setting up script for pool %s", pool_name) + except: + pass + + # Check if in pool + self.create_script(processing_info) def add_script(self, script_file, lane_id, flowcell_label, sample_name): @@ -169,7 +183,7 @@ def get_script_template(self): return open(self.script_template, 'r').read() - def create_script(self, processing_info): + def create_script(self, processing_info, pool=None): lane = processing_info["libraries"][0] @@ -182,6 +196,12 @@ def create_script(self, processing_info): if alt_dir: fastq_directory = os.path.join(alt_dir, "fastq", "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"]) + if pool: + flowcell_dir = re.sub(r"/Project.*", "", lane["directory"]) + if alt_dir: + flowcell_dir=alt_dir + fastq_directory = os.path.join(flowcell_dir, "fastq", "Project_%s" % lane["project"], "LibraryPool_%s" % pool) + barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index'] try: # Preferred name diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 2792e519..d1abd319 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -66,7 +66,7 @@ def parser_setup(): def create_links( - lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False + lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False, is_pool=False, ): """ Create the links between the input directories and output dir @@ -84,10 +84,11 
@@ def create_links( output_basedir, "Undetermined_indices", "Sample_lane1" ) else: + prefix = "LibraryPool" if is_pool else "Sample" output_dir = os.path.join( output_basedir, "Project_%s" % lane["project"], - "Sample_%s" % lane["samplesheet_name"], + "%s_%s" % (prefix, lane["samplesheet_name"]), ) short_name = re.sub(r"_", "-", short_name) @@ -146,9 +147,20 @@ def main(): } for read in ["R1", "R2"]: create_links( - undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, True + undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, undetermined=True ) + for pool in data["library_pools"].keys(): + lane = { + "samplesheet_name": pool, + "alignments": [{"sample_name": pool}], + "project": "Lab", + } + for read in ["R1", "R2"]: + create_links( + lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True + ) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index b2d78dd3..88bd7928 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -270,7 +270,7 @@ read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ _NOVA_BCL_CMD_ read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' -for fq_dir in fastq-withmask-* ; +for fq_dir in fastq-withmask-* ; do [[ -d $fq_dir ]] || continue python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_dir" -o Demultiplexed -p processing.json done @@ -695,6 +695,12 @@ rsync -avP "$samplesheet" "$analysis_dir" mkdir -p "\$destination" rsync -aL "\$dir/" "\$destination/" done + for dir in Project*/LibraryPool* ; do + destination=$analysis_dir + destination=\$destination/\$dir + mkdir -p "\$destination" + rsync -aL "\$dir/" "\$destination/" + done ) From 05c29141b755f24d74dcb3c05b55c66934cc287c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Aug 2023 16:14:15 -0700 Subject: [PATCH 078/172] Remove pylint dep to solve version conflict --- scripts/requirements.pip.txt.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/requirements.pip.txt.lock b/scripts/requirements.pip.txt.lock index 2f98b6f3..0403bdc6 100644 --- a/scripts/requirements.pip.txt.lock +++ b/scripts/requirements.pip.txt.lock @@ -66,7 +66,7 @@ pybind11==2.2.4 pycodestyle==2.5.0 pyflakes==2.1.1 Pygments==2.4.2 -pylint==2.4.4 +#pylint==2.4.4 pyparsing==2.1.5 PyQt5==5.12.2 PyQt5-sip==4.19.17 From 8383e424801acddfb1e210b53ca65f21a236fb91 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 8 Aug 2023 09:22:43 -0700 Subject: [PATCH 079/172] remove commented out config --- processes/altcode/scanpy.apptainer.def | 3 --- 1 file changed, 3 deletions(-) diff --git a/processes/altcode/scanpy.apptainer.def b/processes/altcode/scanpy.apptainer.def index 1fa5aaa4..f9cee600 100644 --- a/processes/altcode/scanpy.apptainer.def +++ b/processes/altcode/scanpy.apptainer.def @@ -1,9 +1,6 @@ Bootstrap: docker From: mambaorg/micromamba:1.4.9 -# %setup -# cat /home/nelsonjs/code/stampipes-altcode-dev/processes/altcode/scanpy.environment.yml > ${APPTAINER_ROOTFS}/env.yaml - %files $STAMPIPES/processes/altcode/scanpy.environment.yaml /env.yaml From 8e5b532f54fbed4f8b89f5ef6a2d16134139dc94 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 11:25:42 -0700 Subject: [PATCH 080/172] fix process_altcode.bash --- processes/altcode/process_altcode.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index bd3b296c..dffc7a3d 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,5 +1,5 @@ #!/bin/bash -set -euo pipefail +set -eo pipefail version=1.0.0-alpha1 cd "$(dirname "$0")" @@ -73,7 +73,7 @@ nextflow run \ -params-file "$params" \ -ansi-log false \ -with-trace \ - -workdir "$WORKDIR" \ + -work-dir "$WORKDIR" \ -profile cluster \ -resume From 1a1b389403808c8e6b4b1d143f62d9c1207e6359 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 11:26:12 -0700 Subject: [PATCH 081/172] Allow collate_fastq to handle altcode --- processes/fastq/collate_fastq.bash | 16 +- scripts/altcode/upload_fastq.py | 672 +++++++++++++++++++++++++++++ 2 files changed, 683 insertions(+), 5 deletions(-) create mode 100644 scripts/altcode/upload_fastq.py diff --git a/processes/fastq/collate_fastq.bash b/processes/fastq/collate_fastq.bash index 2056846b..8cdcc46a 100644 --- a/processes/fastq/collate_fastq.bash +++ b/processes/fastq/collate_fastq.bash @@ -26,11 +26,17 @@ R1_FILE=${FASTQ_NAME}_R1.fastq.gz R2_FILE=${FASTQ_NAME}_R2.fastq.gz function upload { - UPLOAD_SCRIPT="python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane --attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" - $UPLOAD_SCRIPT --attach_file_purpose r1-fastq --attach_file ${R1_FILE} - - if [ -e $R2_FILE ]; then - $UPLOAD_SCRIPT --attach_file_purpose r2-fastq --attach_file ${R2_FILE} + if [[ "$SAMPLE_NAME" == LP* ]] ; then + # Altcode sample, use dedicated script + python3 "$STAMPIPES/scripts/altcode/upload_fastq.py" --lane "$FLOWCELL_LANE_ID" --r1 "$R1_FILE" --r2 "$R2_FILE" + else + # Regular sample, upload old-style + UPLOAD_SCRIPT="python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane --attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" + $UPLOAD_SCRIPT --attach_file_purpose r1-fastq --attach_file "${R1_FILE}" + + if [ -e "$R2_FILE" ]; then + $UPLOAD_SCRIPT --attach_file_purpose r2-fastq --attach_file "${R2_FILE}" + fi fi } diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py new file mode 100644 index 00000000..38b26017 --- /dev/null +++ b/scripts/altcode/upload_fastq.py @@ -0,0 +1,672 @@ +#!/usr/bin/env python3 +""" +Uploads all the results of alt-seq processing to LIMS +""" + +import pprint +import re +import csv +import argparse +import datetime +import hashlib +import json +import logging +import os +import sys +from functools import lru_cache +from collections import defaultdict + +# Make sure we can load our vendored stamlims_api dependency +sys.path.insert( + 1, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", "lims", "stamlims_api" + ), +) + + +from stamlims_api import rest # pylint: disable=wrong-import-position,import-error + +JSON_REPORT_CLASS_SLUG = "altseq-flowcell-report-starsolo" + +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +LOG = logging.getLogger("upload_data.py") + +script_options = { + "base_api_url": None, + "quiet": False, + "debug": False, + "dry_run": False, +} + + +class HashableDict(dict): + """ + A simple hashable dict + Helps cache our GET requests even w/ query params + """ + + def __hash__(self): + return hash(frozenset(self.items())) + + +def parser_setup(): + """Command-line argument setup""" + parser = argparse.ArgumentParser() + + run_opts = 
parser.add_argument_group("core params") + log_opts = parser.add_argument_group("logging options") + lims_opts = parser.add_argument_group("lims options") + + log_opts.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages (only WARN and higher).", + ) + log_opts.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages.", + ) + + lims_opts.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + lims_opts.add_argument( + "-t", "--token", dest="token", help="Your authentication token." + ) + + #run_opts.add_argument("sample_config", help="The sample_config.tsv file") + #run_opts.add_argument("processing_json", help="The processing.json file") + #run_opts.add_argument( + #"--output_file_directory", + #default=".", + #help="The output directory files are stored in. Defaults to cwd.", + #) + run_opts.add_argument("--r1", dest="r1_fastq", help="the r1 file to upload") + run_opts.add_argument("--r2", dest="r2_fastq", help="the r2 file to upload") + run_opts.add_argument("--lane", dest="lane_id", help="the ID of the lane") + #run_opts.add_argument("--flowcell", dest="flowcell_name", help="the name of the flowcell") + + run_opts.add_argument( + "--skip_md5", + dest="skip_md5", + action="store_true", + help="Don't calculate md5sum (debug/dev only)", + ) + + run_opts.add_argument( + "-n", + "--dry_run", + dest="dry_run", + action="store_true", + help="Do not upload anything to LIMS, instead print actions that would be taken", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) + + return parser + + +def md5sum_file(path): + """Calculates the md5sum of a file's contents""" + md5sum = hashlib.md5() + + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + md5sum.update(chunk) + + return md5sum.hexdigest() + + +#def parse_counts_file(counts_file: str): +# """ +# Given a file name, reads a stats file +# format: one stat per line: `name value` (separated by whitespace) +# returns a dict of str->int +# """ +# stats = {} +# with open(counts_file, "r") as counts: +# for line in counts: +# values = line.split() +# count_type_name = values[0] +# if not count_type_name: +# continue +# count = int(values[1]) +# stats[count_type_name] = count +# return stats +# +# +#def build_counts(alignment_id, counts_file): +# """ +# Convert stats into a form ready to be uploaded to LIMS with the +# bulk-stat-create endpoint +# """ +# parsed_stats = parse_counts_file(counts_file) +# return { +# "object_id": alignment_id, +# "content_type": "SequencingData.flowcelllanealignment", +# "stats": parsed_stats, +# } + + +class UploadLIMS: + """ + Contains the logic for uploading things to LIMS + Uses caching for most GET requests + """ + + def __init__(self, api_url, token, dry_run=False, skip_md5=False): + # self.count_types = {} + # self.flowcelllane_contenttype = None + # self.alignment_contenttype = None + # self.aggregation_contenttype = None + self.api = rest.setup_api( + { + rest.LIMS_URL_OPT_VAR: api_url, + rest.LIMS_TOKEN_OPT_VAR: token, + #rest.RAISE_ON_ERROR_VAR: True, + } + ) + self.dry_run = dry_run + self.skip_md5 = skip_md5 + + @lru_cache(maxsize=None) + def get(self, url): + """Cached version of api.get_single_result""" + return self.api.get_single_result(url) + + def get_by_id(self, base_url, object_id, err_message=None): + """Constructs url from ID and calls get""" + url = 
"%s/%d/" % (base_url, object_id) + result = self.get(url) + if not result: + if err_message is None: + err_message = "Failed to fetch %s" % url + LOG.critical(err_message) + return result + + @lru_cache(maxsize=None) + def _get_single_result(self, fetch_url, query=None, field=None): + """Internal memo-izable function, do not use directly""" + result = self.api.get_single_list_result( + url_addition=fetch_url, query_arguments=query + ) + if result is None: + return None + if field is not None: + return result[field] + return result + + def get_single_result(self, fetch_url, query=None, field=None): + """ + Using a list API url that should bring up a single item, retrieve that + single item if it exists. + """ + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + return self._get_single_result(fetch_url, query, field) + + # Not currently used + @lru_cache(maxsize=None) + def _get_list_result(self, url, query=None): + return self.api.get_list_result( + url_addition=url, + query_arguments=query, + item_limit=1000000, + page_size=1000, + ) + + def get_list_result(self, url, query=None): + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + LOG.debug("Query is now: %s", query) + return self._get_list_result(url, query) + + def put(self, *args, **kwargs): + """ + PUT data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have put %s, %s", args, kwargs) + return None + # FIXME: Should use PUT method once API lib supports it + return self.api.patch_single_result(*args, **kwargs) + + def post(self, *args, **kwargs): + """ + POST data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have post %s, %s", args, kwargs) + return None + return self.api.post_single_result(*args, **kwargs) + + def patch(self, *args, **kwargs): + if self.dry_run: + LOG.info("Dry run, would have patch %s, %s", args, kwargs) + return None + return self.api.patch_single_result(*args, **kwargs) + + # def get_flowcell_url_by_label(self, label): + # return self.get_single_result( + # "flowcell_run/", field="url", query={"label": label} + # ) + + def get_contenttype(self, contenttype_name): + """ + Appname uses capitalization, modelname does not. 
+ """ + + (appname, modelname) = contenttype_name.split(".") + + query = { + "app_label": appname, + "model": modelname, + } + ct = self.get_single_result("content_type/", query=query) + if not ct: + LOG.critical("Could not fetch content type %s", contenttype_name) + + return ct + + def get_file_purpose_url(self, slug): + """Get file purpose url from slug""" + return self.get_single_result( + "file_purpose/", query={"slug": slug}, field="url" + ) + + def get_file_type_url(self, slug): + """Gets the file type URL for a slug""" + return self.get_single_result("file_type/", field="url", query={"slug": slug}) + + def upload_directory_attachment( + self, path, contenttype_name, object_id, file_purpose=None + ): + """Uploads a single directory to a LIMS object""" + path = os.path.abspath(path) + if not (contenttype_name and object_id): + LOG.error( + "Cannot attach file %s without both content type and object_id", path + ) + return False + + contenttype = self.get_contenttype(contenttype_name) + if not contenttype: + LOG.error("Cannot attach file %s without contenttype result", path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + if file_purpose and not purpose: + LOG.error( + "Could not find file purpose %s for uploading directory %s", + file_purpose, + path, + ) + return False + LOG.debug("File purpose: %s", purpose) + + existing_data = self.get_single_result("directory/", query={"path": path}) + data = existing_data if existing_data else {} + + data.update( + { + "path": path, + "content_type": contenttype["url"], + "object_id": object_id, + "purpose": purpose, + } + ) + + if existing_data: + LOG.info("Updating information for directory %s", path) + result = self.put(url=data["url"], data=data) + else: + LOG.info("Uploading information for directory %s", path) + result = self.post("directory/", data=data) + + if not result: + LOG.error("Could not upload directory %s", path) + LOG.debug(data) + else: + LOG.debug(result) + + return True + + def upload_files(self, r1, r2, lane_id): + lane_ids = self.get_lane_ids(lane_id) + self.upload_file(r1, "SequencingData.flowcelllane", lane_ids, file_purpose="r1-fastq", file_type="gzipped-fastq") + self.upload_file(r2, "SequencingData.flowcelllane", lane_ids, file_purpose="r2-fastq", file_type="gzipped-fastq") + + def upload_file( + self, path, contenttype_name, object_ids, file_purpose=None, file_type=None + ): + """ + Upload a file's metadata to LIMS + It will be attached to many objects. 
+ """ + # FIXME: This method makes a GET and PUT request for every single object + # Will require LIMS API updates to enable a more performant solution + + upload_data = self.get_file_upload_data( + path, contenttype_name, file_purpose, file_type + ) + LOG.debug("Uploading file %s, to %d objects", path, len(object_ids)) + if self.skip_md5: + LOG.info("Skipping md5sum") + upload_data["md5sum"] = "0" + else: + LOG.debug("Running md5sum...") + upload_data["md5sum"] = md5sum_file(path) + + content_type_id = re.search(r"(\d+)/?$", upload_data["content_type"]).group(1) + purpose_id = re.search(r"(\d+)/?$", upload_data["purpose"]).group(1) + for object_id in object_ids: + upload_data.update({"object_id": object_id}) + exists = self.get_single_result( + "file/", + query={ + "object_id": object_id, + "purpose": purpose_id, + "content_type": content_type_id, + }, + ) + + if exists: + if exists == upload_data: + LOG.info( + "No change to information for file %s, lane %d, not updating", + path, + object_id, + ) + result = True + else: + LOG.info( + "Updating information for file %s: lane %d", path, object_id + ) + result = self.put(url=exists["url"], data=upload_data) + else: + LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + result = self.post("file/", data=upload_data) + + if not result: + LOG.error("Could not upload file %s for ID %d", path, object_id) + LOG.debug(upload_data) + else: + LOG.debug(result) + + def get_file_upload_data( + self, path, contenttype_name, file_purpose=None, file_type=None + ): + """ + Gets the file upload data that is easy to query + (notable omission: md5sum, as it takes a long time to calculate) + """ + path = os.path.abspath(path) + + contenttype = self.get_contenttype(contenttype_name) + if not contenttype: + LOG.error("Cannot attach file %s without contenttype result", path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + if file_purpose and not purpose: + LOG.error( + "Could not find file purpose %s for uploading file %s", + file_purpose, + path, + ) + return False + if purpose: + LOG.debug("File Purpose: %s", purpose) + + ftype = self.get_file_type_url(file_type) + if file_type and not ftype: + LOG.error( + "Could not find file type %s for uploading file %s", file_type, path + ) + return False + if file_type: + LOG.debug("File Type: %s", ftype) + + file_size = os.path.getsize(path) + last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + + # Current issue: sub-second precision. 
+ data = { + "path": path, + "content_type": contenttype["url"], + "purpose": purpose, + "filetype": ftype, + "file_last_modified": last_modified, + "size_bytes": file_size, + } + + LOG.debug(data) + return data + + def get_flowcell_lane(self, flowcell_lane_id): + """Gets the flowcell lane by ID""" + return self.get_by_id("flowcell_lane", flowcell_lane_id) + + def get_library(self, library_id): + """Gets the library by ID (NOT library number)""" + return self.get_by_id("library", library_id) + + + # gets the other lane ids for this lane/pool + def get_lane_ids(self, lane_id): + def extract_id_from_url(url): + return re.sub(r'[^\d]', "", url) + lane_info = self.get_by_id("flowcell_lane", int(lane_id)) + pool_info = self.api.get_single_result(url=lane_info["library_pool"]) + lib_ids = [] + flowcell_id = extract_id_from_url(lane_info["flowcell"]) + for lib_url in pool_info["libraries"]: + lib_id = extract_id_from_url(lib_url) + lib_ids.append(lib_id) + + lanes_query = "flowcell_lane/?flowcell=%s&lane=%d&page_size=1000" % ( + flowcell_id, + lane_info["lane"], + ) + lane_info = self.get_list_result(lanes_query) + lanes_in_pool = [] + for l in lane_info: + library_id = extract_id_from_url(l["library"]) + if library_id in lib_ids: + lanes_in_pool.append(l["id"]) + return lanes_in_pool + + + #def upload_flowcell_report(self, data): + # flowcell_labels = set(pool["flowcell_label"] for pool in data) + # assert len(flowcell_labels) == 1 + # flowcell_label = flowcell_labels.pop() + + # report_name = "Alt-seq stats: FC%s" % flowcell_label + + # flowcell_lims_info = self.get_single_result( + # "flowcell_run/?label=%s" % flowcell_label) + # content_type_id = flowcell_lims_info['object_content_type'] + # content_type = self.get_by_id("content_type", content_type_id) + # object_id = flowcell_lims_info['id'] + # json_report_class = self.get_single_result( + # "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) + + # # See if report already exists + # existing_reports = self.get_list_result("json_report/", query={ + # "object_id": object_id, + # "content_type": content_type["id"], + # "report_class": json_report_class["id"], + # "page_size": 2, + # }) + + # data_to_send = { + # "object_id": object_id, + # "content_type": content_type["url"], + # "report_class": json_report_class["url"], + # "name": report_name, + # "json_content": json.dumps(data), + # } + # if len(existing_reports) == 0: + # self.post("json_report/", data=data_to_send) + # # No report exists yet, upload a new one + # elif len(existing_reports) == 1: + # # Exactly one report, update it + # url_to_patch = "json_report/%d/" % existing_reports[0]["id"] + # self.patch(url_to_patch, data=data_to_send) + # else: + # # Error! too many reports + # LOG.critical("Too many JSON reports exist") + # raise "Too many JSON reports exist, exiting" + + + #def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): + # """ + # Main function for this script. 
+ # Given paths to the sample_config file, processing_dict, and outdir, + # upload to LIMS: + # 1) Paths for fastq files for each lane + # # 2) Stats for each alignment + # 3) Flowcell-level pool stats + # """ + # # (Filepath, purpose) -> [lane_ids] + # files_to_upload = defaultdict(list) + + # # Augment processing_dict with sample_config info + # processing_info = [] + # for row in sample_config: + # barcode_index = row["barcode_index"] + # lane = int(row["lane"]) + # pool_name = row["pool_name"] + # sample_name = row["sample_name"] + # for idx, lib in enumerate(processing_dict["libraries"]): + # if int(lib["lane"]) == lane and lib["barcode_index"] == barcode_index: + # lib.update({"pool_name": pool_name, "sample_name": sample_name}) + # processing_info.append(lib) + + # # TODO: Doesn't yet make use of the above augmented info + # for row in sample_config: + # (idx, _otheridx) = row["barcode_index"].split("-") + # lane = int(row["lane"]) + # name = row["pool_name"] + # LOG.debug("idx=%s, lane=%d, name=%s", idx, lane, name) + # # Get lane IDs for each file + # lane_ids = [ + # l["id"] + # for l in processing_dict["libraries"] + # if l["barcode1"]["reverse_sequence"] == idx and int(l["lane"]) == lane + # ] + # r1_file = os.path.join(outdir, name, "R1.fq.gz") + # r2_file = os.path.join(outdir, name, "R2.fq.gz") + # if not os.path.exists(r1_file): + # raise Exception("No file %s" % r1_file) + # if not os.path.exists(r2_file): + # raise Exception("No file %s" % r2_file) + + # files_to_upload[(r1_file, "r1-fastq")].extend(lane_ids) + # files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) + + # # Upload files. + # for ((path, purpose), lane_ids) in files_to_upload.items(): + # # print(path, purpose, len(lane_ids)) + # self.upload_file( + # path, + # "SequencingData.flowcelllane", + # list(set(lane_ids)), + # file_purpose=purpose, + # file_type="fastq", + # ) + + # # Commented out because we aren't making alignments for these... + # # # Now upload counts. + # # # We can do this all as one call. + # # # (Assuming LIMS doesn't time out) + # # all_counts = [] + # # for lib in processing_info: + # # if not len(lib["alignments"]) == 1: + # # LOG.critical("Lib must have exactly 1 aligment %s", lib) + # # align_id = lib["alignments"][0]["id"] + # # counts_file = os.path.join( + # # outdir, + # # lib["pool_name"], + # # "analysis", + # # "Gene", + # # "%s.stats.txt" % lib["sample_name"], + # # ) + # # all_counts.append(build_counts(align_id, counts_file)) + # # # print(json.dumps(all_counts)) + # # self.post("stats/create/", all_counts) + + # with open(os.path.join(outdir, "flowcell_stats.json")) as json_file: + # flowcell_data = json.loads(json_file.read()) + # self.upload_flowcell_report(flowcell_data) + + +def main(): + """ + This is the main body of the program that uses the arguments from the + command line. 
+ """ + + parser = parser_setup() + poptions = parser.parse_args() + + if poptions.quiet: + logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT) + elif poptions.debug: + logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT) + else: + # Set up the default logging levels + logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) + # Make this a little less noisy by default + requests_log = logging.getLogger("requests.packages.urllib3.connectionpool") + requests_log.setLevel(logging.WARN) + + if not poptions.base_api_url and "LIMS_API_URL" in os.environ: + api_url = os.environ["LIMS_API_URL"] + LOG.debug("Using LIMS API endpoint: %s from environment", api_url) + elif poptions.base_api_url: + api_url = poptions.base_api_url + LOG.debug("Using LIMS API endpoint: %s from options", api_url) + else: + sys.stderr.write("Could not find LIMS API URL.\n") + sys.exit(1) + + if not poptions.token and "LIMS_API_TOKEN" in os.environ: + token = os.environ["LIMS_API_TOKEN"] + elif poptions.token: + token = poptions.token + else: + sys.stderr.write("Could not find LIMS API TOKEN.\n") + sys.exit(1) + + uploader = UploadLIMS( + api_url, token, dry_run=poptions.dry_run, skip_md5=poptions.skip_md5 + ) + + uploader.upload_files(poptions.r1_fastq, poptions.r2_fastq, poptions.lane_id) + + #with open(poptions.sample_config) as f: + # sample_config = list(csv.DictReader(f, delimiter="\t")) + #with open(poptions.processing_json) as f: + # processing = json.loads(f.read()) + #uploader.upload_altseq_flowcell( + # sample_config, processing, poptions.output_file_directory + #) + + +# This is the main body of the program that only runs when running this script +# doesn't run when imported, so you can use the functions above in the shell +# after importing without automatically running it +if __name__ == "__main__": + main() From 29d34732fbc46e42f1f15f4fba8df820714e6733 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 11:30:07 -0700 Subject: [PATCH 082/172] Fix naming of LibraryPool fastq files Include barcode and lane, to match convention for our sample fastq files. --- scripts/apilaneprocess.py | 51 +++++++++++++++++++++++++------ scripts/flowcells/link_nextseq.py | 36 ++++++++++++++++------ 2 files changed, 67 insertions(+), 20 deletions(-) diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index 0abee608..670bfdb3 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -16,6 +16,9 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +POOL_INFO = {} +SCRIPTS_WRITTEN = set() + STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') script_options = { @@ -86,7 +89,7 @@ def __init__(self, args, api): self.dry_run = args.dry_run self.no_mask = args.no_mask - self.pool = ThreadPoolExecutor(max_workers=10) + self.pool = ThreadPoolExecutor(max_workers=6) def get_lane_process_info(self, lane_id): @@ -114,7 +117,7 @@ def get_process_template(self, process_template_id): def setup_flowcell(self, flowcell_label): - lanes = self.api.get_list_result(url_addition="flowcell_lane", query_arguments={"flowcell__label": flowcell_label}, page_size=1000, item_limit=10000) + lanes = self.api.get_list_result(url_addition="flowcell_lane/", query_arguments={"flowcell__label": flowcell_label}, page_size=1000, item_limit=50000) if not lanes: logging.error("Flowcell %s has no lanes" % flowcell_label) @@ -142,7 +145,9 @@ def setup_lanes(self, lane_ids): if len(lane_ids) != len(set(lane_ids)): logging.warning("Duplicate lane IDs! 
%s " % [item for item, count in collections.Counter(lane_ids).items() if count > 1]) - self.pool.map(self.setup_lane, lane_ids) + #self.pool.map(self.setup_lane, lane_ids) + for lane_id in lane_ids: + self.setup_lane(lane_id) def setup_lane(self, lane_id): @@ -153,20 +158,40 @@ def setup_lane(self, lane_id): pool_name = None try: lib_number = processing_info["libraries"][0]["library"] - library_info = self.api.get_single_result(url_addition="library/?number=%d/" % lib_number) + library_info = self.api.get_single_result(url_addition="library/?number=%d" % lib_number)["results"][0] + logging.debug("Info is %s", library_info) pools = library_info["librarypools"] if pools: pool_name = pools[0]["object_name"] - logging.debug("Setting up script for pool %s", pool_name) + pool_id = pools[0]["id"] + logging.debug("Lane %d is pool %s", lib_number, pool_name) + else: + logging.debug("Lane %d is not pool", lib_number) except: pass - # Check if in pool + global POOL_INFO + if pool_name and pool_name not in POOL_INFO: + pool_data = self.api.get_single_result(url_addition="library_pool/%d/" % pool_id) + bc1 = None + bc2 = None + if pool_data["barcode1"]: + bc1 = self.api.get_single_result(url=pool_data["barcode1"])["reverse_sequence"] + if pool_data["barcode2"]: + bc2 = self.api.get_single_result(url=pool_data["barcode2"])["reverse_sequence"] + barcode = "_".join(bc for bc in [bc1, bc2] if bc) + POOL_INFO[pool_name] = {"barcode": barcode} - self.create_script(processing_info) + self.create_script(processing_info, pool_name) def add_script(self, script_file, lane_id, flowcell_label, sample_name): + # Hacks to deduplicate files written for library pools + global SCRIPTS_WRITTEN + if script_file in SCRIPTS_WRITTEN: + return + SCRIPTS_WRITTEN.add(script_file) + if not self.outfile: logging.debug("Writing script to stdout") outfile = sys.stdout @@ -175,7 +200,7 @@ def add_script(self, script_file, lane_id, flowcell_label, sample_name): outfile = open(self.outfile, 'a') outfile.write("cd %s && " % os.path.dirname(script_file)) - fullname = "%s%s-%s-Lane#%d" % (self.qsub_prefix,sample_name,flowcell_label,lane_id) + fullname = "%s%s-%s-Lane#%d" % (self.qsub_prefix, sample_name, flowcell_label, lane_id) outfile.write("sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=8000 --parsable --oversubscribe <<__LANEPROC__\n#!/bin/bash\nbash %s\n__LANEPROC__\n\n" % (fullname, fullname, fullname, self.queue, script_file)) outfile.close() @@ -200,7 +225,7 @@ def create_script(self, processing_info, pool=None): flowcell_dir = re.sub(r"/Project.*", "", lane["directory"]) if alt_dir: flowcell_dir=alt_dir - fastq_directory = os.path.join(flowcell_dir, "fastq", "Project_%s" % lane["project"], "LibraryPool_%s" % pool) + fastq_directory = os.path.join(flowcell_dir, "Project_%s" % lane["project"], "LibraryPool_%s" % pool) barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index'] try: @@ -211,9 +236,15 @@ def create_script(self, processing_info, pool=None): spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane']) logging.warning("No alignment sample_name for lane, using %s instead" % spreadsheet_name) + if pool: + global POOL_INFO + barcode = POOL_INFO[pool]["barcode"] + spreadsheet_name = "%s_%s_L00%d" % (pool, barcode, lane['lane']) + #print("DBG:", pool, spreadsheet_name, POOL_INFO) + if not os.path.exists(fastq_directory): logging.critical("fastq directory %s does not exist, cannot continue" % fastq_directory) - return False 
+ return False script_file = os.path.join( fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname) ) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index d1abd319..e027f062 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -7,6 +7,7 @@ import logging import os import re +from collections import defaultdict LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -150,16 +151,31 @@ def main(): undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, undetermined=True ) - for pool in data["library_pools"].keys(): - lane = { - "samplesheet_name": pool, - "alignments": [{"sample_name": pool}], - "project": "Lab", - } - for read in ["R1", "R2"]: - create_links( - lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True - ) + # Set up conversion table + libs_to_lanes = defaultdict(set) + for lane in data["libraries"]: + libs_to_lanes[lane['library']].add(lane['lane']) + + for (pool, info) in data["library_pools"].items(): + barcode = info["barcode1"] + if info.get("barcode2"): + barcode = "%s_%s" % (barcode, info["barcode2"]) + lane_nums = set() + for lib in info["libraries"]: + lib_num = int(re.sub(r'[^\d]+', '', lib)) + lane_nums.update( libs_to_lanes[lib_num] ) + + for lane_num in sorted(lane_nums): + out_name = "%s_%s_L00%d" % (pool, barcode, lane_num) + lane = { + "samplesheet_name": pool, + "alignments": [{"sample_name": out_name}], + "project": "Lab", + } + for read in ["R1", "R2"]: + create_links( + lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True + ) # This is the main body of the program that only runs when running this script From 133cf1c5edf0d94bb8959589e6c104e20b8b856f Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 11:31:20 -0700 Subject: [PATCH 083/172] Add poolprocess.py, analogous to alignprocess.py --- scripts/flowcells/setup.sh | 4 + scripts/poolprocess.py | 551 +++++++++++++++++++++++++++++++++++++ 2 files changed, 555 insertions(+) create mode 100644 scripts/poolprocess.py diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 88bd7928..6f9a52ef 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -696,6 +696,7 @@ rsync -avP "$samplesheet" "$analysis_dir" rsync -aL "\$dir/" "\$destination/" done for dir in Project*/LibraryPool* ; do + [[ -d \$dir ]] || continue destination=$analysis_dir destination=\$destination/\$dir mkdir -p "\$destination" @@ -778,11 +779,14 @@ python3 "$STAMPIPES/scripts/alignprocess.py" \ --qsub-queue queue0 \ --outfile run_alignments.bash +python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash + # Set up of flowcell aggregations curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" # Run alignments bash run_alignments.bash +bash run_pools.bash __COLLATE__ diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py new file mode 100644 index 00000000..e037bf9e --- /dev/null +++ b/scripts/poolprocess.py @@ -0,0 +1,551 @@ +import json +import os +import sys +import argparse +import logging +import re +import requests +import textwrap +from collections import OrderedDict, defaultdict +try: + from concurrent.futures import ThreadPoolExecutor +except ImportError: + from futures import ThreadPoolExecutor + +log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') + +script_options = { + 
"quiet": False, + "debug": False, + "base_api_url": None, + "token": None, + "flowcell": None, + "tag": None, + "outfile": os.path.join(os.getcwd(), "run.bash"), + "sample_script_basename": "run.bash", + "qsub_queue": "hpcz-2", + "qsub_prefix": ".proc", + "dry_run": False, + "no_mask": False, + "redo_completed": False, + "qsub_priority": -50, + "auto_aggregate": False, +} + +def parser_setup(): + + parser = argparse.ArgumentParser() + + parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", + help="Don't print info messages to standard out.") + parser.add_argument("-d", "--debug", dest="debug", action="store_true", + help="Print all debug messages to standard out.") + + parser.add_argument("-a", "--api", dest="base_api_url", + help="The base API url, if not the default live LIMS.") + parser.add_argument("-t", "--token", dest="token", + help="Your authentication token. Required.") + + #parser.add_argument("--alignment", dest="align_ids", type=int, action="append", + # help="Run for this particular alignment.") + parser.add_argument("--flowcell", dest="flowcell_label", + help="Run for this particular flowcell label.") + parser.add_argument("--pool", dest="pool", + help="Run for this particular pool.") + #parser.add_argument("--tag", dest="tag", + # help="Run for alignments tagged here.") + #parser.add_argument("--project", dest="project", + # help="Run for alignments in this project.") + + parser.add_argument("--script_template", dest="script_template", + help="The script template to use.") + parser.add_argument("--qsub_priority", dest="qsub_priority", type=int, + help="The priority to give scripts we are submitting.") + + parser.add_argument("-o", "--outfile", dest="outfile", + help="Append commands to run this alignment to this file.") + parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", + help="Name of the script that goes after the sample name.") + parser.add_argument("--qsub-prefix", dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.") + parser.add_argument("--qsub-queue", dest="qsub_queue", + help="Name of the SLURM partition") + parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", + help="Take no action, only print messages.") + parser.add_argument("--no-mask", dest="no_mask", action="store_true", + help="Don't use any barcode mask.") + parser.add_argument("--redo_completed", dest="redo_completed", help="Redo alignments marked as completed.", + action="store_true") + #parser.add_argument("--auto_aggregate", dest="auto_aggregate", help="Script created will also run auto-aggregations after alignments finished.", + #action="store_true") + parser.add_argument("--align_base_dir", dest="align_base_dir", help="Create the alignment directory in this directory") + + parser.add_argument("--listout", dest="simple_output", help="Write only a list of alignments to run, rather than a script to submit them", action="store_true") + + parser.set_defaults( **script_options ) + parser.set_defaults( quiet=False, debug=False ) + + return parser + + +class ProcessSetUp(object): + + def __init__(self, args, api_url, token): + + self.token = token + self.api_url = api_url + self.qsub_scriptname = args.sample_script_basename + self.qsub_prefix = args.qsub_prefix + self.outfile = args.outfile + self.dry_run = args.dry_run + self.no_mask = args.no_mask + self.redo_completed = args.redo_completed + self.script_template = args.script_template + self.qsub_priority = args.qsub_priority + self.qsub_queue = args.qsub_queue + #self.auto_aggregate = args.auto_aggregate + self.align_base_dir = args.align_base_dir + + self.simple_output = args.simple_output + + self.session = requests.Session() + self.session.headers.update({'Authorization': "Token %s" % self.token}) + + self.pool = ThreadPoolExecutor(max_workers=10) + + def api_single_result(self, url_addition=None, url=None): + + if url_addition: + url = "%s/%s" % (self.api_url, url_addition) + + request = self.session.get(url) + + if request.ok: + logging.debug(request.json()) + return request.json() + else: + logging.error("Could not get data from %s" % url) + logging.error(request) + return None + + def api_list_result(self, url_addition=None, url=None): + + more = True + results = [] + + if url_addition: + url = "%s/%s" % (self.api_url, url_addition) + + while more: + + logging.debug("Fetching more results for query %s" % url) + + request = self.session.get(url) + + if not request.ok: + logging.error(request) + return None + more_results = request.json() + results.extend(more_results["results"]) + if more_results["next"]: + url = more_results["next"] + else: + more = False + + return results + + def get_align_process_info(self, alignment_id): + + process_info = self.api_single_result("flowcell_lane_alignment/%d/processing_information/" % alignment_id) + + if not process_info: + logging.critical("Could not find processing info for alignment %d\n" % alignment_id) + logging.critical(process_info) + sys.exit(1) + + return process_info + + def get_process_template(self, align_id, process_template_id): + + if not process_template_id: + logging.critical("No process template for alignment %d\n" % align_id) + return None + + info = self.api_single_result("process_template/%d/" % (process_template_id)) + + if not info: + logging.critical("Could not find processing template for ID %d\n" % process_template_id) + sys.exit(1) + + return info + + # Run alignment setup in parallel + def setup_alignments(self, align_ids): + for id, error in 
self.pool.map(self.setup_alignment, align_ids): + if error: + logging.debug("ALN%d result received, error: %s" % (id, error)) + else: + logging.debug("ALN%d result received, OK" % id) + + def setup_alignment(self, align_id): + + try: + processing_info = self.get_align_process_info(align_id) + alignment = self.api_single_result("flowcell_lane_alignment/%d/" % (align_id)) + + if self.redo_completed or not alignment['complete_time']: + self.create_script(processing_info, alignment["id"]) + return (align_id, None) + else: + logging.info("Skipping completed alignment %d" % align_id) + return (align_id, None) + except Exception as e: + logging.exception("Could not set up alignment %d}: (%s)" % (align_id, e)) + return (align_id, e) + + def get_lane_file(self, lane_id, purpose): + candidates = self.api_list_result("file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id)) + + if not candidates: + return None + if len(candidates) > 1: + return None + + return candidates[0] + + def setup_tag(self, tag_slug): + + align_tags = self.api_list_result("tagged_object/?content_type=47&tag__slug=%s" % tag_slug) + + self.setup_alignments([align_tag["object_id"] for align_tag in align_tags]) + + def setup_project(self, project_id): + logging.info("Setting up project #%s" % project_id) + alignments = self.api_list_result("flowcell_lane_alignment/?lane__sample__project=%s" % project_id) + self.setup_alignments([alignment["id"] for alignment in alignments]) + + def setup_flowcell(self, flowcell_label): + logging.info("Setting up flowcell for %s" % flowcell_label) + align_ids = self.get_alignment_ids(flowcell_label) + + logging.debug("align ids: %s", align_ids) + #alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + self.setup_alignments(align_ids) + + def get_alignment_ids(self, flowcell_label: str) -> [int]: + """ + For each librarypool/lane combination on the flowcell: + Pick one representative alignment (the one with the lowest ID) + return the IDs of those flowcells + """ + + def extract_id_from_url(url): + return int(re.findall(r'\d+', url)[-1]) + + # Storage for the 3 layers of mapping between alignments and pools + pool_key_to_lib_ids = defaultdict(list) # {(pool_id, lane_number): [lib_id]} + lib_id_to_lane_ids = defaultdict(list) # {lib_id: [lane_ids]} + lane_id_to_aln_ids = defaultdict(list) # {lane_id: [aln_ids]} + + library_info = set() + for lane in self.api_list_result("flowcell_lane/?flowcell__label=%s&page_size=1000" % flowcell_label): + lib_url = lane['library'] + lane_lane = lane['lane'] + library_info.add((lib_url, lane_lane)) + lib_id = extract_id_from_url(lib_url) + lib_id_to_lane_ids[lib_id].append(lane['id']) + + # Set of poolnums + lane + pool_info = set() + for info in library_info: + lib_info = self.api_single_result(url=info[0]) + for pool in lib_info['librarypools']: + pool_info.add((pool["number"], info[1])) + key = (pool["id"], info[1]) + pool_key_to_lib_ids[key].append(lib_info['id']) + + all_alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + for aln in all_alignments: + lane_id = extract_id_from_url(aln['lane']) + lane_id_to_aln_ids[lane_id].append(aln['id']) + + # Find the minimum alignment ID for each pool/lane combination + lowest_aln_for_pool = {pool_key: None for pool_key in pool_key_to_lib_ids.keys()} + for (pool_key, lib_ids) in pool_key_to_lib_ids.items(): + for lib_id in lib_ids: + for lane_id in 
lib_id_to_lane_ids[lib_id]: + for aln_id in lane_id_to_aln_ids[lane_id]: + cur_aln = lowest_aln_for_pool[pool_key] + logging.debug("%s, %d, %d, %d < %d?", + pool_key, lib_id, lane_id, aln_id, cur_aln) + if cur_aln is None or cur_aln > aln_id: + lowest_aln_for_pool[pool_key] = aln_id + + return list(lowest_aln_for_pool.values()) + + + + + #def auto_aggregation_script(self,flowcell_label,alignments): + # aaname_sentinel = "auto_agg_sentinel.%s" % (flowcell_label) + + # if not self.outfile: + # logging.debug("Writing script to stdout") + # outfile = sys.stdout + # else: + # logging.debug("Logging script to %s" % self.outfile) + # outfile = open(self.outfile, 'a') + + # contents = textwrap.dedent("""\ + # cd "$FLOWCELLS"/FC{label}_* + # sentinel_dependencies=$(echo $PROCESSING | sed -e 's/,/,afterany:/g' | sed -e 's/^,afterany/--dependency=afterany/g') + # sbatch --export=ALL -J {job_name} -o {job_name}.o%A -e {job_name}.e%A --partition={queue} --cpus-per-task=1 --ntasks=1 $sentinel_dependencies --mem-per-cpu=1000 --parsable --oversubscribe <<__AUTOAGG1__ + # #!/bin/bash + # rm -f run_aggregations.bash + # python $STAMPIPES/scripts/aggregateprocess.py --flowcell {label} --outfile run_aggregations.bash --qsub-queue {qqueue} + # bash run_aggregations.bash + # __AUTOAGG1__ + # """.format(label=flowcell_label, + # job_name=aaname_sentinel, + # queue=self.qsub_queue, + # qqueue=self.qsub_queue)) + + # outfile.write(contents) + # outfile.close() + + def add_script(self, align_id, processing_info, script_file, sample_name): + + ram_megabytes = 2000 + + if not self.outfile: + logging.debug("Writing script to stdout") + outfile = sys.stdout + else: + logging.debug("Logging script to %s" % self.outfile) + outfile = open(self.outfile, 'a') + + if self.simple_output: + outfile.write(script_file + "\n") + else: + outfile.write("cd %s && " % os.path.dirname(script_file)) + fullname = "%s%s-%s-ALIGN#%d" % (self.qsub_prefix,sample_name,processing_info['flowcell']['label'],align_id) + outfile.write("jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING=\"$PROCESSING,$jobid\"\n\n" % (fullname, fullname, fullname, self.qsub_queue, ram_megabytes, script_file)) + outfile.close() + + def get_script_template(self, process_template): + + if self.script_template: + script_path = self.script_template + else: + script_path = os.path.expandvars(process_template["process_version"]["script_location"]) + return open(script_path, 'r').read() + + def create_script(self, processing_info, align_id): + + lane = processing_info["libraries"][0] + alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] + + if not "process_template" in alignment: + logging.error("Alignment %d has no process template" % align_id) + return False + + process_template = self.get_process_template(align_id, alignment["process_template"]) + + if not process_template: + return False + + flowcell_directory = processing_info['flowcell']['directory'] + + share_dir = lane.get("project_share_directory") + if share_dir: + flowcell_directory = os.path.join(share_dir, "alignments") + if not flowcell_directory: + logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) + return False + + lib_info = self.api_single_result("library/?number=%d" % processing_info["libraries"][0]["library"])["results"][0] + logging.debug("lib info is 
%s", lib_info) + pool_name = lib_info["librarypools"][0]["object_name"] + logging.debug("pool is %s", pool_name) + + fastq_directory = os.path.join(flowcell_directory, "Project_%s" % lane['project'], "LibraryPool_%s" % pool_name) + + # Reset the alignment's sample name if we decied not to use the barcode index mask + if self.no_mask: + alignment['sample_name'] = "%s_%s_L00%d" % (lane['samplesheet_name'], lane['barcode_index'], lane['lane']) + + align_dir = "align_%d_%s_%s" % (alignment['id'], alignment['genome_index'], alignment['aligner']) + if alignment['aligner_version']: + align_dir = "%s-%s" % (align_dir, alignment['aligner_version']) + + script_directory = os.path.join(fastq_directory, align_dir) + if self.align_base_dir: + script_directory = os.path.join(self.align_base_dir, align_dir) + + r1_fastq = self.get_lane_file(lane["id"], "r1-fastq") + + if not r1_fastq: + logging.error("Missing r1-fastq for lane %d (alignment %d) - check dir %s" % (lane["id"], alignment["id"], fastq_directory)) + return False + + if processing_info['flowcell']['paired_end']: + r2_fastq = self.get_lane_file(lane["id"], "r2-fastq") + if not r2_fastq: + logging.error("Missing r2-fastq for lane %d (alignment %d)" % (lane["id"], alignment["id"])) + return False + + script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) + logging.info("Will write to %s" % script_file) + + + # Set up & add environment variables + env_vars = OrderedDict() + + env_vars["SAMPLE_NAME"] = alignment['sample_name'] + env_vars["BWAINDEX"] = alignment['genome_index_location'] + env_vars["GENOME"] = alignment['genome_index'] + env_vars["ASSAY"] = lane['assay'] + env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] + if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0]['library_kit_method']: + env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' + else: + env_vars["LIBRARY_KIT"] = None + + if processing_info['flowcell']['paired_end']: + env_vars["PAIRED"] = "True" + else: + env_vars["PAIRED"] = None + + env_vars["FLOWCELL_LANE_ID"] = lane['id'] + env_vars["ALIGNMENT_ID"] = alignment['id'] + env_vars["ALIGN_DIR"] = os.path.join(fastq_directory, align_dir) + env_vars["R1_FASTQ"] = r1_fastq["path"] + + if processing_info['flowcell']['paired_end']: + env_vars["R2_FASTQ"] = r2_fastq["path"] + + env_vars["FASTQ_DIR"] = fastq_directory + env_vars["FLOWCELL"] = processing_info['flowcell']['label'] + + if "barcode1" in lane and lane["barcode1"]: + p7_adapter = lane['barcode1']['adapter7'] + p5_adapter = lane['barcode1']['adapter5'] + if "barcode2" in lane and lane['barcode2']: + # Override the "default" end adapter from barcode1 + p5_adapter = lane['barcode2']['adapter5_reverse_complement'] + + if not p7_adapter or not p5_adapter: + logging.warn("Alignment %d missing adapters, some processes might not work" % alignment['id']) + + env_vars["ADAPTER_P7"] = p7_adapter + env_vars["ADAPTER_P5"] = p5_adapter + + # Process with UMI if the barcode has one and this is a dual index + # flowcell + if lane['barcode1']['umi'] and processing_info['flowcell']['dual_index']: + env_vars["UMI"] = "True" + else: + env_vars["UMI"] = None + env_vars["UMI_METHOD"] = lane['barcode1']['umi_method'] + + # Set process template env var overrides + if 'process_variables' in process_template and process_template['process_variables']: + try: + process_template_variables = json.loads(process_template['process_variables'], + 
object_pairs_hook=OrderedDict) + for var, value in process_template_variables.items(): + env_vars[var] = value + except ValueError as e: + logging.error("Could not parse process variables for align %d (template %d): '%s'" % + ( + alignment['id'], + process_template['id'], + process_template['process_variables'] + )) + return False + + if self.dry_run: + logging.info("Dry run, would have created: %s" % script_file) + logging.debug(env_vars) + return True + + if not os.path.exists(script_directory): + logging.info("Creating directory %s" % script_directory) + os.makedirs(script_directory) + + # Append to master script + self.add_script(align_id, processing_info, script_file, alignment['sample_name']) + + # Write file + outfile = open(script_file, 'w') + outfile.write("set -e -o pipefail\n") + + # Set env vars + for var, value in env_vars.items(): + if value is not None: + outfile.write("export %s=%s\n" % (var, value)) + else: + outfile.write("unset %s\n" % var) + + outfile.write("\n") + outfile.write("export QUEUE=%s\n" % (self.qsub_queue)) + outfile.write("\n") + outfile.write(self.get_script_template(process_template)) + outfile.close() + + +def main(args = sys.argv): + """This is the main body of the program that by default uses the arguments +from the command line.""" + + parser = parser_setup() + poptions = parser.parse_args() + + if poptions.quiet: + logging.basicConfig(level=logging.WARNING, format=log_format) + elif poptions.debug: + logging.basicConfig(level=logging.DEBUG, format=log_format) + else: + # Set up the logging levels + logging.basicConfig(level=logging.INFO, format=log_format) + logging.getLogger("requests").setLevel(logging.WARNING) + + if not poptions.base_api_url and "LIMS_API_URL" in os.environ: + api_url = os.environ["LIMS_API_URL"] + elif poptions.base_api_url: + api_url = poptions.base_api_url + else: + logging.error("Could not find LIMS API URL.\n") + sys.exit(1) + + if not poptions.token and "LIMS_API_TOKEN" in os.environ: + token = os.environ["LIMS_API_TOKEN"] + elif poptions.token: + token = poptions.token + else: + logging.error("Could not find LIMS API TOKEN.\n") + sys.exit(1) + + process = ProcessSetUp(poptions, api_url, token) + + #process.setup_alignments(poptions.align_ids) + + if poptions.flowcell_label: + process.setup_flowcell(poptions.flowcell_label) + else: + logging.critical("Non-flowcell setup not yet supported") + + #if poptions.tag: + # process.setup_tag(poptions.tag) + + #if poptions.project: + # process.setup_project(poptions.project) + + +# This is the main body of the program that only runs when running this script +# doesn't run when imported, so you can use the functions above in the shell after importing +# without automatically running it +if __name__ == "__main__": + main() From d8a74cd323ea593c8eb229f6ae5760ed0fd093df Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 14:55:04 -0700 Subject: [PATCH 084/172] Add apptainer bind mounts --- processes/altcode/nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/processes/altcode/nextflow.config b/processes/altcode/nextflow.config index 32d6ada9..917b577d 100644 --- a/processes/altcode/nextflow.config +++ b/processes/altcode/nextflow.config @@ -12,4 +12,5 @@ process { apptainer { enabled = true cacheDir = "$HOME/.apptainer_nextflow_cache" + runOptions = "--bind /net/seq/data2/,/net/seq/data/,${baseDir}" } From 0766e469ca9323409006696be9e403e64eca136f Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 9 Aug 2023 14:56:49 -0700 Subject: [PATCH 085/172] Fix typo 
in process name --- processes/altcode/altcode.nf | 4 ++-- processes/altcode/nextflow.config | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 6387430a..15d84499 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -26,7 +26,7 @@ workflow { ) STAR_solo.out.solo_analysis - | convert_to_hda5 + | convert_to_h5ad } @@ -125,7 +125,7 @@ process STAR_solo { """ } -process convert_to_hda5 { +process convert_to_h5ad { cpus 10 memory "10 GB" publishDir params.outdir, mode: "copy" diff --git a/processes/altcode/nextflow.config b/processes/altcode/nextflow.config index 917b577d..d5619377 100644 --- a/processes/altcode/nextflow.config +++ b/processes/altcode/nextflow.config @@ -4,7 +4,7 @@ process { withName: STAR_solo { module = [ 'samtools/1.14' ] } - withName: convert_to_hda5 { + withName: convert_to_h5ad { container = "file://${baseDir}/scanpy.sif" } } From aa10ad006d0e02df9dba72234099b074e5ffd64b Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 11 Sep 2023 13:38:53 -0700 Subject: [PATCH 086/172] Altseq is stable, remove alpha label --- processes/altseq/process_altseq.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index d463e020..2f96af71 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -10,7 +10,7 @@ done set -eo pipefail -version=1.1.0-alpha4 +version=1.1.0 cd "$(dirname "$0")" From f34085cbdc6b11329f9030ba14f751f3876eb541 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 11 Sep 2023 15:15:00 -0700 Subject: [PATCH 087/172] Fix - copy SampleSheet*csv in run_bcl2fastq_2.sh This fixes alt-code flowcell processing. --- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 6f9a52ef..7a7b25a0 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -673,7 +673,7 @@ $link_command mkdir -p "$analysis_dir" rsync -avP "$illumina_dir/InterOp" "$analysis_dir/" rsync -avP "$illumina_dir/RunInfo.xml" "$analysis_dir/" -rsync -avP "$samplesheet" "$analysis_dir" +rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" # Copy each sample by itself, checking to see if we have a project_share_directory set # This is very important to keep customer data separate from internal data. 
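Note on the LibraryPool naming introduced in patches 081-083 above: pool fastq basenames follow the same convention as the per-sample fastq files, <pool>_<barcode1>[_<barcode2>]_L00<lane>, where the barcodes are the pool's reverse sequences joined with an underscore, and collate_fastq.bash recognizes pool samples by the LP prefix on SAMPLE_NAME. A minimal sketch of that construction, for illustration only (the helper name and the example pool and barcode values are made up, not part of stampipes):

    def pool_fastq_basename(pool_name, lane_number, barcode1, barcode2=None):
        # Mirrors the logic in apilaneprocess.py / link_nextseq.py:
        # join whichever barcodes are present with "_", then append L00<lane>.
        barcode = "_".join(bc for bc in (barcode1, barcode2) if bc)
        return "%s_%s_L00%d" % (pool_name, barcode, lane_number)

    # e.g. pool_fastq_basename("LP1234", 1, "ACGTACGT", "TTGCAATG")
    # gives "LP1234_ACGTACGT_TTGCAATG_L001"

This is only a reading aid for the diffs; the pipeline itself builds these names inline in apilaneprocess.py and link_nextseq.py.
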
From ad5bd215d4cb95d2550134ccc875fef63c78cecb Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Wed, 13 Sep 2023 09:30:38 -0700 Subject: [PATCH 088/172] Fixes and QoL for make_samplesheets.py Fixes a bug with altcode flowcells with 2 barcodes Adds a new --filename option --- scripts/flowcells/make_samplesheets.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index 045f8501..d2fa8306 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -9,6 +9,10 @@ from collections import defaultdict +# requires BioPython which seems to be in our environment +# but only to reverse complement which we could figure out +# another way to do +from Bio.Seq import Seq # Usage: $0 -p processing.json @@ -16,6 +20,7 @@ "processing": "processing.json", "reverse_barcode1": False, "reverse_barcode2": False, + "filename": "SampleSheet.withmask.{mask}.csv", } def parser_setup(): @@ -26,6 +31,8 @@ def parser_setup(): help="Use reverse sequence for barcode1") parser.add_argument("--reverse_barcode2", dest="reverse_barcode2", action="store_true", help="Use reverse sequence for barcode2") + parser.add_argument("--filename", + help="The template to use for filename, with the {mask} formatting") parser.set_defaults(**SCRIPT_OPTIONS) return parser @@ -68,12 +75,25 @@ def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2 pool_assignment_set.add( (libdata.get("lane"), *libs_to_pools[lib_num]) ) + + # a quick little inner function to reverse complement + # a sequence and return the string of that + def reverse_complement(sequence: str) -> str: + seq = Seq(sequence) + return str(seq.reverse_complement()) + # Turn set of tuples into list of dicts pool_assignments = [{ "lane": a[0], "sample": a[1], - "barcode1": a[2], - "barcode2": a[3], + # Okay so we're trying to do the same with these as we do with + # the library barcodes including following the reverse instructions + # and these come reversed in the processing.json + # eventually we might want to change the processing.json to have both versions at hand + # like we do for libraries + # and remove the dependency on biopython + "barcode1": a[2] if reverse_barcode1 else reverse_complement(a[2]), + "barcode2": a[3] if reverse_barcode2 else reverse_complement(a[3]), } for a in pool_assignment_set] return assignments + pool_assignments @@ -182,7 +202,7 @@ def adjust_mask_for_lengths(mask_parts, len1, len2): return new_mask -def write_samplesheets(name, date, root_mask, assignments): +def write_samplesheets(name, filename_template, date, root_mask, assignments): """ Write out the sample sheets """ mask_parts = parse_mask(root_mask) max_bclen1 = 0 @@ -210,7 +230,8 @@ def write_samplesheets(name, date, root_mask, assignments): header = make_samplesheet_header(name, date) body = make_samplesheet_body(assigns) samplesheet_contents = header + body - filename = "SampleSheet.withmask.{}.csv".format(mask_to_str(new_mask)) + filename = filename_template.format(mask=mask_to_str(new_mask)) + print("Writing {filename} with {new_mask}".format(filename=filename, new_mask=mask_to_str(new_mask))) with open(filename, "w") as f: f.write(samplesheet_contents) @@ -243,6 +264,7 @@ def main(args=sys.argv): ) mask = data["alignment_group"]["bases_mask"] write_samplesheets(name="Altius", + filename_template=poptions.filename, date=str(datetime.date.today()), root_mask=mask, assignments=assignments) From 
7cab837de2e19f501c740766303724f9758bbfff Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 21 Sep 2023 16:35:09 -0700 Subject: [PATCH 089/172] Altcode - fetch additional metadata --- scripts/poolprocess.py | 252 ++++++++++++++++++++++++++++++++++------- 1 file changed, 210 insertions(+), 42 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index e037bf9e..3409efab 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,3 +1,4 @@ +#import csv import json import os import sys @@ -12,11 +13,16 @@ except ImportError: from futures import ThreadPoolExecutor -log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +# Globals for storing our mapping (saves LIMS hits) +POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} +LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} +LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} + +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') -script_options = { +SCRIPT_OPTIONS = { "quiet": False, "debug": False, "base_api_url": None, @@ -39,53 +45,55 @@ def parser_setup(): parser = argparse.ArgumentParser() parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") + help="Don't print info messages to standard out.") parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") + help="Print all debug messages to standard out.") parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") + help="The base API url, if not the default live LIMS.") parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") + help="Your authentication token. Required.") #parser.add_argument("--alignment", dest="align_ids", type=int, action="append", # help="Run for this particular alignment.") parser.add_argument("--flowcell", dest="flowcell_label", - help="Run for this particular flowcell label.") + help="Run for this particular flowcell label.") parser.add_argument("--pool", dest="pool", - help="Run for this particular pool.") + help="Run for this particular pool.") #parser.add_argument("--tag", dest="tag", # help="Run for alignments tagged here.") #parser.add_argument("--project", dest="project", # help="Run for alignments in this project.") parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") + help="The script template to use.") parser.add_argument("--qsub_priority", dest="qsub_priority", type=int, - help="The priority to give scripts we are submitting.") + help="The priority to give scripts we are submitting.") parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") + help="Append commands to run this alignment to this file.") parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") + help="Name of the script that goes after the sample name.") parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.") + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.") parser.add_argument("--qsub-queue", dest="qsub_queue", - help="Name of the SLURM partition") + help="Name of the SLURM partition") parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") + help="Take no action, only print messages.") parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="Don't use any barcode mask.") - parser.add_argument("--redo_completed", dest="redo_completed", help="Redo alignments marked as completed.", - action="store_true") + help="Don't use any barcode mask.") + parser.add_argument("--redo_completed", dest="redo_completed", action="store_true", + help="Redo alignments marked as completed.") #parser.add_argument("--auto_aggregate", dest="auto_aggregate", help="Script created will also run auto-aggregations after alignments finished.", #action="store_true") - parser.add_argument("--align_base_dir", dest="align_base_dir", help="Create the alignment directory in this directory") + parser.add_argument("--align_base_dir", dest="align_base_dir", + help="Create the alignment directory in this directory") - parser.add_argument("--listout", dest="simple_output", help="Write only a list of alignments to run, rather than a script to submit them", action="store_true") + parser.add_argument("--listout", dest="simple_output", action="store_true", + help="Write only a list of alignments to run, rather than a script to submit them") - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.set_defaults(**SCRIPT_OPTIONS) + parser.set_defaults(quiet=False, debug=False) return parser @@ -182,12 +190,21 @@ def get_process_template(self, align_id, process_template_id): return info # Run alignment setup in parallel - def setup_alignments(self, align_ids): - for id, error in self.pool.map(self.setup_alignment, align_ids): - if error: - logging.debug("ALN%d result received, error: %s" % (id, error)) - else: - logging.debug("ALN%d result received, OK" % id) + def setup_alignments(self, align_ids, parallel=True): + all_okay = True + if parallel: + for id, error in self.pool.map(self.setup_alignment, align_ids): + if error: + logging.error("ALN%d result received, error: %s" % (id, error)) + all_okay = False + else: + logging.debug("ALN%d result received, OK" % id) + if not all_okay: + logging.critical("Errors during setup, exiting") + # Sequential version, helpful for debugging + else: + for aln_id in align_ids: + self.setup_alignment(aln_id) def setup_alignment(self, align_id): @@ -245,9 +262,13 @@ def extract_id_from_url(url): return int(re.findall(r'\d+', url)[-1]) # Storage for the 3 layers of mapping between alignments and pools - pool_key_to_lib_ids = defaultdict(list) # {(pool_id, lane_number): [lib_id]} - lib_id_to_lane_ids = defaultdict(list) # {lib_id: [lane_ids]} - lane_id_to_aln_ids = defaultdict(list) # {lane_id: [aln_ids]} + global POOL_KEY_TO_LIB_IDS + global LIB_ID_TO_LANE_IDS + global LANE_ID_TO_ALN_IDS + + POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} + LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} + LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} library_info = set() for lane in self.api_list_result("flowcell_lane/?flowcell__label=%s&page_size=1000" % flowcell_label): @@ -255,7 +276,7 @@ def extract_id_from_url(url): lane_lane = lane['lane'] library_info.add((lib_url, lane_lane)) lib_id = extract_id_from_url(lib_url) - 
lib_id_to_lane_ids[lib_id].append(lane['id']) + LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) # Set of poolnums + lane pool_info = set() @@ -264,19 +285,19 @@ def extract_id_from_url(url): for pool in lib_info['librarypools']: pool_info.add((pool["number"], info[1])) key = (pool["id"], info[1]) - pool_key_to_lib_ids[key].append(lib_info['id']) + POOL_KEY_TO_LIB_IDS[key].append(lib_info['id']) all_alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) for aln in all_alignments: lane_id = extract_id_from_url(aln['lane']) - lane_id_to_aln_ids[lane_id].append(aln['id']) + LANE_ID_TO_ALN_IDS[lane_id].append(aln['id']) # Find the minimum alignment ID for each pool/lane combination - lowest_aln_for_pool = {pool_key: None for pool_key in pool_key_to_lib_ids.keys()} - for (pool_key, lib_ids) in pool_key_to_lib_ids.items(): + lowest_aln_for_pool = {pool_key: None for pool_key in POOL_KEY_TO_LIB_IDS.keys()} + for (pool_key, lib_ids) in POOL_KEY_TO_LIB_IDS.items(): for lib_id in lib_ids: - for lane_id in lib_id_to_lane_ids[lib_id]: - for aln_id in lane_id_to_aln_ids[lane_id]: + for lane_id in LIB_ID_TO_LANE_IDS[lib_id]: + for aln_id in LANE_ID_TO_ALN_IDS[lane_id]: cur_aln = lowest_aln_for_pool[pool_key] logging.debug("%s, %d, %d, %d < %d?", pool_key, lib_id, lane_id, aln_id, cur_aln) @@ -365,7 +386,7 @@ def create_script(self, processing_info, align_id): logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) return False - lib_info = self.api_single_result("library/?number=%d" % processing_info["libraries"][0]["library"])["results"][0] + lib_info = self.api_single_result("library/?number__in=%d" % processing_info["libraries"][0]["library"])["results"][0] logging.debug("lib info is %s", lib_info) pool_name = lib_info["librarypools"][0]["object_name"] logging.debug("pool is %s", pool_name) @@ -495,6 +516,153 @@ def create_script(self, processing_info, align_id): outfile.write(self.get_script_template(process_template)) outfile.close() + # Create the config file as well + self.create_sample_config(processing_info, alignment, script_directory) + + def create_sample_config(self, processing_info, alignment, script_directory): + alignment_id = int(alignment["id"]) + + def get_libraries_in_pool(alignment_id): + + # Get all lane ids + # Go up to the pool then down to the lanes + # Note: This is inefficient but probably doesnt matter in practice + lanes = [] + lanes_with_align = set() + for (lane_id, aln_ids) in LANE_ID_TO_ALN_IDS.items(): + if alignment_id in aln_ids: + lanes_with_align.add(lane_id) + assert len(lanes_with_align) == 1, "Alignment must have exactly 1 lane" + align_lane_id = lanes_with_align.pop() + + libs_with_align = set() + for (lib_id, lane_ids) in LIB_ID_TO_LANE_IDS.items(): + if align_lane_id in lane_ids: + libs_with_align.add(lib_id), "Lane must have exactly 1 library" + assert len(libs_with_align) == 1 + align_lib_id = libs_with_align.pop() + + pools_with_align = set() + for (pool_key, lib_ids) in POOL_KEY_TO_LIB_IDS.items(): + if align_lib_id in lib_ids: + pools_with_align.add(pool_key) + # TODO: This is broken because the pool can be in more than one lane!!! 
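# A minimal sketch (not part of the patch) of how the reverse walks in this
# function could be precomputed, should the linear scans over the global maps
# ever become a bottleneck. It assumes, as the asserts here already do, that
# each alignment belongs to exactly one lane and each lane to exactly one
# library; ALN_ID_TO_LANE_ID and LANE_ID_TO_LIB_ID are illustrative names,
# not existing globals.
ALN_ID_TO_LANE_ID = {
    aln_id: lane_id
    for lane_id, aln_ids in LANE_ID_TO_ALN_IDS.items()
    for aln_id in aln_ids
}
LANE_ID_TO_LIB_ID = {
    lane_id: lib_id
    for lib_id, lane_ids in LIB_ID_TO_LANE_IDS.items()
    for lane_id in lane_ids
}
# get_libraries_in_pool() then reduces to two dictionary lookups:
#   align_lane_id = ALN_ID_TO_LANE_ID[alignment_id]
#   align_lib_id = LANE_ID_TO_LIB_ID[align_lane_id]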
+ assert len(pools_with_align) == 1, "Lib must have exactly one pool" + align_poolkey = pools_with_align.pop() + + library_ids = set(POOL_KEY_TO_LIB_IDS[align_poolkey]) + return library_ids + + lib_ids = get_libraries_in_pool(alignment_id) + + def build_library_info(lib_id, flowcell_label): + # FIXME: This route doesn't work right now for some reason + #lib_info = self.api_single_result("library/%d/" % lib_id) + lib_info = self.api_list_result("library/?number__in=%d" % lib_id)[0] + barcode = "" + bc1 = lib_info["barcode1__sequence"] + bc2 = lib_info["barcode2__sequence"] + if bc1 is not None: + barcode += bc1 + if bc2 is not None: + barcode += bc2 + + sample_info = self.api_single_result(url=lib_info["sample"]) + tc_info = self.api_single_result(url=sample_info["tissue_culture"]) + project_info = self.api_single_result(url=sample_info["project"]) + + taggedobject_infos = self.api_list_result("tagged_object/?object_id=%d&content_type=%d" + % (lib_info["id"], lib_info["object_content_type"])) + cycle = None + for taggedobject_info in taggedobject_infos: + # TODO: It may be better to check membership in the Insights tag + if taggedobject_info["tag_slug"].startswith("megamap-run-mmap"): + if cycle is None: + tag_slug = str(taggedobject_info["tag_slug"]) + match = re.search(r"\d+$", tag_slug) + if match: + cycle = int(match.group()) + else: + logging.error("problem tag slug is '%s'" % tag_slug) + else: + logging.warning("Multiple megamap tags for LN%d", lib_info["number"]) + + def build_effector_info(effectortopool): + eff = effectortopool["assemble_effector"] + return { + "chromosome": eff["chromosome"], + "start": eff["start"], + "end": eff["end"], + "strand": eff["strand"], + "working_name": eff["working_name"], + + "n_terminus": { + "name": eff["effector__n_terminus__name"], + "nucleotide": eff["effector__n_terminus__nucleotide"], + "functional_domain": eff["effector__n_functional_domain__name"], + }, + "c_terminus": { + "name": eff["effector__c_terminus__name"], + "nucleotide": eff["effector__c_terminus__nucleotide"], + "functional_domain": eff["effector__c_functional_domain__name"], + }, + "concentration": eff["concentration"], + "ratio_260to280": eff["ratio_260to280"], + "ivt_mrna_concentration": eff["ivt_mrna_concentration"], + "repeat_array__target_recognition_sequence": "CTCTTTCACAGCTCGCG", + "target_recognition_sequence": "TCTCTTTCACAGCTCGCGT", + "wells": [ + { + "plate_name": well["plate__name"], + "plane_id": well["plate_id"], + "well": well["label"], + } + for well in eff["plate_wells"] + ] + } + + pool_info = [] + for effector_pool in tc_info["effector_pools"]: + effector_pool_info = self.api_single_result(url=effector_pool["url"]) + pool_info.append({ + "effector_pool": effector_pool_info["object_name"], + "name": effector_pool_info["name"], + "purpose": effector_pool_info["purpose__name"], + "effectors": [ + build_effector_info(efftopool) + for efftopool in effector_pool_info["effectortopool_set"] + ], + }) + + info = { + "barcode": barcode, + "library": "LN%d" % lib_info["number"], + "sublibrary": lib_info["sub_library"], + "sample": "DS%d" % lib_info["sample_number"], + "tc": "TC%d" % tc_info["number"], + "cell_type": tc_info["sample_taxonomy__name"], + "project": project_info["name"], + "flowcell": flowcell_label, + "cycle": cycle, + "effector_pools": pool_info, + } + return info + + + flowcell_label = "FC%s" % processing_info["flowcell"]["label"] + libraries = [] + for lib_id in lib_ids: + libraries.append(build_library_info(lib_id, flowcell_label)) + + data = 
{"libraries": libraries} + # do stuff + with open("%s/pool_info.json" % script_directory, "w") as out: + json.dump(data, out, indent=2, sort_keys=True) + #writer = csv.DictWriter(out, fieldnames=fieldnames, dialect="excel-tab", restval="") + #writer.writeheader() + #for row in rows: + #writer.writerow(row) + def main(args = sys.argv): """This is the main body of the program that by default uses the arguments @@ -504,12 +672,12 @@ def main(args = sys.argv): poptions = parser.parse_args() if poptions.quiet: - logging.basicConfig(level=logging.WARNING, format=log_format) + logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT) elif poptions.debug: - logging.basicConfig(level=logging.DEBUG, format=log_format) + logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT) else: # Set up the logging levels - logging.basicConfig(level=logging.INFO, format=log_format) + logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) logging.getLogger("requests").setLevel(logging.WARNING) if not poptions.base_api_url and "LIMS_API_URL" in os.environ: From 276ceda76f0306ae5d9dfaff07a901473ac9ae3c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 24 Sep 2023 15:15:12 -0700 Subject: [PATCH 090/172] Altcode - embed metadata in h5ad files --- processes/altcode/altcode.nf | 6 ++++-- processes/altcode/bin/mtx_to_h5.py | 30 ++++++++++++++++++++++++-- processes/altcode/process_altcode.bash | 1 + 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 15d84499..44710aa4 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -1,6 +1,7 @@ nextflow.enable.dsl=2 params.outdir = "output" +params.metadata = "" /// Workflows @@ -26,6 +27,7 @@ workflow { ) STAR_solo.out.solo_analysis + | map { [it[0], it[1], file(params.metadata)] } | convert_to_h5ad } @@ -131,7 +133,7 @@ process convert_to_h5ad { publishDir params.outdir, mode: "copy" input: - tuple(val(meta), path(directory)) + tuple(val(meta), path(directory), path(metadata)) output: tuple(val(meta), path(directory)) @@ -142,7 +144,7 @@ process convert_to_h5ad { for dir_name in $(find -L "!{directory}" -name matrix.mtx.gz \ | grep -v "SJ/raw" \ | xargs --no-run-if-empty dirname) ; do - (mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" || kill 0 ) & + (mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" --metadata "!{metadata}" || kill 0 ) & done wait ''' diff --git a/processes/altcode/bin/mtx_to_h5.py b/processes/altcode/bin/mtx_to_h5.py index 728b4c06..cf3e8d0b 100755 --- a/processes/altcode/bin/mtx_to_h5.py +++ b/processes/altcode/bin/mtx_to_h5.py @@ -2,6 +2,7 @@ import argparse import logging +import json import scanpy as sc @@ -10,13 +11,31 @@ def parser_setup(): parser = argparse.ArgumentParser() parser.add_argument("mtx_directory", help="the directory containing the mtx files") parser.add_argument("output", help="the name of the output file") + parser.add_argument("--metadata", help="A JSON-formatted file of metadata to add") parser.add_argument("--compress", action="store_true", help="Compress output with gzip") return parser -def convert(input_dir, output_file, compress=False): +def lists_to_dicts(data): + # Recursively converts lists to dicts + # Required because of this issue https://github.com/scverse/anndata/issues/708 + if isinstance(data, list): + return { + f"_{idx}": lists_to_dicts(elem) + for idx, elem in enumerate(data) + } + if isinstance(data, dict): + for key in list(data.keys()): + data[key] = lists_to_dicts(data[key]) + return data + +def 
convert(input_dir, output_file, compress=False, metadata=None): data = sc.read_10x_mtx(input_dir, cache=False) + if metadata is not None: + # We store it two different ways because AnnData does not support lists-of-dicts + data.uns["metadata_json"] = json.dumps(metadata) + data.uns["metadata"] = lists_to_dicts(metadata) comp_method = "gzip" if compress else None data.write(filename=output_file, compression=comp_method) @@ -26,7 +45,14 @@ def main(): logging.warning( "output file extension is not '.h5ad', some programs may fail to read it" ) - convert(poptions.mtx_directory, poptions.output, poptions.compress) + + if poptions.metadata: + with open(poptions.metadata) as m: + metadata = json.load(m) + else: + metadata = None + + convert(poptions.mtx_directory, poptions.output, poptions.compress, metadata=metadata) if __name__ == "__main__": main() diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index dffc7a3d..7d67828d 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -50,6 +50,7 @@ WORKDIR=$WORKROOT/$USER/altseq/FC$FLOWCELL/work/ params=params.yaml cat >$params < Date: Sun, 1 Oct 2023 13:24:24 -0700 Subject: [PATCH 091/172] Altcode metadata - get lentitale info from TC --- processes/altcode/process_altcode.bash | 2 +- scripts/poolprocess.py | 60 ++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 7d67828d..00f8e797 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha1 +version=1.0.0-alpha2 cd "$(dirname "$0")" # Temporarily hardcoded! 
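The metadata embedded by mtx_to_h5.py in the previous patch is stored twice in the
AnnData .uns slot: once as a JSON string and once with lists converted to numbered
dicts (the workaround for AnnData not supporting lists-of-dicts). A minimal sketch of
recovering it downstream, assuming scanpy is available; the h5ad path is illustrative:

    import json
    import scanpy as sc

    adata = sc.read_h5ad("GeneFull/raw/matrix.h5ad")    # illustrative path
    pool_info = json.loads(adata.uns["metadata_json"])  # original structure, lists intact
    nested = adata.uns["metadata"]                      # same data, lists keyed as _0, _1, ...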
diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 3409efab..97cf5358 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -58,8 +58,8 @@ def parser_setup(): # help="Run for this particular alignment.") parser.add_argument("--flowcell", dest="flowcell_label", help="Run for this particular flowcell label.") - parser.add_argument("--pool", dest="pool", - help="Run for this particular pool.") + #parser.add_argument("--pool", dest="pool", + # help="Run for this particular pool.") #parser.add_argument("--tag", dest="tag", # help="Run for alignments tagged here.") #parser.add_argument("--project", dest="project", @@ -299,11 +299,15 @@ def extract_id_from_url(url): for lane_id in LIB_ID_TO_LANE_IDS[lib_id]: for aln_id in LANE_ID_TO_ALN_IDS[lane_id]: cur_aln = lowest_aln_for_pool[pool_key] - logging.debug("%s, %d, %d, %d < %d?", + logging.debug("%s, %d, %d, %d < %s?", pool_key, lib_id, lane_id, aln_id, cur_aln) if cur_aln is None or cur_aln > aln_id: lowest_aln_for_pool[pool_key] = aln_id + logging.debug("POOL_KEY_TO_LIB_IDS %s", POOL_KEY_TO_LIB_IDS) + logging.debug("LIB_ID_TO_LANE_IDS %s", LIB_ID_TO_LANE_IDS) + logging.debug("LANE_ID_TO_ALN_IDS %s", LANE_ID_TO_ALN_IDS) + logging.debug("ALN IDS %s", lowest_aln_for_pool.values()) return list(lowest_aln_for_pool.values()) @@ -364,7 +368,8 @@ def get_script_template(self, process_template): return open(script_path, 'r').read() def create_script(self, processing_info, align_id): - + logging.debug("Creating script for ALN%d", align_id) + assert len(processing_info["libraries"]) == 1 lane = processing_info["libraries"][0] alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] @@ -386,7 +391,9 @@ def create_script(self, processing_info, align_id): logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) return False - lib_info = self.api_single_result("library/?number__in=%d" % processing_info["libraries"][0]["library"])["results"][0] + lib_info_response = self.api_single_result("library/?number=%d" % lane["library"])["results"] + assert len(lib_info_response) == 1 + lib_info = lib_info_response[0] logging.debug("lib info is %s", lib_info) pool_name = lib_info["librarypools"][0]["object_name"] logging.debug("pool is %s", pool_name) @@ -521,6 +528,7 @@ def create_script(self, processing_info, align_id): def create_sample_config(self, processing_info, alignment, script_directory): alignment_id = int(alignment["id"]) + logging.debug("Creating sample config for ALN%d", alignment_id) def get_libraries_in_pool(alignment_id): @@ -549,16 +557,17 @@ def get_libraries_in_pool(alignment_id): # TODO: This is broken because the pool can be in more than one lane!!! 
assert len(pools_with_align) == 1, "Lib must have exactly one pool" align_poolkey = pools_with_align.pop() + logging.debug("Alignment ALN%d - poolkey %s", alignment_id, align_poolkey) library_ids = set(POOL_KEY_TO_LIB_IDS[align_poolkey]) + logging.debug("Lib IDs in poolkey %s: %s", align_poolkey, library_ids) return library_ids lib_ids = get_libraries_in_pool(alignment_id) def build_library_info(lib_id, flowcell_label): # FIXME: This route doesn't work right now for some reason - #lib_info = self.api_single_result("library/%d/" % lib_id) - lib_info = self.api_list_result("library/?number__in=%d" % lib_id)[0] + lib_info = self.api_single_result("library/%d/" % lib_id) barcode = "" bc1 = lib_info["barcode1__sequence"] bc2 = lib_info["barcode2__sequence"] @@ -576,7 +585,7 @@ def build_library_info(lib_id, flowcell_label): cycle = None for taggedobject_info in taggedobject_infos: # TODO: It may be better to check membership in the Insights tag - if taggedobject_info["tag_slug"].startswith("megamap-run-mmap"): + if taggedobject_info["tag_slug"].startswith("megamap-run-mmap") or taggedobject_info["tag_slug"].startswith("epicapdev-run-ecd"): if cycle is None: tag_slug = str(taggedobject_info["tag_slug"]) match = re.search(r"\d+$", tag_slug) @@ -585,7 +594,7 @@ def build_library_info(lib_id, flowcell_label): else: logging.error("problem tag slug is '%s'" % tag_slug) else: - logging.warning("Multiple megamap tags for LN%d", lib_info["number"]) + logging.warning("Multiple tags for LN%d", lib_info["number"]) def build_effector_info(effectortopool): eff = effectortopool["assemble_effector"] @@ -634,12 +643,45 @@ def build_effector_info(effectortopool): ], }) + def extract_lenti_from_tc_notes(notes): + def match_notes(regex): + match = re.search(regex, notes, re.MULTILINE | re.IGNORECASE) + if match is None: + return None + return match.group(1) + # Example notes field below: + # Talen Number: TL120935 + # Original TALE name: IL2RA-TL52068-Z + # LentiTALE: Lenti-KRAB + # MOI Estimate: 1.4 + # Virus volume: 100 + # Lenti-X Content: 15% + talen_number = match_notes(r"Talen Number: (TL\d+)\s*") + talen_name = match_notes(r"Original TALE name: (.+?)\s*$") + lentitale = match_notes(r"LentiTALE: (.+?)\s*$") + moi_estimate = match_notes(r"MOI Estimate: (.+?)\s*$") + virus_volume = match_notes(r"Virus volume: (\d+)\s*$") + lenti_x_content = match_notes(r"Lenti-X Content: (.+?)\s*$") + + return { + "talen_number": talen_number, + "talen_name": talen_name, + "lentitale": lentitale, + "moi_estimate": moi_estimate, + "virus_volume": virus_volume, + "lenti_x_content": lenti_x_content, + } + info = { "barcode": barcode, + "barcode1": bc1, + "barcode2": bc2, "library": "LN%d" % lib_info["number"], "sublibrary": lib_info["sub_library"], "sample": "DS%d" % lib_info["sample_number"], "tc": "TC%d" % tc_info["number"], + "tc_notes": tc_info["notes"], + "lentitale_from_tc_notes": extract_lenti_from_tc_notes(tc_info["notes"]), "cell_type": tc_info["sample_taxonomy__name"], "project": project_info["name"], "flowcell": flowcell_label, From 96e9044128013e6b588f9a689f95a7d5a56c8374 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 3 Oct 2023 10:32:18 -0700 Subject: [PATCH 092/172] Add Agilent Agent adapter trimming --- modules/adapter_trimming.nf | 20 ++++++++++++++++++++ processes/rna-star/process_rna_star_only.sh | 8 +++++++- processes/rna-star/star_alignment.nf | 6 ++++-- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/modules/adapter_trimming.nf b/modules/adapter_trimming.nf index d85c3aae..c89c6e0e 
100644 --- a/modules/adapter_trimming.nf +++ b/modules/adapter_trimming.nf @@ -32,6 +32,26 @@ process fastp_adapter_trim { """ } +process agent_adapter_trim { + cpus 1 + scratch false + module "jdk/11.0.16" + + input: + tuple path(r1), path(r2) + + output: + path 'output_R?.fastq.gz', emit: fastq + + script: + """ + agent.sh trim -v2 \ + -fq1 "${r1}" \ + -fq2 "${r2}" \ + -out \$PWD/output + """ +} + /// Our custom in-house adapter-trimming script process adapter_trim { cpus 3 diff --git a/processes/rna-star/process_rna_star_only.sh b/processes/rna-star/process_rna_star_only.sh index 5a416003..cdb73f75 100755 --- a/processes/rna-star/process_rna_star_only.sh +++ b/processes/rna-star/process_rna_star_only.sh @@ -8,6 +8,12 @@ if [[ "$LIBRARY_KIT" == "Smarter Stranded Total v3 Pico RNASeq with RNA Isolatio UMI_METHOD=takara-umt fi +ADAPTER_FLAG=--use_fastp +if [[ "$LIBRARY_KIT" == "Agilent Sure Select XT HS2" ]] ; then + ADAPTER_FLAG=--use_agent +fi + + source "$MODULELOAD" module load openssl-dev jdk nextflow source "$PYTHON3_ACTIVATE" @@ -38,7 +44,7 @@ NXF_VER=21.04.1 nextflow run \ --outdir "$OUT_DIR" \ --adapter_p5 "$ADAPTER_P5" \ --adapter_p7 "$ADAPTER_P7" \ - --use_fastp \ + "$ADAPTER_FLAG" \ -profile modules,cluster \ -resume \ "$@" diff --git a/processes/rna-star/star_alignment.nf b/processes/rna-star/star_alignment.nf index e7ecb808..7763da81 100644 --- a/processes/rna-star/star_alignment.nf +++ b/processes/rna-star/star_alignment.nf @@ -7,6 +7,7 @@ params.r1 = null params.r2 = null params.use_fastp = false +params.use_agent = false params.adapter_p5 = null params.adapter_p7 = null params.readlength = null @@ -18,7 +19,7 @@ params.star_threads = 8 include { star } from "./modules/star.nf" addParams(publish: false) -include { adapter_trim; fastp_adapter_trim } from "../../modules/adapter_trimming.nf" +include { adapter_trim; fastp_adapter_trim; agent_adapter_trim } from "../../modules/adapter_trimming.nf" include { move_umt; takara_trim_umt } from "../../modules/umt.nf" include { publish; publish_and_rename } from "../../modules/utility.nf" include { encode_cram; encode_cram_no_ref } from "../../modules/cram.nf" addParams(cram_write_index: false ) @@ -43,8 +44,9 @@ workflow STAR_ALIGNMENT { ref_files = file("${params.starIndexDir}/*") if (params.use_fastp) { - println "DEBUG: Using fastp" fastp_adapter_trim( [params.r1, params.r2, params.adapter_p5, params.adapter_p7] ).fastq.set { trimmed_fastq } + } else if (params.use_agent) { + agent_adapter_trim( [params.r1, params.r2] ).fastq.set { trimmed_fastq } } else { adapter_trim( [params.r1, params.r2, params.adapter_p5, params.adapter_p7] ).fastq.set { trimmed_fastq } } From fba0294a7cf3231629b1cc25531013f9ae2c77e9 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 4 Oct 2023 15:26:51 -0700 Subject: [PATCH 093/172] Altcode - save more multimapper variants. 
--- processes/altcode/altcode.nf | 42 ++++++++++++++------- processes/altcode/bin/compress_mtx_files.sh | 2 +- scripts/altcode/upload_fastq.py | 2 +- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 44710aa4..e7ed7c4e 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -9,6 +9,20 @@ params.metadata = "" /// Processes a single sample workflow { + def find_matrices_in_dir = { m, dir -> + def mtx_files = [] + dir.eachFileRecurse { f -> + // Find mtx.gz files, excluding the SJ folder + if (f.name.endsWith(".mtx.gz") && !(f.parent.parent.name == "SJ")) { + def parent_path = f.parent + def relative_path = dir.parent.toUri().relativize( parent_path.toUri() ).toString() + // Record the file, barcode file, feature file, and the relative path to store the output in + mtx_files.push([m, file(params.metadata), f, file("${f.parent}/barcodes.tsv.gz"), file("${f.parent}/features.tsv.gz"), relative_path]) + } + } + return mtx_files + } + def meta = [:] def ref_files = file("${params.genome_dir}/*") @@ -27,7 +41,7 @@ workflow { ) STAR_solo.out.solo_analysis - | map { [it[0], it[1], file(params.metadata)] } + | flatMap { find_matrices_in_dir(it[0], it[1]) } | convert_to_h5ad } @@ -102,6 +116,7 @@ process STAR_solo { --soloCBmatchWLtype 1MM \ --soloUMIdedup 1MM_All \ --soloFeatures Gene GeneFull SJ GeneFull_Ex50pAS GeneFull_ExonOverIntron \ + --soloMultiMappers Unique PropUnique Uniform Rescue EM \ --runThreadN "${num_threads}" \ --limitBAMsortRAM "${bam_sort_RAM}" \ --outSAMtype BAM Unsorted \ @@ -128,24 +143,25 @@ process STAR_solo { } process convert_to_h5ad { - cpus 10 - memory "10 GB" - publishDir params.outdir, mode: "copy" + cpus 1 + memory "2 GB" + publishDir params.outdir, mode: "copy", saveAs: {f -> "$out_dir/$f"} input: - tuple(val(meta), path(directory), path(metadata)) + tuple(val(meta), path(metadata_file), path(matrix), path(barcodes), path(features), val(out_dir)) output: - tuple(val(meta), path(directory)) + tuple(val(meta), path(out_file)) shell: + out_file = "${matrix.simpleName}.h5ad" + // scanpy requires specific file names ''' - set -m - for dir_name in $(find -L "!{directory}" -name matrix.mtx.gz \ - | grep -v "SJ/raw" \ - | xargs --no-run-if-empty dirname) ; do - (mtx_to_h5.py "$dir_name" "$dir_name/matrix.h5ad" --metadata "!{metadata}" || kill 0 ) & - done - wait + mkdir -p tmp + cp "!{matrix}" tmp/matrix.mtx.gz + cp "!{barcodes}" tmp/barcodes.tsv.gz + cp "!{features}" tmp/features.tsv.gz + mtx_to_h5.py tmp "!{out_file}" --metadata "!{metadata_file}" + rm -r tmp ''' } diff --git a/processes/altcode/bin/compress_mtx_files.sh b/processes/altcode/bin/compress_mtx_files.sh index d71b194f..c3bad4c2 100755 --- a/processes/altcode/bin/compress_mtx_files.sh +++ b/processes/altcode/bin/compress_mtx_files.sh @@ -4,7 +4,7 @@ root_directory=${1:-.} threads=${2:-10} #shellcheck disable=SC2037 -name_query=( '(' -name matrix.mtx -o -name barcodes.tsv -o -name features.tsv ')' ) +name_query=( '(' -name '*.mtx' -o -name barcodes.tsv -o -name features.tsv ')' ) # Gzip regular files find "$root_directory" -type f \ diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index 38b26017..8648f9ea 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Uploads all the results of alt-seq processing to LIMS +Uploads alt-code fastq files to LIMS """ import pprint From 059a0370494f5849bf5fbaf9ec221658d02dc554 
Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 4 Oct 2023 16:47:23 -0700 Subject: [PATCH 094/172] Altcode: Add sample plate wells to metadata --- scripts/poolprocess.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 97cf5358..4e419aff 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -566,7 +566,6 @@ def get_libraries_in_pool(alignment_id): lib_ids = get_libraries_in_pool(alignment_id) def build_library_info(lib_id, flowcell_label): - # FIXME: This route doesn't work right now for some reason lib_info = self.api_single_result("library/%d/" % lib_id) barcode = "" bc1 = lib_info["barcode1__sequence"] @@ -672,6 +671,27 @@ def match_notes(regex): "lenti_x_content": lenti_x_content, } + def sample_plate_wells(sample_info): + def info_to_data(well_info): + match = re.match(r"(.*) ([A-Z0-9]{2})", well_info["object_name"]) + well_data = { + "plate_name": match.group(1), + "well_label": well_info["label"], + "plate_id": well_info["plate_id"], + "object_label": well_info["content_object_label"], + } + if well_info["parent"]: + parent_info = self.api_single_result(url=well_info["parent"]) + well_data["well_parent"] = info_to_data(parent_info) + return well_data + wells = [] + for well in sample_info["plate_wells"]: + well_info = self.api_single_result("plate_well/%d/" % well["id"]) + well_data = info_to_data(well_info) + wells.append(well_data) + return wells + + info = { "barcode": barcode, "barcode1": bc1, @@ -683,6 +703,7 @@ def match_notes(regex): "tc_notes": tc_info["notes"], "lentitale_from_tc_notes": extract_lenti_from_tc_notes(tc_info["notes"]), "cell_type": tc_info["sample_taxonomy__name"], + "sample_plate_wells": sample_plate_wells(sample_info), "project": project_info["name"], "flowcell": flowcell_label, "cycle": cycle, From 5d752bb640db76898577cd59528118acfeb4201a Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 4 Oct 2023 16:47:54 -0700 Subject: [PATCH 095/172] Fix for miniseq linking --- scripts/flowcells/link_nextseq.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index e027f062..295119b2 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -93,9 +93,8 @@ def create_links( ) short_name = re.sub(r"_", "-", short_name) - input_dir = input_basedir input_wildcard = os.path.join( - input_dir, "%s_S*_L00?_%s_???.fastq.gz" % (short_name, read) + input_basedir, "%s_S*_%s_???.fastq.gz" % (short_name, read) ) if not dry_run and not os.path.isdir(output_dir): @@ -163,7 +162,7 @@ def main(): lane_nums = set() for lib in info["libraries"]: lib_num = int(re.sub(r'[^\d]+', '', lib)) - lane_nums.update( libs_to_lanes[lib_num] ) + lane_nums.update(libs_to_lanes[lib_num]) for lane_num in sorted(lane_nums): out_name = "%s_%s_L00%d" % (pool, barcode, lane_num) From a42425c54739710eabd98ca5e90de26828909c61 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 9 Oct 2023 10:49:30 -0700 Subject: [PATCH 096/172] Alt-code: update to use 96-barcode file for R1 --- processes/altcode/process_altcode.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 00f8e797..a4f9dd60 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -55,7 +55,7 @@ r1: "$R1_FASTQ" r2: "$R2_FASTQ" genome_dir: "$STAR_GENOME_DIR" 
genome_fasta: "$GENOME_FA" -barcode_r1_list: "$STAMPIPES_DATA/altcode/barcodes_r1_v2_48.txt" +barcode_r1_list: "$STAMPIPES_DATA/altcode/barcodes_r1_v2_96.txt" barcode_r2_list: "$STAMPIPES_DATA/altcode/barcodes_r2.txt" barcode_r3_list: "$STAMPIPES_DATA/altcode/barcodes_r3.txt" barcode_r1_pos: $R1_BARCODE_POS From 1ae8c52e99c4d41f7e694cf5ae1b5fd4755638f4 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 9 Oct 2023 10:50:25 -0700 Subject: [PATCH 097/172] fix typo in altcode metadata --- scripts/poolprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 4e419aff..fef4ef1b 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -622,7 +622,7 @@ def build_effector_info(effectortopool): "wells": [ { "plate_name": well["plate__name"], - "plane_id": well["plate_id"], + "plate_id": well["plate_id"], "well": well["label"], } for well in eff["plate_wells"] From ba4ea668f875bbde09a57bf2009d98ff1e18e7a6 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 9 Oct 2023 10:50:59 -0700 Subject: [PATCH 098/172] Altcode - add polyA trimming and adjust resources --- processes/altcode/altcode.nf | 51 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index e7ed7c4e..fceb2808 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -68,8 +68,9 @@ def pos_to_str(start, length) { process STAR_solo { publishDir params.outdir, mode: "copy" - cpus 10 - memory "50 GB" + cpus 30 + memory "80 GB" + //scratch false input: tuple( @@ -100,32 +101,36 @@ process STAR_solo { r1_files = join_list_commas(r1) r2_files = join_list_commas(r2) - num_threads = 10 + num_threads = 30 """ set -o monitor mkfifo Aligned.out.bam (STAR \ - --genomeDir "ref" \ - --readFilesIn "${r1_files}" "${r2_files}" \ - --soloType CB_UMI_Complex \ - --soloCBposition "${bc3_position}" "${bc2_position}" "${bc1_position}" \ - --soloCBwhitelist "${r3_barcodes}" "${r2_barcodes}" "${r1_barcodes}" \ - --soloUMIposition "${umi_position}" \ - --soloCBmatchWLtype 1MM \ - --soloUMIdedup 1MM_All \ - --soloFeatures Gene GeneFull SJ GeneFull_Ex50pAS GeneFull_ExonOverIntron \ - --soloMultiMappers Unique PropUnique Uniform Rescue EM \ - --runThreadN "${num_threads}" \ - --limitBAMsortRAM "${bam_sort_RAM}" \ - --outSAMtype BAM Unsorted \ - --outSAMattributes NH HI AS nM CR CY UR UY sM \ - --outBAMcompression 0 \ - --outBAMsortingThreadN "${num_threads}" \ - --readFilesCommand zcat \ - --outFileNamePrefix ./ \ - --limitOutSJcollapsed 5000000 || kill 0) & + --soloCellReadStats Standard \ + --clip3pAdapterSeq AAAAAAAAAA \ + --clip3pAdapterMMp 0.1 \ + --genomeDir "ref" \ + --readFilesIn "${r1_files}" "${r2_files}" \ + --soloType CB_UMI_Complex \ + --soloCBposition "${bc3_position}" "${bc2_position}" "${bc1_position}" \ + --soloCBwhitelist "${r3_barcodes}" "${r2_barcodes}" "${r1_barcodes}" \ + --soloUMIposition "${umi_position}" \ + --soloCBmatchWLtype 1MM \ + --soloUMIdedup 1MM_All \ + --soloFeatures Gene GeneFull SJ GeneFull_Ex50pAS GeneFull_ExonOverIntron \ + --soloMultiMappers Unique PropUnique Uniform Rescue EM \ + --runThreadN "${num_threads}" \ + --limitBAMsortRAM "${bam_sort_RAM}" \ + --outSAMtype BAM Unsorted \ + --outSAMattributes NH HI AS nM CR CY UR UY sM \ + --outBAMcompression 0 \ + --outBAMsortingThreadN "${num_threads}" \ + --readFilesCommand zcat \ + --outFileNamePrefix ./ \ + --limitOutSJcollapsed 5000000 \ + || kill 0) & samtools sort 
\ --reference "${genome_fasta}" \ @@ -144,7 +149,7 @@ process STAR_solo { process convert_to_h5ad { cpus 1 - memory "2 GB" + memory "10 GB" publishDir params.outdir, mode: "copy", saveAs: {f -> "$out_dir/$f"} input: From 8e37f053ba17fe886dd9a7ba302b872622f73786 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 9 Oct 2023 10:52:33 -0700 Subject: [PATCH 099/172] altcode: version bump to 1.0.0-alpha3 --- processes/altcode/process_altcode.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index a4f9dd60..27e37e8c 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha2 +version=1.0.0-alpha3 cd "$(dirname "$0")" # Temporarily hardcoded! From d4a5f8104db59b9254168573a30ce392ffe2126d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 9 Oct 2023 10:59:59 -0700 Subject: [PATCH 100/172] Altcode - improve workdir handling & cleaning Should have committed this before the version bump probably, hindsight is 20/20. --- processes/altcode/process_altcode.bash | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 27e37e8c..6d8fe739 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -40,10 +40,11 @@ source "$STAMPIPES/scripts/sentry/sentry-lib.bash" WORKROOT=${WORKROOT:-/net/seq/scratch} if ! [[ -d "$WORKROOT" ]] ; then - echo "WORKROOT '$WORKROOT' does not exist, using '$PWD'" - WORKROOT=$PWD + echo "WORKROOT '$WORKROOT' does not exist, using '$PWD/work'" + WORKROOT=$PWD/work +else + WORKDIR=$WORKROOT/$USER/altseq/FC$FLOWCELL/ALN$ALIGNMENT_ID/work/ fi -WORKDIR=$WORKROOT/$USER/altseq/FC$FLOWCELL/work/ # Write parameter file @@ -100,3 +101,4 @@ echo | jq . 
> "$status_file" < Date: Sun, 15 Oct 2023 14:59:05 -0700 Subject: [PATCH 101/172] altcode: Generate stats and upload to LIMS --- processes/altcode/altcode.nf | 20 + processes/altcode/bin/summarize_stats.py | 146 +++++ processes/altcode/process_altcode.bash | 2 +- scripts/altcode/upload_stats.py | 696 +++++++++++++++++++++++ scripts/poolprocess.py | 27 +- 5 files changed, 888 insertions(+), 3 deletions(-) create mode 100755 processes/altcode/bin/summarize_stats.py create mode 100644 scripts/altcode/upload_stats.py diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index fceb2808..0e49e2c9 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -44,6 +44,9 @@ workflow { | flatMap { find_matrices_in_dir(it[0], it[1]) } | convert_to_h5ad + STAR_solo.out.solo_analysis + | map { [it[0], it[1], file(params.metadata)] } + | summarize_stats } // Helper functions @@ -170,3 +173,20 @@ process convert_to_h5ad { rm -r tmp ''' } + +process summarize_stats { + cpus 1 + publishDir params.outdir, mode: "copy" + + input: + tuple val(meta), path(solo_dir), path(metadata_json) + + output: + tuple val(meta), path(stats_json) + + script: + stats_json = "stats.json" + """ + summarize_stats.py "${solo_dir}" "${metadata_json}" > "${stats_json}" + """ +} diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py new file mode 100755 index 00000000..a56e1158 --- /dev/null +++ b/processes/altcode/bin/summarize_stats.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python + +import argparse +import csv +import json +import logging +import os +import pathlib +import pprint +import re + +from collections import defaultdict + +def parse_args(): + parser = argparse.ArgumentParser( + prog="summarize_stats.py", + description="Parses StarSOLO output and summarizes by barcode", + ) + parser.add_argument("solo_dir") + parser.add_argument("pool_info_file") + parser.add_argument("--subdir", default="GeneFull") + return parser.parse_args() + + +def parse_pool_info(filename): + with open(filename) as f: + json_data = json.load(f) + return json_data + + +def parse_cellreads(filename): + with open(filename) as f: + data = [] + for row in csv.DictReader(f, delimiter="\t"): + for (k, v) in row.items(): + try: + row[k] = int(v) + except: + pass + data.append(row) + return data + +def parse_summary_stats(filename): + with open(filename) as f: + data = {} + for line in f: + (key, val) = line.strip().split(",") + try: + val = int(val) + except ValueError: + val = float(val) + data[key] = val + return data + +def parse_barcode_stats(filename): + with open(filename) as f: + data = {} + for line in f: + match = re.match(r"\s*(\w+)\s*(\d+)\s*", line) + (key, val) = (match.group(1), match.group(2)) + data[key] = val + return data + +REVCOM = {"A": "T", "T": "A", "C": "G", "G": "C"} +def revcom(bc): + if bc is None: + return None + return "".join(REVCOM[x] for x in reversed(bc)) + + +def summarize_by_sample(pool_info, stats): + """ + Stats is a list of observed cell barcodes & how many we saw / how well they + mapped. + Given the stats and pool_info, sum/group them appropriately by barcode + Right now it's kind of backwards. 
+ For example, a library with barcode2 = "TTTAAGCG" will contain all cells + that end with "_CGCTTAAA" (the reverse complement) + """ + def build_barcode_to_sample_lookup(pool_info, stats): + barcode_to_sample = {} + for lib in pool_info["libraries"]: + bc = revcom(lib["barcode2"]) + barcode_to_sample[bc] = lib["sample"] + return barcode_to_sample + + + # Stub out keys + data = { + "barcode_mapping": {}, + "barcode_stats": {}, + "flowcell_label": {}, + "pool": {}, + "samples": {}, + "summary_stats": {}, + } + + bc_to_sample = build_barcode_to_sample_lookup(pool_info, stats) + samples = {} + for cell in stats: + total_bc = cell["CB"] + if "_" not in total_bc: + if total_bc not in ["CBnotInPasslist"]: + logging.warning("Skipping possible barcode %s", total_bc) + continue + (_, _1, bc) = total_bc.split("_") + if bc not in samples: + samples[bc] = defaultdict(int) + for (k, v) in cell.items(): + if k == "CB": + continue + samples[bc][k] += int(v) + # Convert back to strings (ew) + for bc in samples: + for (k, v) in samples[bc].items(): + samples[bc][k] = str(v) + + pool_set = set(lib["pool"] for lib in pool_info["libraries"]) + assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set + data["pool"] = pool_set.pop() + flowcell_set = set(lib["flowcell"] for lib in pool_info["libraries"]) + assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + data["flowcell_label"] = flowcell_set.pop()[2:] + + data["barcode_mapping"] = bc_to_sample + data["samples"] = samples + + return data + +def main(): + opts = parse_args() + cfg = parse_pool_info(opts.pool_info_file) + + gene_dir = os.path.join(opts.solo_dir, opts.subdir) + cell_reads_filename = os.path.join(gene_dir, "CellReads.stats") + + samples = parse_cellreads(cell_reads_filename) + + data = summarize_by_sample(cfg, samples) + data["summary_stats"] = parse_summary_stats(os.path.join(gene_dir, "Summary.csv")) + data["barcode_stats"] = parse_barcode_stats(os.path.join(opts.solo_dir, "Barcodes.stats")) + + print(json.dumps(data)) + +if __name__ == "__main__": + main() diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 6d8fe739..80d4fe55 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -87,7 +87,7 @@ nextflow run \ # Create sentinel/status file if [[ -e "$status_file" ]] ; then - old_date=$(jq .completed_on <<< "$status_file") + old_date=$(jq .completed_on < "$status_file") old_status_file=${status_file/json/$old_date}.json mv "$status_file" "$old_status_file" fi diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py new file mode 100644 index 00000000..a1120691 --- /dev/null +++ b/scripts/altcode/upload_stats.py @@ -0,0 +1,696 @@ +#!/usr/bin/env python3 +""" +Uploads all the results of alt-code processing to LIMS +""" + +import pprint +import re +import csv +import argparse +import datetime +import hashlib +import json +import logging +import os +import sys +import glob +from functools import lru_cache +from collections import defaultdict +from distutils.version import LooseVersion + +# Make sure we can load our vendored stamlims_api dependency +sys.path.insert( + 1, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "..", "lims", "stamlims_api" + ), +) + + +from stamlims_api import rest # pylint: disable=wrong-import-position,import-error + +JSON_REPORT_CLASS_SLUG = "altcode-flowcell-report-starsolo" + +LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s 
- %(message)s" +LOG = logging.getLogger("upload_data.py") + +script_options = { + "base_api_url": None, + "quiet": False, + "debug": False, + "dry_run": False, +} + + +class HashableDict(dict): + """ + A simple hashable dict + Helps cache our GET requests even w/ query params + """ + + def __hash__(self): + return hash(frozenset(self.items())) + + +def parser_setup(): + """Command-line argument setup""" + parser = argparse.ArgumentParser() + + run_opts = parser.add_argument_group("core params") + log_opts = parser.add_argument_group("logging options") + lims_opts = parser.add_argument_group("lims options") + + log_opts.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages (only WARN and higher).", + ) + log_opts.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages.", + ) + + lims_opts.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + lims_opts.add_argument( + "-t", "--token", dest="token", help="Your authentication token." + ) + + # TODO: Should we allow registering each align dir by itself? + run_opts.add_argument("flowcell_dir", help="The flowcell directory") + + #run_opts.add_argument("sample_config", help="The sample_config.tsv file") + #run_opts.add_argument("processing_json", help="The processing.json file") + #run_opts.add_argument( + # "--output_file_directory", + # default=".", + # help="The output directory files are stored in. Defaults to cwd.", + #) + + #run_opts.add_argument( + # "--skip_md5", + # dest="skip_md5", + # action="store_true", + # help="Don't calculate md5sum (debug/dev only)", + #) + + run_opts.add_argument( + "-n", + "--dry_run", + dest="dry_run", + action="store_true", + help="Do not upload anything to LIMS, instead print actions that would be taken", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) + + return parser + + +def md5sum_file(path): + """Calculates the md5sum of a file's contents""" + md5sum = hashlib.md5() + + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + md5sum.update(chunk) + + return md5sum.hexdigest() + + +def parse_counts_file(counts_file: str): + """ + Given a file name, reads a stats file + format: one stat per line: `name value` (separated by whitespace) + returns a dict of str->int + """ + stats = {} + with open(counts_file, "r") as counts: + for line in counts: + values = line.split() + count_type_name = values[0] + if not count_type_name: + continue + count = int(values[1]) + stats[count_type_name] = count + return stats + + +def build_counts(alignment_id, counts_file): + """ + Convert stats into a form ready to be uploaded to LIMS with the + bulk-stat-create endpoint + """ + parsed_stats = parse_counts_file(counts_file) + return { + "object_id": alignment_id, + "content_type": "SequencingData.flowcelllanealignment", + "stats": parsed_stats, + } + + +class UploadLIMS: + """ + Contains the logic for uploading things to LIMS + Uses caching for most GET requests + """ + + def __init__(self, api_url, token, dry_run=False, skip_md5=False): + # self.count_types = {} + # self.flowcelllane_contenttype = None + # self.alignment_contenttype = None + # self.aggregation_contenttype = None + self.api = rest.setup_api( + { + rest.LIMS_URL_OPT_VAR: api_url, + rest.LIMS_TOKEN_OPT_VAR: token, + #rest.RAISE_ON_ERROR_VAR: True, + } + ) + self.dry_run = dry_run + self.skip_md5 = skip_md5 + + 
@lru_cache(maxsize=None) + def get(self, url): + """Cached version of api.get_single_result""" + return self.api.get_single_result(url) + + def get_by_id(self, base_url, object_id, err_message=None): + """Constructs url from ID and calls get""" + url = "%s/%d/" % (base_url, object_id) + result = self.get(url) + if not result: + if err_message is None: + err_message = "Failed to fetch %s" % url + LOG.critical(err_message) + return result + + @lru_cache(maxsize=None) + def _get_single_result(self, fetch_url, query=None, field=None): + """Internal memo-izable function, do not use directly""" + result = self.api.get_single_list_result( + url_addition=fetch_url, query_arguments=query + ) + if result is None: + return None + if field is not None: + return result[field] + return result + + def get_single_result(self, fetch_url, query=None, field=None): + """ + Using a list API url that should bring up a single item, retrieve that + single item if it exists. + """ + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + return self._get_single_result(fetch_url, query, field) + + # Not currently used + @lru_cache(maxsize=None) + def _get_list_result(self, url, query=None): + return self.api.get_list_result( + url_addition=url, + query_arguments=query, + item_limit=1000000, + page_size=1000, + ) + + def get_list_result(self, url, query=None): + if isinstance(query, dict) and not isinstance(query, HashableDict): + query = HashableDict(query) + LOG.debug("Query is now: %s", query) + return self._get_list_result(url, query) + + def put(self, *args, **kwargs): + """ + PUT data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have put %s, %s", args, kwargs) + return None + # FIXME: Should use PUT method once API lib supports it + return self.api.patch_single_result(*args, **kwargs) + + def post(self, *args, **kwargs): + """ + POST data to LIMS + """ + if self.dry_run: + LOG.info("Dry run, would have post %s, %s", args, kwargs) + return None + return self.api.post_single_result(*args, **kwargs) + + def patch(self, *args, **kwargs): + if self.dry_run: + LOG.info("Dry run, would have patch %s, %s", args, kwargs) + return None + return self.api.patch_single_result(*args, **kwargs) + + # def get_flowcell_url_by_label(self, label): + # return self.get_single_result( + # "flowcell_run/", field="url", query={"label": label} + # ) + + def get_contenttype(self, contenttype_name): + """ + Appname uses capitalization, modelname does not. 
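        For example, with the content type name used later in this script,
        get_contenttype("SequencingData.flowcelllane") queries content_type/
        with app_label="SequencingData" and model="flowcelllane".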
+ """ + + (appname, modelname) = contenttype_name.split(".") + + query = { + "app_label": appname, + "model": modelname, + } + ct = self.get_single_result("content_type/", query=query) + if not ct: + LOG.critical("Could not fetch content type %s", contenttype_name) + + return ct + + def get_file_purpose_url(self, slug): + """Get file purpose url from slug""" + return self.get_single_result( + "file_purpose/", query={"slug": slug}, field="url" + ) + + def get_file_type_url(self, slug): + """Gets the file type URL for a slug""" + return self.get_single_result("file_type/", field="url", query={"slug": slug}) + + def upload_directory_attachment( + self, path, contenttype_name, object_id, file_purpose=None + ): + """Uploads a single directory to a LIMS object""" + path = os.path.abspath(path) + if not (contenttype_name and object_id): + LOG.error( + "Cannot attach file %s without both content type and object_id", path + ) + return False + + contenttype = self.get_contenttype(contenttype_name) + if not contenttype: + LOG.error("Cannot attach file %s without contenttype result", path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + if file_purpose and not purpose: + LOG.error( + "Could not find file purpose %s for uploading directory %s", + file_purpose, + path, + ) + return False + LOG.debug("File purpose: %s", purpose) + + existing_data = self.get_single_result("directory/", query={"path": path}) + data = existing_data if existing_data else {} + + data.update( + { + "path": path, + "content_type": contenttype["url"], + "object_id": object_id, + "purpose": purpose, + } + ) + + if existing_data: + LOG.info("Updating information for directory %s", path) + result = self.put(url=data["url"], data=data) + else: + LOG.info("Uploading information for directory %s", path) + result = self.post("directory/", data=data) + + if not result: + LOG.error("Could not upload directory %s", path) + LOG.debug(data) + else: + LOG.debug(result) + + return True + + def upload_file( + self, path, contenttype_name, object_ids, file_purpose=None, file_type=None + ): + """ + Upload a file's metadata to LIMS + It will be attached to many objects. 
+ """ + # FIXME: This method makes a GET and PUT request for every single object + # Will require LIMS API updates to enable a more performant solution + + upload_data = self.get_file_upload_data( + path, contenttype_name, file_purpose, file_type + ) + LOG.debug("Uploading file %s, to %d objects", path, len(object_ids)) + if self.skip_md5: + LOG.info("Skipping md5sum") + upload_data["md5sum"] = "0" + else: + LOG.debug("Running md5sum...") + upload_data["md5sum"] = md5sum_file(path) + + content_type_id = re.search(r"(\d+)/?$", upload_data["content_type"]).group(1) + purpose_id = re.search(r"(\d+)/?$", upload_data["purpose"]).group(1) + for object_id in object_ids: + upload_data.update({"object_id": object_id}) + exists = self.get_single_result( + "file/", + query={ + "object_id": object_id, + "purpose": purpose_id, + "content_type": content_type_id, + }, + ) + + if exists: + if exists == upload_data: + LOG.info( + "No change to information for file %s, lane %d, not updating", + path, + object_id, + ) + result = True + else: + LOG.info( + "Updating information for file %s: lane %d", path, object_id + ) + result = self.put(url=exists["url"], data=upload_data) + else: + LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + result = self.post("file/", data=upload_data) + + if not result: + LOG.error("Could not upload file %s for ID %d", path, object_id) + LOG.debug(upload_data) + else: + LOG.debug(result) + + def get_file_upload_data( + self, path, contenttype_name, file_purpose=None, file_type=None + ): + """ + Gets the file upload data that is easy to query + (notable omission: md5sum, as it takes a long time to calculate) + """ + path = os.path.abspath(path) + + contenttype = self.get_contenttype(contenttype_name) + if not contenttype: + LOG.error("Cannot attach file %s without contenttype result", path) + return False + + purpose = self.get_file_purpose_url(file_purpose) + if file_purpose and not purpose: + LOG.error( + "Could not find file purpose %s for uploading file %s", + file_purpose, + path, + ) + return False + if purpose: + LOG.debug("File Purpose: %s", purpose) + + ftype = self.get_file_type_url(file_type) + if file_type and not ftype: + LOG.error( + "Could not find file type %s for uploading file %s", file_type, path + ) + return False + if file_type: + LOG.debug("File Type: %s", ftype) + + file_size = os.path.getsize(path) + last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + + # Current issue: sub-second precision. 
+ data = { + "path": path, + "content_type": contenttype["url"], + "purpose": purpose, + "filetype": ftype, + "file_last_modified": last_modified, + "size_bytes": file_size, + } + + LOG.debug(data) + return data + + def get_flowcell_lane(self, flowcell_lane_id): + """Gets the flowcell lane by ID""" + return self.get_by_id("flowcell_lane", flowcell_lane_id) + + def get_library(self, library_id): + """Gets the library by ID (NOT library number)""" + return self.get_by_id("library", library_id) + + def upload_flowcell_report(self, data): + flowcell_labels = set(pool["flowcell_label"] for pool in data) + assert len(flowcell_labels) == 1 + flowcell_label = flowcell_labels.pop() + + report_name = "Alt-code stats: FC%s" % flowcell_label + + flowcell_lims_info = self.get_single_result( + "flowcell_run/?label=%s" % flowcell_label) + content_type_id = flowcell_lims_info['object_content_type'] + content_type = self.get_by_id("content_type", content_type_id) + object_id = flowcell_lims_info['id'] + json_report_class = self.get_single_result( + "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) + + # See if report already exists + existing_reports = self.get_list_result("json_report/", query={ + "object_id": object_id, + "content_type": content_type["id"], + "report_class": json_report_class["id"], + "page_size": 2, + }) + + data_to_send = { + "object_id": object_id, + "content_type": content_type["url"], + "report_class": json_report_class["url"], + "name": report_name, + "json_content": json.dumps(data), + } + if len(existing_reports) == 0: + self.post("json_report/", data=data_to_send) + # No report exists yet, upload a new one + elif len(existing_reports) == 1: + # Exactly one report, update it + url_to_patch = "json_report/%d/" % existing_reports[0]["id"] + self.patch(url_to_patch, data=data_to_send) + else: + # Error! too many reports + LOG.critical("Too many JSON reports exist") + raise "Too many JSON reports exist, exiting" + + + def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): + """ + Main function for this script. 
+ Given paths to the sample_config file, processing_dict, and outdir, + upload to LIMS: + 1) Paths for fastq files for each lane + # 2) Stats for each alignment + 3) Flowcell-level pool stats + """ + # (Filepath, purpose) -> [lane_ids] + files_to_upload = defaultdict(list) + + # Augment processing_dict with sample_config info + processing_info = [] + for row in sample_config: + barcode_index = row["barcode_index"] + lane = int(row["lane"]) + pool_name = row["pool_name"] + sample_name = row["sample_name"] + for idx, lib in enumerate(processing_dict["libraries"]): + if int(lib["lane"]) == lane and lib["barcode_index"] == barcode_index: + lib.update({"pool_name": pool_name, "sample_name": sample_name}) + processing_info.append(lib) + + # TODO: Doesn't yet make use of the above augmented info + for row in sample_config: + (idx, _otheridx) = row["barcode_index"].split("-") + lane = int(row["lane"]) + name = row["pool_name"] + LOG.debug("idx=%s, lane=%d, name=%s", idx, lane, name) + # Get lane IDs for each file + lane_ids = [ + l["id"] + for l in processing_dict["libraries"] + if l["barcode1"]["reverse_sequence"] == idx and int(l["lane"]) == lane + ] + r1_file = os.path.join(outdir, name, "R1.fq.gz") + r2_file = os.path.join(outdir, name, "R2.fq.gz") + if not os.path.exists(r1_file): + raise Exception("No file %s" % r1_file) + if not os.path.exists(r2_file): + raise Exception("No file %s" % r2_file) + + files_to_upload[(r1_file, "r1-fastq")].extend(lane_ids) + files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) + + # Upload files. + for ((path, purpose), lane_ids) in files_to_upload.items(): + # print(path, purpose, len(lane_ids)) + self.upload_file( + path, + "SequencingData.flowcelllane", + list(set(lane_ids)), + file_purpose=purpose, + file_type="fastq", + ) + + # Commented out because we aren't making alignments for these... + # # Now upload counts. + # # We can do this all as one call. + # # (Assuming LIMS doesn't time out) + # all_counts = [] + # for lib in processing_info: + # if not len(lib["alignments"]) == 1: + # LOG.critical("Lib must have exactly 1 aligment %s", lib) + # align_id = lib["alignments"][0]["id"] + # counts_file = os.path.join( + # outdir, + # lib["pool_name"], + # "analysis", + # "Gene", + # "%s.stats.txt" % lib["sample_name"], + # ) + # all_counts.append(build_counts(align_id, counts_file)) + # # print(json.dumps(all_counts)) + # self.post("stats/create/", all_counts) + + with open(os.path.join(outdir, "flowcell_stats.json")) as json_file: + flowcell_data = json.loads(json_file.read()) + self.upload_flowcell_report(flowcell_data) + +def find_stat_files_in_dir(flowcell_directory): + """ + Given a directory to search, finds the newest alt-code stats files + """ + path = "%s/Project_*/LibraryPool_*/align*" % flowcell_directory + LOG.debug("Searching path %s", path) + align_dirs = glob.glob(path) + LOG.debug("Align dirs: %s", align_dirs) + newest_stat_files = [] + version_regex = re.compile( + r"""output + [_-]? + (?P[0-9.]+) + [_-]? + (?P[a-z]+)? + [_-]? + (?P[0-9.]*) + """, re.VERBOSE | re.IGNORECASE) + def sortkey(a): + # Logic: there are up to 3 parts + # 1) Regular version, like "2.0" or "3.2.1". 
+ # 2) pre-release, like "alpha" + # 3) teeny suffix after alpha, like "3" or "1" + match = version_regex.match(os.path.basename(a)) + versn = match.group("versn") if match and match.group("versn") else 0 + greek = match.group("greek") if match and match.group("greek") else "zzzzzz" # last + teeny = match.group("teeny") if match and match.group("teeny") else 0 + return (LooseVersion(versn), greek, LooseVersion(teeny)) + + for align_dir in align_dirs: + # TODO: should we filter to output directories that have output? completed status in status.json? + out_dirs = glob.glob("%s/output*" % align_dir) + LOG.debug("Considering: %s", out_dirs) + if len(out_dirs) == 0: + LOG.error("No output directories in %s", align_dir) + continue + out_dir = max(out_dirs, key=sortkey) + LOG.debug("Selected: %s", out_dir) + file = os.path.join(out_dir, "stats.json") + newest_stat_files.append(file) + return newest_stat_files + + +def main(): + """ + This is the main body of the program that uses the arguments from the + command line. + """ + + parser = parser_setup() + poptions = parser.parse_args() + + if poptions.quiet: + logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT) + elif poptions.debug: + logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT) + else: + # Set up the default logging levels + logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) + # Make this a little less noisy by default + requests_log = logging.getLogger("requests.packages.urllib3.connectionpool") + requests_log.setLevel(logging.WARN) + + if not poptions.base_api_url and "LIMS_API_URL" in os.environ: + api_url = os.environ["LIMS_API_URL"] + LOG.debug("Using LIMS API endpoint: %s from environment", api_url) + elif poptions.base_api_url: + api_url = poptions.base_api_url + LOG.debug("Using LIMS API endpoint: %s from options", api_url) + else: + sys.stderr.write("Could not find LIMS API URL.\n") + sys.exit(1) + + if not poptions.token and "LIMS_API_TOKEN" in os.environ: + token = os.environ["LIMS_API_TOKEN"] + elif poptions.token: + token = poptions.token + else: + sys.stderr.write("Could not find LIMS API TOKEN.\n") + sys.exit(1) + + uploader = UploadLIMS( + api_url, token, dry_run=poptions.dry_run, + ) + + stats_files = find_stat_files_in_dir(poptions.flowcell_dir) + LOG.debug("Stats files: %s", stats_files) + all_stats = [] + for file_name in stats_files: + try: + with open(file_name) as f: + all_stats.append(json.load(f)) + except Exception as e: + LOG.error("Could not read file %s: %s", file_name, e) + LOG.debug("All stats: %s", all_stats) + + uploader.upload_flowcell_report(all_stats) + + + #with open(poptions.sample_config) as f: + # sample_config = list(csv.DictReader(f, delimiter="\t")) + #with open(poptions.processing_json) as f: + # processing = json.loads(f.read()) + #uploader.upload_altcode_flowcell( + # sample_config, processing, poptions.output_file_directory + #) + + +# This is the main body of the program that only runs when running this script +# doesn't run when imported, so you can use the functions above in the shell +# after importing without automatically running it +if __name__ == "__main__": + main() diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index fef4ef1b..8d0ee30f 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -250,6 +250,7 @@ def setup_flowcell(self, flowcell_label): logging.debug("align ids: %s", align_ids) #alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) 
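        # (Context: setup_alignments() writes the per-pool processing scripts; the
        # add_stats_upload() call added below appends one more sbatch job, gated on
        # those jobs via $PROCESSING, that runs scripts/altcode/upload_stats.py for
        # the flowcell.)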
self.setup_alignments(align_ids) + self.add_stats_upload(flowcell_label) def get_alignment_ids(self, flowcell_label: str) -> [int]: """ @@ -359,6 +360,26 @@ def add_script(self, align_id, processing_info, script_file, sample_name): outfile.write("jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING=\"$PROCESSING,$jobid\"\n\n" % (fullname, fullname, fullname, self.qsub_queue, ram_megabytes, script_file)) outfile.close() + + def add_stats_upload(self, flowcell_label): + job_name = ".upload-altcode-%s" % flowcell_label + template = textwrap.dedent( + """\ + cd "$FLOWCELLS"/FC{label}_* + sentinel_dependencies=$(echo $PROCESSING | sed -e 's/,/,afterany:/g' | sed -e 's/^,afterany/--dependency=afterany/g') + sbatch --export=ALL -J {job_name} -o {job_name}.o%A -e {job_name}.e%A --partition={queue} --cpus-per-task=1 --ntasks=1 $sentinel_dependencies --mem-per-cpu=1000 --parsable --oversubscribe <<__UPLOAD_POOL_DATA__ + #!/bin/bash + python $STAMPIPES/scripts/altcode/upload_stats.py "$PWD" + __UPLOAD_POOL_DATA__""") + content = template.format( + label=flowcell_label, + job_name=job_name, + queue=self.qsub_queue, + ) + + with open(self.outfile, 'a') as outfile: + outfile.write(content) + def get_script_template(self, process_template): if self.script_template: @@ -524,9 +545,9 @@ def create_script(self, processing_info, align_id): outfile.close() # Create the config file as well - self.create_sample_config(processing_info, alignment, script_directory) + self.create_sample_config(processing_info, alignment, script_directory, pool_name) - def create_sample_config(self, processing_info, alignment, script_directory): + def create_sample_config(self, processing_info, alignment, script_directory, pool_name): alignment_id = int(alignment["id"]) logging.debug("Creating sample config for ALN%d", alignment_id) @@ -704,10 +725,12 @@ def info_to_data(well_info): "lentitale_from_tc_notes": extract_lenti_from_tc_notes(tc_info["notes"]), "cell_type": tc_info["sample_taxonomy__name"], "sample_plate_wells": sample_plate_wells(sample_info), + "library_plate_wells": sample_plate_wells(lib_info), "project": project_info["name"], "flowcell": flowcell_label, "cycle": cycle, "effector_pools": pool_info, + "pool": pool_name, } return info From bf45a5e2a4a9bd76c928c110f46d3f69253ce51e Mon Sep 17 00:00:00 2001 From: msb Date: Wed, 25 Oct 2023 09:27:50 -0700 Subject: [PATCH 102/172] added Effector Assembly QC to lentitale object --- scripts/poolprocess.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 8d0ee30f..bde3a1bd 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,13 +1,15 @@ #import csv -import json -import os -import sys import argparse +import json import logging +import os import re -import requests +import sys import textwrap from collections import OrderedDict, defaultdict + +import requests + try: from concurrent.futures import ThreadPoolExecutor except ImportError: @@ -682,6 +684,7 @@ def match_notes(regex): moi_estimate = match_notes(r"MOI Estimate: (.+?)\s*$") virus_volume = match_notes(r"Virus volume: (\d+)\s*$") lenti_x_content = match_notes(r"Lenti-X Content: (.+?)\s*$") + effector_assembly_qc = match_notes(r"Effector Assembly QC: (.+?)\s*$") return { "talen_number": talen_number, @@ -690,6 +693,7 @@ def match_notes(regex): "moi_estimate": 
moi_estimate, "virus_volume": virus_volume, "lenti_x_content": lenti_x_content, + "effector_assembly_qc": effector_assembly_qc, } def sample_plate_wells(sample_info): From 0910298fe760842414767fd299241d123a2b9978 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 25 Oct 2023 11:37:53 -0700 Subject: [PATCH 103/172] edgecase fix for altcode stat finding --- scripts/altcode/upload_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py index a1120691..27d51aa0 100644 --- a/scripts/altcode/upload_stats.py +++ b/scripts/altcode/upload_stats.py @@ -608,7 +608,8 @@ def sortkey(a): versn = match.group("versn") if match and match.group("versn") else 0 greek = match.group("greek") if match and match.group("greek") else "zzzzzz" # last teeny = match.group("teeny") if match and match.group("teeny") else 0 - return (LooseVersion(versn), greek, LooseVersion(teeny)) + length = -1 * len(a) # Prefer shorter names to longer + return (LooseVersion(versn), greek, LooseVersion(teeny), length) for align_dir in align_dirs: # TODO: should we filter to output directories that have output? completed status in status.json? From d8a8d4343375e34df121a30e709c51c424d347e8 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 25 Oct 2023 11:40:16 -0700 Subject: [PATCH 104/172] altcode alpha4 - summarize by library --- processes/altcode/bin/summarize_stats.py | 60 +++++++++++++++++++++++- processes/altcode/nextflow.config | 2 +- processes/altcode/process_altcode.bash | 2 +- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index a56e1158..ad42548e 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -67,6 +67,64 @@ def revcom(bc): return None return "".join(REVCOM[x] for x in reversed(bc)) +def summarize_by_library(pool_info, stats): + """ + Stats is a list of observed cell barcodes & how many we saw / how well they + mapped. + Given the stats and pool_info, sum/group them appropriately by barcode + Right now it's kind of backwards. 
+ For example, a library with barcode2 = "TTTAAGCG" will contain all cells + that end with "_CGCTTAAA" (the reverse complement) + """ + def build_barcode_to_library_lookup(pool_info, stats): + barcode_to_library = {} + for lib in pool_info["libraries"]: + bc = revcom(lib["barcode2"]) + barcode_to_library[bc] = lib["library"] + return barcode_to_library + + + # Stub out keys + data = { + "barcode_mapping": {}, + "barcode_stats": {}, + "flowcell_label": {}, + "pool": {}, + "libraries": {}, + "summary_stats": {}, + } + + bc_to_library = build_barcode_to_library_lookup(pool_info, stats) + libraries = {} + for cell in stats: + total_bc = cell["CB"] + if "_" not in total_bc: + if total_bc not in ["CBnotInPasslist"]: + logging.warning("Skipping possible barcode %s", total_bc) + continue + (_, _1, bc) = total_bc.split("_") + if bc not in libraries: + libraries[bc] = defaultdict(int) + for (k, v) in cell.items(): + if k == "CB": + continue + libraries[bc][k] += int(v) + # Convert back to strings (ew) + for bc in libraries: + for (k, v) in libraries[bc].items(): + libraries[bc][k] = str(v) + + pool_set = set(lib["pool"] for lib in pool_info["libraries"]) + assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set + data["pool"] = pool_set.pop() + flowcell_set = set(lib["flowcell"] for lib in pool_info["libraries"]) + assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + data["flowcell_label"] = flowcell_set.pop()[2:] + + data["barcode_mapping"] = bc_to_library + data["libraries"] = libraries + + return data def summarize_by_sample(pool_info, stats): """ @@ -136,7 +194,7 @@ def main(): samples = parse_cellreads(cell_reads_filename) - data = summarize_by_sample(cfg, samples) + data = summarize_by_library(cfg, samples) data["summary_stats"] = parse_summary_stats(os.path.join(gene_dir, "Summary.csv")) data["barcode_stats"] = parse_barcode_stats(os.path.join(opts.solo_dir, "Barcodes.stats")) diff --git a/processes/altcode/nextflow.config b/processes/altcode/nextflow.config index d5619377..a18cf187 100644 --- a/processes/altcode/nextflow.config +++ b/processes/altcode/nextflow.config @@ -12,5 +12,5 @@ process { apptainer { enabled = true cacheDir = "$HOME/.apptainer_nextflow_cache" - runOptions = "--bind /net/seq/data2/,/net/seq/data/,${baseDir}" + runOptions = "--bind /net/seq/scratch,/net/seq/data2/,/net/seq/data/,${baseDir}" } diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 80d4fe55..f497e523 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha3 +version=1.0.0-alpha4 cd "$(dirname "$0")" # Temporarily hardcoded! 
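As a reading aid for the "summarize by library" change above, a minimal standalone sketch of the grouping it performs (the function name and the "cbMatch" field are illustrative stand-ins, not the exact STARsolo CellReads columns):

    from collections import defaultdict

    def group_by_library_barcode(cells):
        # Each cell's "CB" is a combined "bc1_bc2_bc3" barcode; the last component
        # identifies the library. Sum every numeric stat per library barcode.
        totals = defaultdict(lambda: defaultdict(int))
        for cell in cells:
            if "_" not in cell["CB"]:
                continue  # e.g. the "CBnotInPasslist" pseudo-entry
            lib_bc = cell["CB"].split("_")[-1]
            for key, value in cell.items():
                if key != "CB":
                    totals[lib_bc][key] += int(value)
        return totals

    # group_by_library_barcode([{"CB": "AAACCC_GGGTTT_CGCTTAAA", "cbMatch": "10"},
    #                           {"CB": "TTTGGG_CCCAAA_CGCTTAAA", "cbMatch": "5"}])
    # -> {"CGCTTAAA": {"cbMatch": 15}} (as nested defaultdicts)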
From 3a20e4138c153ec7bdacf76da820c2f8959b4696 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 31 Oct 2023 16:35:29 -0700 Subject: [PATCH 105/172] Fix whitespace --- processes/altcode/altcode.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 0e49e2c9..626be795 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -136,7 +136,7 @@ process STAR_solo { || kill 0) & samtools sort \ - --reference "${genome_fasta}" \ + --reference "${genome_fasta}" \ -o Aligned.out.cram \ --output-fmt-option "version=3.0,level=7" \ --threads "${num_threads}" \ From ee5fbd8d98eba01a9650126d8e85ff8596dfa234 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 31 Oct 2023 16:36:38 -0700 Subject: [PATCH 106/172] New metadata fmt --- processes/altcode/bin/summarize_stats.py | 20 ++-- scripts/poolprocess.py | 115 +++++++++++++++++++++-- 2 files changed, 122 insertions(+), 13 deletions(-) diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index ad42548e..9302c105 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -79,8 +79,16 @@ def summarize_by_library(pool_info, stats): def build_barcode_to_library_lookup(pool_info, stats): barcode_to_library = {} for lib in pool_info["libraries"]: - bc = revcom(lib["barcode2"]) - barcode_to_library[bc] = lib["library"] + #bc = revcom(lib["barcode2"]) + # Some old backward-compatibility + # Newer stuff is at the top of the list + if "sample_barcode" in lib: + bc = lib["sample_barcode"] + elif "barcode2" in lib: + bc = revcom(lib["sample_barcode"]) + elif "additional_information" in lib and "barcode2" in lib["additional_information"]: + bc = revcom(lib["additional_information"]["sample_barcode"]) + barcode_to_library[bc] = lib["LN#"] return barcode_to_library @@ -114,10 +122,10 @@ def build_barcode_to_library_lookup(pool_info, stats): for (k, v) in libraries[bc].items(): libraries[bc][k] = str(v) - pool_set = set(lib["pool"] for lib in pool_info["libraries"]) + pool_set = set(lib["library_pool"] for lib in pool_info["libraries"]) assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set data["pool"] = pool_set.pop() - flowcell_set = set(lib["flowcell"] for lib in pool_info["libraries"]) + flowcell_set = set(lib["additional_information"]["flowcell"] for lib in pool_info["libraries"]) assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set data["flowcell_label"] = flowcell_set.pop()[2:] @@ -173,10 +181,10 @@ def build_barcode_to_sample_lookup(pool_info, stats): for (k, v) in samples[bc].items(): samples[bc][k] = str(v) - pool_set = set(lib["pool"] for lib in pool_info["libraries"]) + pool_set = set(lib["library_pool"] for lib in pool_info["libraries"]) assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set data["pool"] = pool_set.pop() - flowcell_set = set(lib["flowcell"] for lib in pool_info["libraries"]) + flowcell_set = set(lib["additional_information"]["flowcell"] for lib in pool_info["libraries"]) assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set data["flowcell_label"] = flowcell_set.pop()[2:] diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index bde3a1bd..067d839d 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -345,7 +345,7 @@ def extract_id_from_url(url): def add_script(self, align_id, processing_info, 
script_file, sample_name): - ram_megabytes = 2000 + ram_megabytes = 4000 if not self.outfile: logging.debug("Writing script to stdout") @@ -589,6 +589,12 @@ def get_libraries_in_pool(alignment_id): lib_ids = get_libraries_in_pool(alignment_id) def build_library_info(lib_id, flowcell_label): + errors = [] + def add_error(fmt, *args): + err_msg = fmt % args + errors.append(err_msg) + logging.error(fmt, *args) + lib_info = self.api_single_result("library/%d/" % lib_id) barcode = "" bc1 = lib_info["barcode1__sequence"] @@ -614,9 +620,9 @@ def build_library_info(lib_id, flowcell_label): if match: cycle = int(match.group()) else: - logging.error("problem tag slug is '%s'" % tag_slug) + add_error("problem tag slug is '%s'", tag_slug) else: - logging.warning("Multiple tags for LN%d", lib_info["number"]) + add_error("Multiple tags for LN%d", lib_info["number"]) def build_effector_info(effectortopool): eff = effectortopool["assemble_effector"] @@ -696,7 +702,7 @@ def match_notes(regex): "effector_assembly_qc": effector_assembly_qc, } - def sample_plate_wells(sample_info): + def sample_plate_wells(sample_info) -> "[dict]": def info_to_data(well_info): match = re.match(r"(.*) ([A-Z0-9]{2})", well_info["object_name"]) well_data = { @@ -716,8 +722,15 @@ def info_to_data(well_info): wells.append(well_data) return wells + def reverse_complement(bc: "Optional[str]") -> "Optional[str]": + if bc is None: + return None + lookup = {"A":"T", "T":"A", "C":"G", "G":"C"} + return "".join(lookup[c] for c in bc) - info = { + lenti_from_tc = extract_lenti_from_tc_notes(tc_info["notes"]) + lib_plate_wells = sample_plate_wells(lib_info) + deep_info = { "barcode": barcode, "barcode1": bc1, "barcode2": bc2, @@ -726,16 +739,104 @@ def info_to_data(well_info): "sample": "DS%d" % lib_info["sample_number"], "tc": "TC%d" % tc_info["number"], "tc_notes": tc_info["notes"], - "lentitale_from_tc_notes": extract_lenti_from_tc_notes(tc_info["notes"]), + "lentitale_from_tc_notes": lenti_from_tc, "cell_type": tc_info["sample_taxonomy__name"], + "library_plate_wells": lib_plate_wells, "sample_plate_wells": sample_plate_wells(sample_info), - "library_plate_wells": sample_plate_wells(lib_info), "project": project_info["name"], "flowcell": flowcell_label, "cycle": cycle, "effector_pools": pool_info, "pool": pool_name, } + + ##################################### + # Refine our data to match the spec # + ##################################### + seq_well_label = None + seq_well_plate = None + sample_well_label = None + sample_well_plate = None + tc_well_label = None + tc_well_plate = None + try: + seq_well_label = lib_plate_wells[0]["well_label"] + seq_well_plate = "PL%d" % lib_plate_wells[0]["plate_id"] + sample_well_label = lib_plate_wells[0]["well_parent"]["well_label"] + sample_well_plate = "PL%d" % lib_plate_wells[0]["well_parent"]["plate_id"] + tc_well_label = lib_plate_wells[0]["well_parent"]["well_parent"]["well_label"] + tc_well_plate = "PL%d" % lib_plate_wells[0]["well_parent"]["well_parent"]["plate_id"] + except Exception as e: + add_error("Could not find well info in %s", lib_plate_wells) + + if pool_info: + talen_name = None #TODO + else: + talens_str = lenti_from_tc["talen_name"] + # Sort to normalize for string comparison + if talens_str is not None: + talens = sorted([t.strip() for t in talens_str.split(",")]) + talen_name = ",".join(talens) + else: + talen_name = None + + lenti_qc_passed = lenti_from_tc["effector_assembly_qc"] is None + + if sample_info["time_point_unit"] == 5: + # harvest timepoint is in days + 
harvest_timepoint = float(sample_info["time_point"]) + else: + add_error("Sample timepoint unit unknown") + harvest_timepoint = None + + # def parse_talen_names_from_tc_notes(notes): + # def match_notes(regex): + # match = re.search(regex, notes, re.MULTILINE | re.IGNORECASE) + # if match is None: + # return None + # return match.group(1) + # talen_new = match_notes(r"Talen Number:\s*(.+?)\s*$") + # talen_orig = match_notes(r"Original TALE name:\s*(.+?)\s*$") + # if talen_new is not None and talen_orig is not None: + # # Sort to make matching easier + # split_new = [s.strip() for s in talen_new.split(",")] + # split_orig = [s.strip() for s in talen_new.split(",")] + # if len(split_new) != len(split_orig): + # log.warning("Length of new and old talens differ") + # break + # sorted_pairs = sorted(zip(split_new, split_orig)) + # talen_new = [p[0] for p in sorted_pairs] + # talen_orig = [p[1] for p in sorted_pairs] + # return (talen_orig, talen_new) + + # (talen_orig, talen_new) = parse_talen_names_from_tc_notes(tc_info["notes"]) + + info = { + "sequencing_barcode_well": seq_well_label, + "sequencing_barcode_plate": seq_well_plate, + "sample_well": sample_well_label, + "sample_plate": sample_well_plate, + "perturbation_well": tc_well_label, + "perturbation_plate": tc_well_plate, + "day": harvest_timepoint, + "talen_name": talen_name, + "cell_type": tc_info["sample_taxonomy__name"], + "cycle": cycle, + "tale_target_name": "TODO", + "tale_target_master_gene_id": "TODO", + "effector_purpose": "TODO", + "library_pool": pool_name, + "TC#": "TC%d" % tc_info["number"], + "DS#": "DS%d" % sample_info["number"], + "LN#": "LN%d" % lib_info["number"], + "TL#_original": lenti_from_tc["talen_name"], + "TL#_new": lenti_from_tc["talen_number"], + "sample_barcode": reverse_complement(bc2), + "lenti_qc_passed": True, + + "script_errors": errors, + "additional_information": deep_info, + } return info From eb838b022e30ec89610538aac717b874310aa3dc Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 31 Oct 2023 16:36:50 -0700 Subject: [PATCH 107/172] Bump to version 1.0.0-alpha5 Getting close --- processes/altcode/process_altcode.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index f497e523..fdc85252 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha4 +version=1.0.0-alpha5 cd "$(dirname "$0")" # Temporarily hardcoded! 
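A small aside on the talen-name normalization in the metadata patch above: a sketch of the ordering it aims for, assuming well-formed "TL<number>" names (the alpha7 patch later in this series relaxes the parsing for malformed names):

    def sort_talens(talens):
        # Order "TL<number>" names numerically so the comma-joined string is
        # stable regardless of the order the names arrive in.
        return sorted(talens, key=lambda tl: int(tl[2:]))

    # ",".join(sort_talens(["TL12", "TL3", "TL7"]))  ->  "TL3,TL7,TL12"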
From 2e57cb147f1eadc0a531b34c2f910e4cee92846e Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 2 Nov 2023 14:09:53 -0700 Subject: [PATCH 108/172] altcode metadata: Fix barcode field --- scripts/poolprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 067d839d..65a69110 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -726,7 +726,7 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]": if bc is None: return None lookup = {"A":"T", "T":"A", "C":"G", "G":"C"} - return "".join(lookup[c] for c in bc) + return "".join(reversed([lookup[c] for c in bc])) lenti_from_tc = extract_lenti_from_tc_notes(tc_info["notes"]) lib_plate_wells = sample_plate_wells(lib_info) From 1b73daa62ed0760fc5da2bbe4bc93fde8f8ad282 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 26 Nov 2023 13:38:42 -0800 Subject: [PATCH 109/172] Add locus info --- scripts/poolprocess.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 65a69110..894dadee 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -22,6 +22,12 @@ LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +# Keys to copy directly from the api/locus/ endpoint +LOCUS_KEYS = ["genome_label", "chromosome_name", "genes", "name", + "genomic_feature", "genomic_coordinate_genome_label", + "genomic_coordinate_chromosome_name", "genomic_coordinate_start", + "genomic_coordinate_end"] + STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') SCRIPT_OPTIONS = { @@ -661,10 +667,22 @@ def build_effector_info(effectortopool): pool_info = [] for effector_pool in tc_info["effector_pools"]: effector_pool_info = self.api_single_result(url=effector_pool["url"]) + loci_info = [] + if effector_pool_info.get("loci", False): + for locus_url in effector_pool_info["loci"]: + locus_info = self.api_single_result(url=locus_url) + locus_dict = { + "label": locus_info.get("object_label"), + } + for key in LOCUS_KEYS: + locus_dict[key] = locus_info.get(key, None) + loci_info.append(locus_dict) + pool_info.append({ "effector_pool": effector_pool_info["object_name"], "name": effector_pool_info["name"], "purpose": effector_pool_info["purpose__name"], + "loci": loci_info, "effectors": [ build_effector_info(efftopool) for efftopool in effector_pool_info["effectortopool_set"] From 12fc6b6cfc960028550b6b97311238e2214751d4 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 Jan 2024 16:19:03 -0800 Subject: [PATCH 110/172] altcode: verify cram copy completion --- processes/altcode/process_altcode.bash | 2 ++ 1 file changed, 2 insertions(+) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index fdc85252..59553fee 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -92,6 +92,8 @@ if [[ -e "$status_file" ]] ; then mv "$status_file" "$old_status_file" fi +samtools quickcheck "$outdir/Aligned.out.cram" + # TODO: What else do we want to capture here? It would be nice to at least # capture the command used and relevant env vars echo | jq . 
> "$status_file" < Date: Tue, 9 Jan 2024 16:20:19 -0800 Subject: [PATCH 111/172] altcode: fix issue with double-copy --- processes/altcode/altcode.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index 626be795..c245aeef 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -70,7 +70,7 @@ def pos_to_str(start, length) { /// This process creates the Aligned.out.cram file and STARsolo analysis results process STAR_solo { - publishDir params.outdir, mode: "copy" + publishDir params.outdir, mode: "copy", saveAs: { f -> f.name == "Solo.out" ? null : f } cpus 30 memory "80 GB" //scratch false From b8a6b3ef4d69572d6d8e5c1ddc5808aa333d0019 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 Jan 2024 16:20:51 -0800 Subject: [PATCH 112/172] altcode: improve metadata talen information --- scripts/poolprocess.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 894dadee..1cf92c22 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -632,12 +632,16 @@ def add_error(fmt, *args): def build_effector_info(effectortopool): eff = effectortopool["assemble_effector"] + talen_number = eff["talen_sheet_number"] + talen = "TL%s" % talen_number if talen_number else None + return { "chromosome": eff["chromosome"], "start": eff["start"], "end": eff["end"], "strand": eff["strand"], "working_name": eff["working_name"], + "talen": talen, "n_terminus": { "name": eff["effector__n_terminus__name"], @@ -787,13 +791,30 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]": except Exception as e: add_error("Could not find well info in %s", lib_plate_wells) + def sort_talens(tls): + """ Sort talens by number """ + def get_num(tl): + assert tl[:2] == "TL" + return int(tl[2:]) + return sorted(tls, key=get_num) + if pool_info: - talen_name = None #TODO + #talen_name = None #TODO + talen_names = [] + for pool in deep_info["effector_pools"]: + for effector in pool.get("effectors", []): + if effector["talen"]: + talen_names.append(effector["talen"]) + + talen_name = ",".join(sort_talens(talen_names)) + orig_talen_name = talen_name + else: talens_str = lenti_from_tc["talen_name"] + orig_talen_name = lenti_from_tc["talen_name"] # Sort to normalize for string comparison if talens_str is not None: - talens = sorted([t.strip() for t in talens_str.split(",")]) + talens = sort_talens([t.strip() for t in talens_str.split(",")]) talen_name = ",".join(talens) else: talen_name = None @@ -847,7 +868,7 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]": "TC#": "TC%d" % tc_info["number"], "DS#": "DS%d" % sample_info["number"], "LN#": "LN%d" % lib_info["number"], - "TL#_original": lenti_from_tc["talen_name"], + "TL#_original": orig_talen_name, "TL#_new": lenti_from_tc["talen_number"], "sample_barcode": reverse_complement(bc2), "lenti_qc_passed": True, From 715017a84c298fa4af0d5a1800bfb979d6ceb154 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 Jan 2024 16:21:42 -0800 Subject: [PATCH 113/172] altcode: retain unmapped reads in CRAM file --- processes/altcode/altcode.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index c245aeef..faf11fd2 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -128,6 +128,7 @@ process STAR_solo { --limitBAMsortRAM "${bam_sort_RAM}" \ --outSAMtype BAM Unsorted \ 
--outSAMattributes NH HI AS nM CR CY UR UY sM \ + --outSAMunmapped Within \ --outBAMcompression 0 \ --outBAMsortingThreadN "${num_threads}" \ --readFilesCommand zcat \ From 4212fd306b19c57b53010420bcd0d8b478c1fe73 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 Jan 2024 16:22:42 -0800 Subject: [PATCH 114/172] altcode: alpha version bump --- processes/altcode/process_altcode.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 59553fee..9519c6fd 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha5 +version=1.0.0-alpha6 cd "$(dirname "$0")" # Temporarily hardcoded! From 6278fb9602f32df6acc30eec9f9ce7fd2f8defd2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 16 Jan 2024 15:35:45 -0800 Subject: [PATCH 115/172] Fixes for altcode alpha6 --- processes/altcode/altcode.nf | 20 +++++++++----------- processes/altcode/process_altcode.bash | 1 + 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index faf11fd2..c887fbcb 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -9,6 +9,7 @@ params.metadata = "" /// Processes a single sample workflow { + // This function finds the matrix.gz files that we want to convert to h5ad def find_matrices_in_dir = { m, dir -> def mtx_files = [] dir.eachFileRecurse { f -> @@ -70,7 +71,7 @@ def pos_to_str(start, length) { /// This process creates the Aligned.out.cram file and STARsolo analysis results process STAR_solo { - publishDir params.outdir, mode: "copy", saveAs: { f -> f.name == "Solo.out" ? null : f } + publishDir params.outdir, mode: "copy" cpus 30 memory "80 GB" //scratch false @@ -88,6 +89,7 @@ process STAR_solo { output: tuple(val(meta), path("Aligned.out.cram*"), emit: cram) + tuple(val(meta), path("Solo.out/???**"), emit: solo_files) tuple(val(meta), path("Solo.out"), emit: solo_analysis) @@ -107,10 +109,7 @@ process STAR_solo { num_threads = 30 """ - set -o monitor - mkfifo Aligned.out.bam - - (STAR \ + STAR \ --soloCellReadStats Standard \ --clip3pAdapterSeq AAAAAAAAAA \ --clip3pAdapterMMp 0.1 \ @@ -133,9 +132,8 @@ process STAR_solo { --outBAMsortingThreadN "${num_threads}" \ --readFilesCommand zcat \ --outFileNamePrefix ./ \ - --limitOutSJcollapsed 5000000 \ - || kill 0) & - + --limitOutSJcollapsed 5000000 + samtools sort \ --reference "${genome_fasta}" \ -o Aligned.out.cram \ @@ -145,7 +143,6 @@ process STAR_solo { -T "tmpsort" \ Aligned.out.bam - wait rm Aligned.out.bam compress_mtx_files.sh ./Solo.out "${num_threads}" """ @@ -154,7 +151,7 @@ process STAR_solo { process convert_to_h5ad { cpus 1 memory "10 GB" - publishDir params.outdir, mode: "copy", saveAs: {f -> "$out_dir/$f"} + publishDir params.outdir, mode: "copy" input: tuple(val(meta), path(metadata_file), path(matrix), path(barcodes), path(features), val(out_dir)) @@ -163,10 +160,11 @@ process convert_to_h5ad { tuple(val(meta), path(out_file)) shell: - out_file = "${matrix.simpleName}.h5ad" + out_file = "${out_dir}/${matrix.simpleName}.h5ad" // scanpy requires specific file names ''' mkdir -p tmp + mkdir -p "!{out_dir}" cp "!{matrix}" tmp/matrix.mtx.gz cp "!{barcodes}" tmp/barcodes.tsv.gz cp "!{features}" tmp/features.tsv.gz diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 9519c6fd..7a5f0e70 100755 --- 
a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -32,6 +32,7 @@ module load jdk/11.0.16 module load nextflow/22.04.3 module load python/3.5.1 module load apptainer/1.1.2 +module load samtools/1.14 export NXF_VER=23.04.2 From 02abd7b6c2604c4a9b58fe688528ef2e1b0ce8d9 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 22 Jan 2024 16:19:24 -0800 Subject: [PATCH 116/172] Fix miniseq automatic processing --- nextflow.config | 1 + processes/altseq/altseq.nf | 10 ++++++++-- processes/altseq/bin/analyze.py | 2 +- processes/altseq/process_altseq.bash | 2 +- scripts/altcode/upload_fastq.py | 2 ++ scripts/flowcells/setup.sh | 29 +++++++++++++++++++++++++--- scripts/lims/upload_data.py | 2 +- 7 files changed, 40 insertions(+), 8 deletions(-) diff --git a/nextflow.config b/nextflow.config index 61343c28..3cf78454 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,6 +83,7 @@ profiles { enabled = true fixOwnership = true temp = 'auto' + docker.runOptions = '-u $(id -u):$(id -g)' } } apptainer { diff --git a/processes/altseq/altseq.nf b/processes/altseq/altseq.nf index be8fefd7..38f82c1d 100644 --- a/processes/altseq/altseq.nf +++ b/processes/altseq/altseq.nf @@ -9,6 +9,7 @@ include { sort_and_encode_cram } from "../../modules/cram.nf" params.sample_config_tsv = "" params.input_directory = "" params.star_exe = "${workflow.projectDir}/../../third_party/STAR" +params.samtools_exe = "/net/module/sw/samtools/1.14/samtools-1.14/bin/samtools" params.outdir = "output" params.publishmode = "link" @@ -115,6 +116,7 @@ workflow ALTSEQ { genome_dir, genome_fa, params.star_exe, + params.samtools_exe, barcode_whitelist, merged_fq_files, ) @@ -168,6 +170,7 @@ workflow { println "Running test workflow..." def star_exe = file("${workflow.projectDir}/../../third_party/STAR") + def samtools_exe = file("/net/module/sw/samtools/1.14/samtools-1.14/bin/samtools") def genome_dir = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/") def genome_fa = file("/net/seq/data2/projects/prime_seq/cell_ranger_ref/GRCh38-2022-Altius-gencode.v39-build/Homo_sapiens.GRCh38.dna.primary_assembly.fa.modified") def barcode_whitelist = file("/net/seq/data2/projects/prime_seq/barcodes-combined.txt") @@ -190,6 +193,7 @@ process align { path genome_dir path reference_fa path star_exe + path samtools_exe path barcode_whitelist tuple val(meta), path(fq1), path(fq2) @@ -237,7 +241,9 @@ process align { --outTmpDir "$tmpdir/STARSolo" \ & # Launch in background, so we can convert to cram from pipe - samtools sort \ + # TODO: Do not hardcode this! 
Currently, though + # if we run without hardcoding, it cannot find samtools + "./!{samtools_exe}" sort \ --reference "!{reference_fa}" \ --output-fmt-option "!{cram_fmt_options}" \ --threads "!{cpus}" \ @@ -335,7 +341,7 @@ process create_sample_configs { process merge_stats { scratch false - executor "local" + //executor "local" input: path("input.???.json") output: diff --git a/processes/altseq/bin/analyze.py b/processes/altseq/bin/analyze.py index 1cd192f2..31bd661f 100755 --- a/processes/altseq/bin/analyze.py +++ b/processes/altseq/bin/analyze.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import csv diff --git a/processes/altseq/process_altseq.bash b/processes/altseq/process_altseq.bash index 2f5066ef..a8b1c0f8 100644 --- a/processes/altseq/process_altseq.bash +++ b/processes/altseq/process_altseq.bash @@ -47,7 +47,7 @@ python "$STAMPIPES"/scripts/lims/create_altseq_sample_config.py processing.json SEQ_DIR=$(ls -d -1 ${SEQUENCER_MOUNT}/*$FLOWCELL* | head -n1) -GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome_2022_gencode.v39/ +GENOME_DIR=/net/seq/data2/projects/prime_seq/cell_ranger_ref/star_2.7.10_genome/ GENOME_FA=/net/seq/data2/projects/prime_seq/cell_ranger_ref/refdata-gex-GRCh38-2020-A/fasta/genome.fa BARCODE_WHITELIST=/net/seq/data2/projects/prime_seq/barcodes-combined.txt diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index 8648f9ea..f3e11c9a 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -467,6 +467,8 @@ def get_lane_ids(self, lane_id): def extract_id_from_url(url): return re.sub(r'[^\d]', "", url) lane_info = self.get_by_id("flowcell_lane", int(lane_id)) + logging.info("lane %s info:\n%s", lane_id, lane_info) + assert lane_info["library_pool"] is not None, "library_pool for lane %s must not be None" % lane_id pool_info = self.api.get_single_result(url=lane_info["library_pool"]) lib_ids = [] flowcell_id = extract_id_from_url(lane_info["flowcell"]) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 7a7b25a0..900877e1 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -140,7 +140,29 @@ fi # placeholder make_miniseq_samplesheet(){ -sleep 10 + name=Stamlab + date=$(date '+%m/%d/%Y') + cat <<__SHEET__ +[Header] +Investigator Name,$name +Project Name,$name +Experiment Name,$name +Date,$date +Workflow,GenerateFASTQ + +[Settings] + +[Data] +SampleID,SampleName,index,index2 +none,none,GGGGGGGG,GGGGGGGG +__SHEET__ + +if [ -z "$demux" ] ; then + # This bit of cryptic magic generates the samplesheet part. 
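+  # (In plain terms: for every non-failed library in processing.json, emit a
+  #  "SampleID,SampleName,index,index2" row, with both IDs set to the samplesheet_name;
+  #  the sed step then splits a combined dual-index "SEQ1-SEQ2," barcode into separate
+  #  index and index2 columns.)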
+ jq -r '.libraries[] | select(.failed == false) | [.samplesheet_name,.samplesheet_name,.barcode_index,""] | join(",") ' "$json" \ + | sed 's/\([ACTG]\+\)-\([ACTG]\+\),$/\1,\2/' # Changes dual-index barcodes to proper format +fi + } # placeholder @@ -412,8 +434,7 @@ _U_ bc_flag="--miniseq" queue="queue0" minidemux="True" - # placeholder - cp /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv SampleSheet.csv + make_miniseq_samplesheet > SampleSheet.csv bcl_tasks=1 set +e read -d '' unaligned_command << _U_ @@ -436,6 +457,7 @@ _U_ minidemux="True" # placeholder cat /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv > SampleSheet.csv + #make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 set +e read -d '' unaligned_command << _U_ @@ -675,6 +697,7 @@ rsync -avP "$illumina_dir/InterOp" "$analysis_dir/" rsync -avP "$illumina_dir/RunInfo.xml" "$analysis_dir/" rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" + # Copy each sample by itself, checking to see if we have a project_share_directory set # This is very important to keep customer data separate from internal data. ( diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index cfcdf21e..6b61a468 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -638,7 +638,7 @@ def upload_barcode_report(self, barcode_file): if jsondata['Sequencer'] == 'MiniSeq': print(jsondata['BaseDir']) - flowcell_label = re.search( '.*_[AB]000([A-Z0-9]{6}).*$', jsondata['BaseDir'] ).group(1) + flowcell_label = re.search( '.*_[AB](000[A-Z0-9]{6}).*$', jsondata['BaseDir'] ).group(1) print(flowcell_label) else: From 37adb2994d145e48927ee8aac5f3145da5be9369 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 1 Feb 2024 13:11:11 -0800 Subject: [PATCH 117/172] Altcode alpha7: more lenient cell barcode strategy Also fixes an issue where malformed TALEN names would block processing. --- processes/altcode/altcode.nf | 3 ++- processes/altcode/process_altcode.bash | 2 +- scripts/poolprocess.py | 10 +++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf index c887fbcb..937600a1 100644 --- a/processes/altcode/altcode.nf +++ b/processes/altcode/altcode.nf @@ -91,6 +91,7 @@ process STAR_solo { tuple(val(meta), path("Aligned.out.cram*"), emit: cram) tuple(val(meta), path("Solo.out/???**"), emit: solo_files) tuple(val(meta), path("Solo.out"), emit: solo_analysis) + tuple(val(meta), path("Log*"), emit: logs) script: @@ -119,7 +120,7 @@ process STAR_solo { --soloCBposition "${bc3_position}" "${bc2_position}" "${bc1_position}" \ --soloCBwhitelist "${r3_barcodes}" "${r2_barcodes}" "${r1_barcodes}" \ --soloUMIposition "${umi_position}" \ - --soloCBmatchWLtype 1MM \ + --soloCBmatchWLtype EditDist_2 \ --soloUMIdedup 1MM_All \ --soloFeatures Gene GeneFull SJ GeneFull_Ex50pAS GeneFull_ExonOverIntron \ --soloMultiMappers Unique PropUnique Uniform Rescue EM \ diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 7a5f0e70..973697d0 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0-alpha6 +version=1.0.0-alpha7 cd "$(dirname "$0")" # Temporarily hardcoded! 
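On the --soloCBmatchWLtype change above: 1MM tolerates only a single substitution against the whitelist, while EditDist_2 (presumably applied per barcode component for these three-part cell barcodes) accepts anything within edit distance two, including indels. Below is a textbook sketch of that metric, not STARsolo's implementation:

    def edit_distance(a, b):
        # Standard dynamic-programming Levenshtein distance: substitutions,
        # insertions and deletions each cost 1.
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
            prev = cur
        return prev[-1]

    # edit_distance("ACGTACGT", "ACGAACGT") == 1   (one substitution, 1MM territory)
    # edit_distance("ACGTACGT", "ACTACGAT") == 2   (a deletion plus an insertion)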
diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py
index 1cf92c22..8cac6b34 100644
--- a/scripts/poolprocess.py
+++ b/scripts/poolprocess.py
@@ -794,8 +794,12 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]":
         def sort_talens(tls):
             """ Sort talens by number """
             def get_num(tl):
-                assert tl[:2] == "TL"
-                return int(tl[2:])
+                match = re.search(r"TL(\d+)", tl)
+                if match:
+                    return int(match.group(1))
+                else:
+                    logging.warning("Weird talen: '%s'" % tl)
+                    return 0
             return sorted(tls, key=get_num)

From 0d4caaca1575e0286e7b77bf5eaa54d28ae1e0e1 Mon Sep 17 00:00:00 2001
From: Jemma Nelson
Date: Sun, 11 Feb 2024 10:17:36 -0800
Subject: [PATCH 118/172] altcode - alpha8: add CB & UB output

---
 processes/altcode/altcode.nf           | 18 +++++++++---------
 processes/altcode/process_altcode.bash |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/processes/altcode/altcode.nf b/processes/altcode/altcode.nf
index 937600a1..5ab4112a 100644
--- a/processes/altcode/altcode.nf
+++ b/processes/altcode/altcode.nf
@@ -126,25 +126,25 @@ process STAR_solo {
         --soloMultiMappers Unique PropUnique Uniform Rescue EM \
         --runThreadN "${num_threads}" \
         --limitBAMsortRAM "${bam_sort_RAM}" \
-        --outSAMtype BAM Unsorted \
-        --outSAMattributes NH HI AS nM CR CY UR UY sM \
+        --outSAMtype BAM SortedByCoordinate \
+        --outSAMattributes NH HI AS nM CR CY UR UY sM CB UB \
         --outSAMunmapped Within \
-        --outBAMcompression 0 \
+        --outBAMcompression 1 \
         --outBAMsortingThreadN "${num_threads}" \
         --readFilesCommand zcat \
         --outFileNamePrefix ./ \
         --limitOutSJcollapsed 5000000
-    samtools sort \
+    samtools view \
         --reference "${genome_fasta}" \
-        -o Aligned.out.cram \
         --output-fmt-option "version=3.0,level=7" \
         --threads "${num_threads}" \
-        --write-index \
-        -T "tmpsort" \
-        Aligned.out.bam
+        -o Aligned.out.cram \
+        Aligned.sortedByCoord.out.bam
+    rm Aligned.sortedByCoord.out.bam
+
+    samtools index -@ "${num_threads}" Aligned.out.cram
-    rm Aligned.out.bam
     compress_mtx_files.sh ./Solo.out "${num_threads}"
     """
 }
diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash
index 973697d0..19694a0e 100755
--- a/processes/altcode/process_altcode.bash
+++ b/processes/altcode/process_altcode.bash
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -eo pipefail
-version=1.0.0-alpha7
+version=1.0.0_alpha8
 cd "$(dirname "$0")"
 # Temporarily hardcoded!

From 80b4682da15a44cc9c0cb354a04ffe128fcd7cdb Mon Sep 17 00:00:00 2001
From: Jemma Nelson
Date: Tue, 12 Mar 2024 11:11:57 -0700
Subject: [PATCH 119/172] Add support for NovaSeq X 1.5B

---
 scripts/flowcells/setup.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh
index 900877e1..0f8d0217 100644
--- a/scripts/flowcells/setup.sh
+++ b/scripts/flowcells/setup.sh
@@ -349,6 +349,20 @@ case $run_type in
     bcl_tasks=1
     unaligned_command=$novaseq_bcl_command
+;;
+"NovaSeq X 1.5B")
+    echo "NovaSeq X: 1.5B"
+    unset demux
+    parallel_env="-pe threads 6"
+    link_command=$novaseq_link_command
+    samplesheet="SampleSheet.csv"
+    fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync!
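+    # (rsync semantics: "fastq" copies the directory itself into the destination,
+    #  whereas "fastq/" would copy only its contents.)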
+ bc_flag="--novaseq" + queue="hpcz-2" + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + bcl_tasks=1 + unaligned_command=$novaseq_bcl_command + ;; "Novaseq 6000 SP") echo "Novaseq 6000: SP (non-pooled)" From 936e36bdb7a10643060bf9ce5f71023dc305e7c9 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:15:33 -0700 Subject: [PATCH 120/172] Improve fastq linking for LP samples --- scripts/flowcells/link_nextseq.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 295119b2..0ee5171f 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -74,11 +74,18 @@ def create_links( If dry_run is passed, will print them instead of creating them """ - # Skip processing the lane if it's not getting aligned - if not lane.get("alignments"): - return False - sample_name = lane["alignments"][0]["sample_name"] short_name = lane["samplesheet_name"] + if lane.get("alignments"): + sample_name = lane["alignments"][0]["sample_name"] + else: + bc1 = lane["barcode1"]["sequence"] if lane.get("barcode1") else "" + bc2 = lane["barcode2"]["sequence"] if lane.get("barcode2") else "" + lane_num = int(lane["lane"]) + sample_name = "%s_%s_%s_L%03d" % (short_name, bc1, bc2, lane_num) + + + if lane.get("library_pool"): + is_pool = True if undetermined: output_dir = os.path.join( @@ -107,13 +114,14 @@ def create_links( # sense in our system) input_fastq = sorted(glob.glob(input_wildcard)) + logging.debug("Looking for %s", input_wildcard) for idx, input_file in enumerate(input_fastq, start=1): output_name = "%s_%s_%03d.fastq.gz" % (sample_name, read, idx) output_file = os.path.join(output_dir, output_name) rel_path = os.path.relpath(input_file, output_dir) - print("Linking %s => %s" % (rel_path, output_file)) + logging.info("Linking %s => %s" % (rel_path, output_file)) if not dry_run and not os.path.exists(output_file): os.symlink(rel_path, output_file) From 207502439cd7a115e33e9f1ae9aa02b9fc510f9c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:18:46 -0700 Subject: [PATCH 121/172] Collation works for LPs --- scripts/altcode/upload_fastq.py | 12 ++++++----- scripts/apilaneprocess.py | 35 ++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index f3e11c9a..b37199d2 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -481,12 +481,14 @@ def extract_id_from_url(url): lane_info["lane"], ) lane_info = self.get_list_result(lanes_query) - lanes_in_pool = [] + lanes_in_pool = set() + lanes_in_pool.add(int(lane_id)) for l in lane_info: - library_id = extract_id_from_url(l["library"]) - if library_id in lib_ids: - lanes_in_pool.append(l["id"]) - return lanes_in_pool + if l.get("library"): + library_id = extract_id_from_url(l["library"]) + if library_id in lib_ids: + lanes_in_pool.add(l["id"]) + return list(lanes_in_pool) #def upload_flowcell_report(self, data): diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index 670bfdb3..da1e0490 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -156,19 +156,27 @@ def setup_lane(self, lane_id): processing_info = self.get_lane_process_info(lane_id) pool_name = None - try: - lib_number = processing_info["libraries"][0]["library"] - library_info = 
self.api.get_single_result(url_addition="library/?number=%d" % lib_number)["results"][0] - logging.debug("Info is %s", library_info) - pools = library_info["librarypools"] - if pools: - pool_name = pools[0]["object_name"] - pool_id = pools[0]["id"] - logging.debug("Lane %d is pool %s", lib_number, pool_name) - else: - logging.debug("Lane %d is not pool", lib_number) - except: - pass + + if (len(processing_info.get("libraries", [])) == 1 + and processing_info["libraries"][0].get("samplesheet_name",'').startswith("LP")): + pool_name = processing_info['libraries'][0]['samplesheet_name'] + pool_number = int(pool_name[2:]) # remove leading LP + pool_data = self.api.get_single_result(url_addition="library_pool/?number=%d" % pool_number)["results"][0] + pool_id = pool_data["id"] + else: + try: + lib_number = processing_info["libraries"][0]["library"] + library_info = self.api.get_single_result(url_addition="library/?number=%d" % lib_number)["results"][0] + logging.debug("Info is %s", library_info) + pools = library_info["librarypools"] + if pools: + pool_name = pools[0]["object_name"] + pool_id = pools[0]["id"] + logging.debug("Lane %d is pool %s", lib_number, pool_name) + else: + logging.debug("Lane %d is not pool", lib_number) + except: + pass global POOL_INFO if pool_name and pool_name not in POOL_INFO: @@ -215,7 +223,6 @@ def create_script(self, processing_info, pool=None): if not "directory" in lane: logging.critical("No directory for lane %d" % lane["id"]) return False - fastq_directory = lane["directory"] alt_dir = lane.get("project_share_directory", "") if alt_dir: From 327bf794d2c16be1229237e5b98a43099ce2eda8 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:22:15 -0700 Subject: [PATCH 122/172] poolprocess.py minimally works again --- scripts/poolprocess.py | 131 ++++++++++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 42 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 8cac6b34..42523860 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,5 +1,6 @@ #import csv import argparse +import functools import json import logging import os @@ -19,6 +20,8 @@ POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} +LANES_WITH_DIRECT_POOL = {} + LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -131,6 +134,7 @@ def __init__(self, args, api_url, token): self.pool = ThreadPoolExecutor(max_workers=10) + @functools.lru_cache(maxsize=None) def api_single_result(self, url_addition=None, url=None): if url_addition: @@ -146,6 +150,7 @@ def api_single_result(self, url_addition=None, url=None): logging.error(request) return None + @functools.lru_cache(maxsize=None) def api_list_result(self, url_addition=None, url=None): more = True @@ -257,7 +262,8 @@ def setup_flowcell(self, flowcell_label): logging.debug("align ids: %s", align_ids) #alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) - self.setup_alignments(align_ids) + # Disable parallelism so that caching works + self.setup_alignments(align_ids, parallel=False) self.add_stats_upload(flowcell_label) def get_alignment_ids(self, flowcell_label: str) -> [int]: @@ -268,12 +274,15 @@ def get_alignment_ids(self, flowcell_label: str) -> [int]: """ def extract_id_from_url(url): + if url is None: + return None return int(re.findall(r'\d+', url)[-1]) 
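            # e.g. extract_id_from_url("https://lims.example.org/api/library_pool/123/") -> 123
            # (illustrative URL; the helper just takes the last run of digits)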
# Storage for the 3 layers of mapping between alignments and pools global POOL_KEY_TO_LIB_IDS global LIB_ID_TO_LANE_IDS global LANE_ID_TO_ALN_IDS + global LANES_WITH_DIRECT_POOL POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} @@ -283,9 +292,32 @@ def extract_id_from_url(url): for lane in self.api_list_result("flowcell_lane/?flowcell__label=%s&page_size=1000" % flowcell_label): lib_url = lane['library'] lane_lane = lane['lane'] - library_info.add((lib_url, lane_lane)) - lib_id = extract_id_from_url(lib_url) - LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) + if lib_url is not None: + library_info.add((lib_url, lane_lane)) + lib_id = extract_id_from_url(lib_url) + LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) + else: + # HACKS BELOW + # Get pool manually + pool_url = lane['library_pool'] + pool_id = extract_id_from_url(pool_url) + LANES_WITH_DIRECT_POOL[lane['id']] = pool_id + pool_key = (pool_id, lane_lane) + pool_number = int(lane['library_pool__number']) + + # Get Library info + lp_info = self.api_single_result(url=pool_url) + sl_info = self.api_single_result(url=lp_info['sublibrary']) + cl_info = self.api_single_result(url=sl_info['cell_library']) + lib_ids = [extract_id_from_url(lib_url) for lib_url in cl_info["libraries"]] + + for lib_url in cl_info["libraries"]: + library_info.add((lib_url, lane_lane)) + + for lib_id in lib_ids: + POOL_KEY_TO_LIB_IDS[pool_key].append(lib_id) + LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) + # Set of poolnums + lane pool_info = set() @@ -297,9 +329,14 @@ def extract_id_from_url(url): POOL_KEY_TO_LIB_IDS[key].append(lib_info['id']) all_alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + + direct_alns = set() for aln in all_alignments: lane_id = extract_id_from_url(aln['lane']) LANE_ID_TO_ALN_IDS[lane_id].append(aln['id']) + if lane_id in LANES_WITH_DIRECT_POOL: + direct_alns.add(aln['id']) + # Find the minimum alignment ID for each pool/lane combination lowest_aln_for_pool = {pool_key: None for pool_key in POOL_KEY_TO_LIB_IDS.keys()} @@ -313,11 +350,12 @@ def extract_id_from_url(url): if cur_aln is None or cur_aln > aln_id: lowest_aln_for_pool[pool_key] = aln_id + align_ids = set(lowest_aln_for_pool.values()).union( direct_alns ) logging.debug("POOL_KEY_TO_LIB_IDS %s", POOL_KEY_TO_LIB_IDS) logging.debug("LIB_ID_TO_LANE_IDS %s", LIB_ID_TO_LANE_IDS) logging.debug("LANE_ID_TO_ALN_IDS %s", LANE_ID_TO_ALN_IDS) - logging.debug("ALN IDS %s", lowest_aln_for_pool.values()) - return list(lowest_aln_for_pool.values()) + logging.debug("ALN IDS %s", align_ids) + return list(align_ids) @@ -420,12 +458,15 @@ def create_script(self, processing_info, align_id): logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) return False - lib_info_response = self.api_single_result("library/?number=%d" % lane["library"])["results"] - assert len(lib_info_response) == 1 - lib_info = lib_info_response[0] - logging.debug("lib info is %s", lib_info) - pool_name = lib_info["librarypools"][0]["object_name"] - logging.debug("pool is %s", pool_name) + if lane.get('library'): + lib_info_response = self.api_single_result("library/?number=%d" % int(lane["library"]))["results"] + assert len(lib_info_response) == 1 + lib_info = lib_info_response[0] + logging.debug("lib info is %s", lib_info) + pool_name = lib_info["librarypools"][0]["object_name"] + logging.debug("pool is 
%s", pool_name) + else: + pool_name = lane['samplesheet_name'] fastq_directory = os.path.join(flowcell_directory, "Project_%s" % lane['project'], "LibraryPool_%s" % pool_name) @@ -465,9 +506,9 @@ def create_script(self, processing_info, align_id): env_vars["GENOME"] = alignment['genome_index'] env_vars["ASSAY"] = lane['assay'] env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] - if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0]['library_kit_method']: + try: env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' - else: + except: env_vars["LIBRARY_KIT"] = None if processing_info['flowcell']['paired_end']: @@ -575,8 +616,8 @@ def get_libraries_in_pool(alignment_id): libs_with_align = set() for (lib_id, lane_ids) in LIB_ID_TO_LANE_IDS.items(): if align_lane_id in lane_ids: - libs_with_align.add(lib_id), "Lane must have exactly 1 library" - assert len(libs_with_align) == 1 + libs_with_align.add(lib_id) + #assert len(libs_with_align) == 1, "Lane must have exactly 1 library" align_lib_id = libs_with_align.pop() pools_with_align = set() @@ -584,7 +625,7 @@ def get_libraries_in_pool(alignment_id): if align_lib_id in lib_ids: pools_with_align.add(pool_key) # TODO: This is broken because the pool can be in more than one lane!!! - assert len(pools_with_align) == 1, "Lib must have exactly one pool" + #assert len(pools_with_align) == 1, "Lib must have exactly one pool" align_poolkey = pools_with_align.pop() logging.debug("Alignment ALN%d - poolkey %s", alignment_id, align_poolkey) @@ -611,8 +652,7 @@ def add_error(fmt, *args): barcode += bc2 sample_info = self.api_single_result(url=lib_info["sample"]) - tc_info = self.api_single_result(url=sample_info["tissue_culture"]) - project_info = self.api_single_result(url=sample_info["project"]) + project_info = self.api_single_result(url=sample_info["project"]) if sample_info.get("project") else {"name": None} taggedobject_infos = self.api_list_result("tagged_object/?object_id=%d&content_type=%d" % (lib_info["id"], lib_info["object_content_type"])) @@ -669,29 +709,35 @@ def build_effector_info(effectortopool): } pool_info = [] - for effector_pool in tc_info["effector_pools"]: - effector_pool_info = self.api_single_result(url=effector_pool["url"]) - loci_info = [] - if effector_pool_info.get("loci", False): - for locus_url in effector_pool_info["loci"]: - locus_info = self.api_single_result(url=locus_url) - locus_dict = { - "label": locus_info.get("object_label"), - } - for key in LOCUS_KEYS: - locus_dict[key] = locus_info.get(key, None) - loci_info.append(locus_dict) - - pool_info.append({ - "effector_pool": effector_pool_info["object_name"], - "name": effector_pool_info["name"], - "purpose": effector_pool_info["purpose__name"], - "loci": loci_info, - "effectors": [ - build_effector_info(efftopool) - for efftopool in effector_pool_info["effectortopool_set"] - ], - }) + # Dummy info in case we can't get it from LIMS + tc_info = {"notes": "", "number": 0, "sample_taxonomy__name": ""} + try: + tc_info = self.api_single_result(url=sample_info["tissue_culture"]) + for effector_pool in tc_info["effector_pools"]: + effector_pool_info = self.api_single_result(url=effector_pool["url"]) + loci_info = [] + if effector_pool_info.get("loci", False): + for locus_url in effector_pool_info["loci"]: + locus_info = self.api_single_result(url=locus_url) + locus_dict = { + "label": locus_info.get("object_label"), + } + for key in LOCUS_KEYS: + locus_dict[key] = 
locus_info.get(key, None) + loci_info.append(locus_dict) + + pool_info.append({ + "effector_pool": effector_pool_info["object_name"], + "name": effector_pool_info["name"], + "purpose": effector_pool_info["purpose__name"], + "loci": loci_info, + "effectors": [ + build_effector_info(efftopool) + for efftopool in effector_pool_info["effectortopool_set"] + ], + }) + except: + add_error("Could not get effector information for sample DS%s", sample_info['number']) def extract_lenti_from_tc_notes(notes): def match_notes(regex): @@ -885,6 +931,7 @@ def get_num(tl): flowcell_label = "FC%s" % processing_info["flowcell"]["label"] libraries = [] + for lib_id in lib_ids: libraries.append(build_library_info(lib_id, flowcell_label)) From 256fa527507136aaabb496d62c9c6ca78c7f536c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:23:07 -0700 Subject: [PATCH 123/172] upload_data - improve flowcell regex detection --- scripts/lims/upload_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 6b61a468..c33bb609 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -643,7 +643,7 @@ def upload_barcode_report(self, barcode_file): else: # make this more flexible eventually - flowcell_label = re.search( '.*_[AB]([A-Z0-9]{7}X[A-Z0-9])$', jsondata['BaseDir'] ).group(1) + flowcell_label = re.search( '.*_[AB]([A-Z0-9]{9})$', jsondata['BaseDir'] ).group(1) flowcell_url = self.get_flowcell_url_by_label(flowcell_label) From 0815e511b8566df7cf0b193c32d04613e3fd6db8 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:23:48 -0700 Subject: [PATCH 124/172] fix barcode count for masks with fewer lanes --- scripts/flowcells/barcode_count_from_stats_file.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/flowcells/barcode_count_from_stats_file.py b/scripts/flowcells/barcode_count_from_stats_file.py index 00e2933c..b14ba3c3 100644 --- a/scripts/flowcells/barcode_count_from_stats_file.py +++ b/scripts/flowcells/barcode_count_from_stats_file.py @@ -58,15 +58,22 @@ def main(): for conversion_result in idata["ConversionResults"]: lane_num = conversion_result["LaneNumber"] + lane_idx = None + for (i, olane) in enumerate(odata["Lanes"]): + if int(olane["LaneIndex"]) == int(lane_num): + lane_idx = i + break + if lane_idx is None: + logging.error("Lane %s not in odata", lane_num) for sample_info in conversion_result["DemuxResults"]: for metric_info in sample_info["IndexMetrics"]: # Get matching count barcode = metric_info["IndexSequence"].replace("+","") count = metric_info["MismatchCounts"]["0"] # Update out_data - odata["Lanes"][lane_num-1]["Counts"][barcode] = {"Total": count, "Pass": count} - odata["Lanes"][lane_num-1]["Total"] += count - odata["Lanes"][lane_num-1]["Pass"] += count + odata["Lanes"][lane_idx]["Counts"][barcode] = {"Total": count, "Pass": count} + odata["Lanes"][lane_idx]["Total"] += count + odata["Lanes"][lane_idx]["Pass"] += count print(json.dumps(odata)) From d89a9951a90f1f295df4ca467df589706fe1a72f Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 12 Mar 2024 11:25:31 -0700 Subject: [PATCH 125/172] altcode - 1.0 relase --- processes/altcode/process_altcode.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 19694a0e..76ec6c1e 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ 
-1,7 +1,7 @@ #!/bin/bash set -eo pipefail -version=1.0.0_alpha8 +version=1.0.0 cd "$(dirname "$0")" # Temporarily hardcoded! From 23a799c589ad8708c33eb8ea0d25f8c00baa686b Mon Sep 17 00:00:00 2001 From: Audra Johnson Date: Sun, 24 Mar 2024 08:27:59 -0700 Subject: [PATCH 126/172] Support Novaseq 10B --- scripts/flowcells/setup.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 0f8d0217..253b60db 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -364,6 +364,21 @@ case $run_type in unaligned_command=$novaseq_bcl_command ;; +"NovaSeq X 10B") + echo "NovaSeq X: 10B" + unset demux + parallel_env="-pe threads 6" + link_command=$novaseq_link_command + samplesheet="SampleSheet.csv" + fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! + bc_flag="--novaseq" + queue="hpcz-2" + python "python $STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + bcl_tasks=1 + unaligned_command=$novaseq_bcl_command + +;; + "Novaseq 6000 SP") echo "Novaseq 6000: SP (non-pooled)" unset demux From 87b5cd80467edaf7d043580856f9934cc4745483 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 24 Mar 2024 08:29:05 -0700 Subject: [PATCH 127/172] Fix barcode linking for LPs --- scripts/flowcells/link_nextseq.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 0ee5171f..91f92ee4 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -78,10 +78,17 @@ def create_links( if lane.get("alignments"): sample_name = lane["alignments"][0]["sample_name"] else: - bc1 = lane["barcode1"]["sequence"] if lane.get("barcode1") else "" - bc2 = lane["barcode2"]["sequence"] if lane.get("barcode2") else "" + bc1 = lane["barcode1"]["reverse_sequence"] if lane.get("barcode1") else "" + bc2 = lane["barcode2"]["reverse_sequence"] if lane.get("barcode2") else "" lane_num = int(lane["lane"]) - sample_name = "%s_%s_%s_L%03d" % (short_name, bc1, bc2, lane_num) + if bc1 and bc2: + sample_name = "%s_%s_%s_L%03d" % (short_name, bc1, bc2, lane_num) + elif bc1: + sample_name = "%s_%s_L%03d" % (short_name, bc1, lane_num) + elif bc2: + sample_name = "%s_%s_L%03d" % (short_name, bc2, lane_num) + else: + sample_name = "%s_L%03d" % (short_name, lane_num) if lane.get("library_pool"): From b4b6fbcac7b359205accf91519c9bea1c13bee97 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 24 Mar 2024 08:37:46 -0700 Subject: [PATCH 128/172] Simplify make_samplesheets.py with new pool system --- scripts/flowcells/make_samplesheets.py | 69 ++++++-------------------- 1 file changed, 16 insertions(+), 53 deletions(-) diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index d2fa8306..eea44fa9 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -39,64 +39,27 @@ def parser_setup(): def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2: bool) -> "[dict]": assignments = [] - # Initialize our library pool lookup tables - pools = data["library_pools"] - libs_to_pools = dict() - for (pool_name, pool_data) in pools.items(): - for lib_str in pool_data["libraries"]: - lib_num = int(re.sub("[A-Z]", "", lib_str)) - if lib_num in libs_to_pools: - raise Exception("library {} in more than one pool".format(lib_str)) - libs_to_pools[lib_num] = (pool_name, - 
pool_data.get("barcode1"), - pool_data.get("barcode2")) # This will store our pool samplesheet lines pool_assignment_set = set() for libdata in data["libraries"]: - # Skip libraries in pools - lib_num = libdata.get('library') - pool_data = libs_to_pools.get(lib_num) - if pool_data is None: - assignment = { - "lane": libdata.get("lane"), - "sample": libdata.get("samplesheet_name"), - "barcode1": "", - "barcode2": "", - } - if libdata.get("barcode1") is not None: - assignment["barcode1"] = libdata["barcode1"]["reverse_sequence"] if reverse_barcode1 else libdata["barcode1"]["sequence"] - if libdata.get("barcode2") is not None: - assignment["barcode2"] = libdata["barcode2"]["reverse_sequence"] if reverse_barcode2 else libdata["barcode2"]["sequence"] - - assignments.append(assignment) - else: - pool_assignment_set.add( - (libdata.get("lane"), *libs_to_pools[lib_num]) - ) - - # a quick little inner function to reverse complement - # a sequence and return the string of that - def reverse_complement(sequence: str) -> str: - seq = Seq(sequence) - return str(seq.reverse_complement()) - - # Turn set of tuples into list of dicts - pool_assignments = [{ - "lane": a[0], - "sample": a[1], - # Okay so we're trying to do the same with these as we do with - # the library barcodes including following the reverse instructions - # and these come reversed in the processing.json - # eventually we might want to change the processing.json to have both versions at hand - # like we do for libraries - # and remove the dependency on biopython - "barcode1": a[2] if reverse_barcode1 else reverse_complement(a[2]), - "barcode2": a[3] if reverse_barcode2 else reverse_complement(a[3]), - } for a in pool_assignment_set] - - return assignments + pool_assignments + assignment = { + "lane": libdata.get("lane"), + "sample": libdata.get("samplesheet_name"), + "barcode1": "", + "barcode2": "", + } + if assignment["sample"] == "None": + assignment["sample"] = "LANE%d" % libdata["id"] + if libdata.get("barcode1") is not None: + assignment["barcode1"] = libdata["barcode1"]["reverse_sequence"] if reverse_barcode1 else libdata["barcode1"]["sequence"] + if libdata.get("barcode2") is not None: + assignment["barcode2"] = libdata["barcode2"]["reverse_sequence"] if reverse_barcode2 else libdata["barcode2"]["sequence"] + + assignments.append(assignment) + + return assignments def make_samplesheet_header(name: str, date: str) -> str: From 3cd1587e43b9f598654ba53f1c8a2c11b0c46e3d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 8 Apr 2024 14:30:03 -0700 Subject: [PATCH 129/172] Altcode pipeline - some more completion checks --- processes/altcode/process_altcode.bash | 46 +++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/processes/altcode/process_altcode.bash b/processes/altcode/process_altcode.bash index 76ec6c1e..f1c541ac 100755 --- a/processes/altcode/process_altcode.bash +++ b/processes/altcode/process_altcode.bash @@ -70,6 +70,13 @@ umi_pos: 0 umi_len: 10 PARAMS_YAML +# Let LIMS know the alignment is starting +python3 "$STAMPIPES/scripts/lims/upload_data.py" \ + --api "$LIMS_API_URL" \ + --token "$LIMS_API_TOKEN" \ + --alignment_id "$ALIGNMENT_ID" \ + --start_alignment_progress + # Run :) nextflow run \ "$STAMPIPES/processes/altcode/altcode.nf" \ @@ -80,11 +87,37 @@ nextflow run \ -profile cluster \ -resume -## Upload fastq metadata -#python "$STAMPIPES/scripts/altseq/upload_data.py" \ - #"$sample_config" \ - #processing.json \ - #--output_file_directory "$outdir" + +require_file(){ + if ! 
[[ -s "$1" ]] ; then + echo "ERROR: File '$1' does not exist or is zero-size. Alignment did not complete successfully." + exit 1 + fi +} + +# Verify that output files exist +require_file "$outdir/Aligned.out.cram" +require_file "$outdir/Aligned.out.cram.crai" +require_file "$outdir/Solo.out/Barcodes.stats" + +samtools quickcheck "$outdir/Aligned.out.cram" +for d in Gene GeneFull GeneFull_Ex50pAS GeneFull_ExonOverIntron ; do + for statsfile in CellReads.stats Features.stats Summary.csv UMIperCellSorted.txt ; do + require_file "$outdir/Solo.out/$d/$statsfile" + done + for mtx in matrix UniqueAndMult-EM UniqueAndMult-PropUnique UniqueAndMult-Rescue UniqueAndMult-Uniform ; do + require_file "$outdir/Solo.out/$d/raw/$mtx.h5ad" + done + +done + +# Mark as completed in LIMS +python3 "$STAMPIPES/scripts/lims/upload_data.py" \ + --api "$LIMS_API_URL" \ + --token "$LIMS_API_TOKEN" \ + --alignment_id "$ALIGNMENT_ID" \ + --finish_alignment + # Create sentinel/status file if [[ -e "$status_file" ]] ; then @@ -92,9 +125,6 @@ if [[ -e "$status_file" ]] ; then old_status_file=${status_file/json/$old_date}.json mv "$status_file" "$old_status_file" fi - -samtools quickcheck "$outdir/Aligned.out.cram" - # TODO: What else do we want to capture here? It would be nice to at least # capture the command used and relevant env vars echo | jq . > "$status_file" < Date: Mon, 8 Apr 2024 14:47:31 -0700 Subject: [PATCH 130/172] upload_data.py: switch to bulk-count upload endpoint --- scripts/lims/upload_data.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index c33bb609..0daec8a8 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -668,9 +668,13 @@ def bulk_upload_counts(self, alignment_id, stats): def upload_counts(self, alignment_id, counts_file): parsed = self.parse_counts(counts_file) - #response = self.bulk_upload_counts(alignment_id, self.parse_counts(counts_file)) - #if response is None: - #log.error("Upload failed: Counts file {} for ALN{}".format(counts_file, alignment_id)) + response = self.bulk_upload_counts(alignment_id, self.parse_counts(counts_file)) + if response is None: + log.error("Bulk upload failed: Counts file {} for ALN{}".format(counts_file, alignment_id)) + else: + log.info("Upload successful.") + return + # TODO: Remove below code #log.warn("Counts: %s", self.get_list_result( # 'flowcell_lane_count/', query={"alignment":alignment_id} From 74ef1a5c007e6889bd6a00b39670bdeeaaf331c2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 8 Apr 2024 14:49:02 -0700 Subject: [PATCH 131/172] altcode: stringify NaNs in stat upload --- processes/altcode/bin/summarize_stats.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index 9302c105..8f29c38b 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -4,6 +4,7 @@ import csv import json import logging +import math import os import pathlib import pprint @@ -44,11 +45,19 @@ def parse_summary_stats(filename): with open(filename) as f: data = {} for line in f: - (key, val) = line.strip().split(",") + (key, orig_val) = line.strip().split(",") + val = orig_val try: val = int(val) except ValueError: - val = float(val) + try: + val = float(val) + if math.isnan(val) or math.isinf(val): + # If NaN or Inf, leave as string + # because the python json module fucks those up + val = orig_val + 
except ValueError: + pass data[key] = val return data From 4871c83c206649fff968ccec79921dd2bc750483 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 8 Apr 2024 14:53:58 -0700 Subject: [PATCH 132/172] alignprocess.py: don't crash if library kit method missing --- scripts/alignprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 3c32b48e..82f40b71 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -352,7 +352,7 @@ def create_script(self, processing_info, align_id): env_vars["GENOME"] = alignment['genome_index'] env_vars["ASSAY"] = lane['assay'] env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] - if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0]['library_kit_method']: + if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0].get('library_kit_method'): env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' else: env_vars["LIBRARY_KIT"] = None From 209cdf5ed597a91f1c53f43ae28bed83eaf5f378 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 8 Apr 2024 14:55:39 -0700 Subject: [PATCH 133/172] Fix - vs _ confusion in fastq file names --- scripts/apilaneprocess.py | 2 +- scripts/flowcells/link_nextseq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index da1e0490..d2be1164 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -187,7 +187,7 @@ def setup_lane(self, lane_id): bc1 = self.api.get_single_result(url=pool_data["barcode1"])["reverse_sequence"] if pool_data["barcode2"]: bc2 = self.api.get_single_result(url=pool_data["barcode2"])["reverse_sequence"] - barcode = "_".join(bc for bc in [bc1, bc2] if bc) + barcode = "-".join(bc for bc in [bc1, bc2] if bc) POOL_INFO[pool_name] = {"barcode": barcode} self.create_script(processing_info, pool_name) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 91f92ee4..c0adb315 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -82,7 +82,7 @@ def create_links( bc2 = lane["barcode2"]["reverse_sequence"] if lane.get("barcode2") else "" lane_num = int(lane["lane"]) if bc1 and bc2: - sample_name = "%s_%s_%s_L%03d" % (short_name, bc1, bc2, lane_num) + sample_name = "%s_%s-%s_L%03d" % (short_name, bc1, bc2, lane_num) elif bc1: sample_name = "%s_%s_L%03d" % (short_name, bc1, lane_num) elif bc2: From aa20f52c19192c8458d13fe9d5b6685b8f181c70 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 8 Apr 2024 14:58:00 -0700 Subject: [PATCH 134/172] poolprocess.py - keep going even if errors encountered --- scripts/poolprocess.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 42523860..46ef62dc 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -205,6 +205,7 @@ def get_process_template(self, align_id, process_template_id): # Run alignment setup in parallel def setup_alignments(self, align_ids, parallel=True): all_okay = True + logging.info("Setting up alignments: %s", align_ids) if parallel: for id, error in self.pool.map(self.setup_alignment, align_ids): if error: @@ -213,7 +214,8 @@ def setup_alignments(self, align_ids, parallel=True): else: logging.debug("ALN%d result received, OK" % id) if not all_okay: - logging.critical("Errors during setup, 
exiting") + #logging.critical("Errors during setup, exiting") + logging.error("Errors during setup, but continuing with other alignments") # Sequential version, helpful for debugging else: for aln_id in align_ids: @@ -232,7 +234,7 @@ def setup_alignment(self, align_id): logging.info("Skipping completed alignment %d" % align_id) return (align_id, None) except Exception as e: - logging.exception("Could not set up alignment %d}: (%s)" % (align_id, e)) + logging.exception("Could not set up alignment %s}: (%s)" % (align_id, e)) return (align_id, e) def get_lane_file(self, lane_id, purpose): From 60a6d295b203f37fcdb7f3de58c9559ee5c0c873 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 9 Apr 2024 09:19:16 -0700 Subject: [PATCH 135/172] poolprocess.py - add sublibrary and cell_library --- scripts/poolprocess.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 46ef62dc..c94415d6 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -312,7 +312,7 @@ def extract_id_from_url(url): sl_info = self.api_single_result(url=lp_info['sublibrary']) cl_info = self.api_single_result(url=sl_info['cell_library']) lib_ids = [extract_id_from_url(lib_url) for lib_url in cl_info["libraries"]] - + for lib_url in cl_info["libraries"]: library_info.add((lib_url, lane_lane)) @@ -338,7 +338,7 @@ def extract_id_from_url(url): LANE_ID_TO_ALN_IDS[lane_id].append(aln['id']) if lane_id in LANES_WITH_DIRECT_POOL: direct_alns.add(aln['id']) - + # Find the minimum alignment ID for each pool/lane combination lowest_aln_for_pool = {pool_key: None for pool_key in POOL_KEY_TO_LIB_IDS.keys()} @@ -569,6 +569,7 @@ def create_script(self, processing_info, align_id): if self.dry_run: logging.info("Dry run, would have created: %s" % script_file) logging.debug(env_vars) + self.create_sample_config(processing_info, alignment, script_directory, pool_name) return True if not os.path.exists(script_directory): @@ -902,6 +903,28 @@ def get_num(tl): # (talen_orig, talen_new) = parse_talen_names_from_tc_notes(tc_info["notes"]) + def get_sbl_and_cl(pool_name): + (sbl, cl) = (None, None) + try: + m = re.match(r"LP(\d+)", pool_name) + if not m: + add_error("Pool name '%s' not valid, can't get SBL&CL", pool_name) + return (None, None) + pool_id = int(m.group(1)) + pool_info = self.api_list_result("library_pool/?number=%d" % pool_id)[0] + if not pool_info.get("sublibrary"): + return (None, None) + sbl_info = self.api_single_result(url=pool_info.get("sublibrary")) + sbl = sbl_info["object_name"] + if not sbl_info.get("cell_library"): + return (sbl, None) + cl_info = self.api_single_result(url=sbl_info.get("cell_library")) + cl = cl_info.get("object_name") + except Exception as e: + add_error("Error finding SBL or CL: %s", e) + return (sbl, cl) + (sbl_name, cl_name) = get_sbl_and_cl(pool_name) + info = { "sequencing_barcode_well": seq_well_label, "sequencing_barcode_plate": seq_well_plate, @@ -916,6 +939,8 @@ def get_num(tl): "tale_target_name": "TODO", "tale_target_master_gene_id": "TODO", "effector_purpose": "TODO", + "cell_library": cl_name, + "sublibrary": sbl_name, "library_pool": pool_name, "TC#": "TC%d" % tc_info["number"], "DS#": "DS%d" % sample_info["number"], @@ -933,11 +958,14 @@ def get_num(tl): flowcell_label = "FC%s" % processing_info["flowcell"]["label"] libraries = [] - + for lib_id in lib_ids: libraries.append(build_library_info(lib_id, flowcell_label)) data = {"libraries": libraries} + if self.dry_run: + 
logging.info("dry_run, would have written %s/pool_info.json", script_directory) + return # do stuff with open("%s/pool_info.json" % script_directory, "w") as out: json.dump(data, out, indent=2, sort_keys=True) From 3ed1b27f1878085319cd1514252b7a94899854c5 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 23 Apr 2024 13:35:22 -0700 Subject: [PATCH 136/172] Fix mistaken lane merging/duplication --- scripts/flowcells/link_nextseq.py | 19 +++++++++++++------ scripts/flowcells/setup.sh | 10 +++++----- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index c0adb315..f9aedfe9 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -52,6 +52,11 @@ def parser_setup(): dest="processing_file", help="The processing_file to use as a guide.", ) + parser.add_argument( + "--merge-across-lanes", + action="store_true", + help="Merge across physical flowcell lanes. (Recommended for nextseq, not for Novaseq)", + ) parser.add_argument( "--dry-run", @@ -67,7 +72,7 @@ def parser_setup(): def create_links( - lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False, is_pool=False, + lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False, is_pool=False, merge_across_lanes=False, ): """ Create the links between the input directories and output dir @@ -107,8 +112,9 @@ def create_links( ) short_name = re.sub(r"_", "-", short_name) + lane_lane = "*" if (merge_across_lanes or "lane" not in lane) else "_L%03d" % int(lane["lane"]) input_wildcard = os.path.join( - input_basedir, "%s_S*_%s_???.fastq.gz" % (short_name, read) + input_basedir, "%s_S*%s_%s_???.fastq.gz" % (short_name, lane_lane, read) ) if not dry_run and not os.path.isdir(output_dir): @@ -153,8 +159,8 @@ def main(): data = json.loads(open(poptions.processing_file, "r").read()) for lane in data["libraries"]: - create_links(lane, "R1", input_dir, poptions.output_dir, poptions.dry_run) - create_links(lane, "R2", input_dir, poptions.output_dir, poptions.dry_run) + create_links(lane, "R1", input_dir, poptions.output_dir, poptions.dry_run, merge_across_lanes=poptions.merge_across_lanes) + create_links(lane, "R2", input_dir, poptions.output_dir, poptions.dry_run, merge_across_lanes=poptions.merge_across_lanes) undet_lane = { "alignments": [{"sample_name": "lane1_Undetermined_L001"}], @@ -162,7 +168,8 @@ def main(): } for read in ["R1", "R2"]: create_links( - undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, undetermined=True + undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, undetermined=True, + merge_across_lanes=poptions.merge_across_lanes, ) # Set up conversion table @@ -188,7 +195,7 @@ def main(): } for read in ["R1", "R2"]: create_links( - lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True + lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True, merge_across_lanes=poptions.merge_across_lanes, ) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 253b60db..f2ef52ed 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -397,7 +397,7 @@ case $run_type in echo "Regular NextSeq 500 run detected" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . 
--merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--nextseq" @@ -422,7 +422,7 @@ case $run_type in # Identical to nextseq processing echo "High-output MiniSeq run detected for DNase" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -435,7 +435,7 @@ case $run_type in # Identical to nextseq processing echo "Mid-output MiniSeq run detected for GUIDEseq" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -457,7 +457,7 @@ _U_ # Identical to nextseq processing echo "Mid-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -478,7 +478,7 @@ _U_ # Identical to nextseq processing echo "High-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" From 0b0ebda3f858a45e102e515b8d5d50c995f524cb Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 23 Apr 2024 13:37:38 -0700 Subject: [PATCH 137/172] NovaSeq 10B fix: backport from prod --- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index f2ef52ed..e4e1a7bc 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -373,7 +373,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--novaseq" queue="hpcz-2" - python "python $STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 unaligned_command=$novaseq_bcl_command From 4491a790f3a4b311403ea5e71a1b8b4bd4c1bdac Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 21 May 2024 12:54:29 -0700 Subject: [PATCH 138/172] Add support for Novaseq X 25B --- scripts/flowcells/setup.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index e4e1a7bc..203fecc4 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -377,6 +377,20 @@ case $run_type in bcl_tasks=1 unaligned_command=$novaseq_bcl_command +;; +"NovaSeq X 25B") + echo "NovaSeq X: 25B" + unset demux + parallel_env="-pe threads 6" + link_command=$novaseq_link_command + samplesheet="SampleSheet.csv" + fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! + bc_flag="--novaseq" + queue="hpcz-2" + python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + bcl_tasks=1 + unaligned_command=$novaseq_bcl_command + ;; "Novaseq 6000 SP") From e4c3b93e5473c806e742dd42d898104a3f61c82d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 3 Jun 2024 09:23:05 -0700 Subject: [PATCH 139/172] upload_data.py: wrap bulk_upload stats in list It works now. :) --- scripts/lims/upload_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 0daec8a8..4f60af32 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -658,11 +658,11 @@ def upload_barcode_report(self, barcode_file): def bulk_upload_counts(self, alignment_id, stats): # TODO: This isn't ready yet. - data = { + data = [{ "object_id": alignment_id, "content_type": "flowcelllanealignment", "stats": stats, - } + }] response = self.api.post_single_result(url_addition="stat/create", json=data) return response From ffba535a4da226a462e68c46d5e7d38c402ed989 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 9 Jun 2024 09:01:53 -0700 Subject: [PATCH 140/172] Fix for NovaSeq run type --- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 203fecc4..95060b28 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -202,7 +202,7 @@ run_type=$( jq -r '.flowcell.run_type' "$json" ) has_umi=$( jq -r '.libraries | map(.barcode1.umi) | any' "$json") # Novaseq runs always use native bcl2fastq demuxing -if [[ $run_type =~ Novaseq ]] ; then +if [[ $run_type =~ Novaseq ]] || [[ $run_type =~ NovaSeq ]] ; then unset demux fi From b4ed09f9d5c857af29e10ca9adc92644603eff12 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 13 Jun 2024 11:30:38 -0700 Subject: [PATCH 141/172] demuxing novaseq: process at lane/samplesheet level This change speeds up processing significantly - we run one job for each Samplesheet.csv / lane combination, rather than one big long job. We also give each job 40 cores, instead of 20. 
--- scripts/flowcells/setup.sh | 104 +++++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 23 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 95060b28..e8d297ee 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -291,6 +291,50 @@ read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ done _NOVA_BCL_CMD_ +# TODO: Remove hardcoded queue here! +# The issue that that 'queue' isn't set until later in the script, but is needed for NOVA_SUBMIT_CMD +queue=hpcz-2 + +# This is a variant where we submit one job for each lane +read -d '' novaseq_submit_command <<_NOVA_SUBMIT_CMD_ +# Run bcl2fastq in parallel, for each samplesheet and lane +PROCESSING= +for samplesheet in SampleSheet.withmask*csv ; do + for lane in {1..8} ; do + # TODO: Skip submission if lane not in this samplesheet + if ! (cut -d, -f1 \$samplesheet | sort -u | grep -q \$lane) ; then + echo "Lane \$lane not in samplesheet \$samplesheet, skipping" + continue + fi + bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) + fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask-lane-00\$lane") + jobname=u-$flowcell-\$bcl_mask-L00\$lane + bcl_jobid=\$(sbatch --export=ALL -J "\$jobname" -o "\$jobname.o%A" -e "\$jobname.e%A" --partition=$queue --ntasks=1 --cpus-per-task=40 --mem-per-cpu=8000 --parsable --oversubscribe <<__FASTQ__ +#!/bin/bash + set -x -e -o pipefail + cd "${illumina_dir}" + PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH + bcl2fastq \\\\ + --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ + --output-dir "${illumina_dir}/\\\$fastq_dir" \\\\ + --use-bases-mask "\\\$bcl_mask" \\\\ + --tiles "s_\\\$lane" \\\\ + --barcode-mismatches "$mismatches" \\\\ + --sample-sheet "${illumina_dir}/\$samplesheet" \\\\ + --writing-threads 0 \\\\ + --loading-threads \\\\\$SLURM_CPUS_PER_TASK \\\\ + --processing-threads \\\\\$SLURM_CPUS_PER_TASK +__FASTQ__ +) + PROCESSING="\$PROCESSING,\$bcl_jobid" + done +done +if [[ -n "\$PROCESSING" ]]; then + bcl_dependency=\$(echo \$PROCESSING | sed -e 's/,/,afterok:/g' | sed -e 's/^,afterok/--dependency=afterok/g') +fi + +_NOVA_SUBMIT_CMD_ + read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' for fq_dir in fastq-withmask-* ; do [[ -d $fq_dir ]] || continue @@ -319,7 +363,8 @@ case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; "Novaseq 6000 S2") @@ -333,7 +378,8 @@ case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; "Novaseq 6000 S4") @@ -347,7 +393,8 @@ case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; "NovaSeq X 1.5B") @@ -361,7 +408,8 @@ case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; "NovaSeq X 10B") @@ -375,7 +423,8 @@ 
case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; "NovaSeq X 25B") @@ -389,7 +438,8 @@ case $run_type in queue="hpcz-2" python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 - unaligned_command=$novaseq_bcl_command + #unaligned_command=$novaseq_bcl_command + submit_bcl2fastq_cmd=$novaseq_submit_command ;; @@ -568,7 +618,7 @@ flowcell_id=$( curl \ # The final script is below: if [[ -n "$minidemux" ]]; then - + # If miniseq, demux flowcell with fixed/known barcodes. cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" @@ -615,7 +665,29 @@ __FASTQ__ __BCL2FASTQ__ -else +else # If not miniseq + +# Default (slow) bcl2fastq cmd +if [[ -z "$submit_bcl2fastq_cmd" ]] ; then + # If we haven't created the submit command yet, wrap up the unaligned_command + # This is the "old" way of submitting one job that does the whole flowcell + submit_bcl2fastq_cmd=<<__SUBMIT_BCL2FASTQ_CMD__ +# bcl2fastq +bcl_jobid=\$(sbatch --export=ALL -J "u-$flowcell" -o "u-$flowcell.o%A" -e "u-$flowcell.e%A" --partition=$queue --ntasks=1 --cpus-per-task=20 --mem-per-cpu=8000 --parsable --oversubscribe <<'__FASTQ__' +#!/bin/bash +set -x -e -o pipefail +cd "$illumina_dir" + +$unaligned_command +__FASTQ__ +) +# Wait for bcl2fastq to complete +if [[ -n \$bcl_jobid ]]; then + bcl_dependency=\$(echo \$bcl_jobid | sed -e 's/^/--dependency=afterok:/g') +fi +__SUBMIT_BCL2FASTQ_CMD__ +fi + # Not miniseq cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash @@ -660,21 +732,7 @@ __BARCODES__ PROCESSING="\$PROCESSING,\$bcjobid" done -# bcl2fastq -bcl_jobid=\$(sbatch --export=ALL -J "u-$flowcell" -o "u-$flowcell.o%A" -e "u-$flowcell.e%A" --partition=$queue --ntasks=1 --cpus-per-task=20 --mem-per-cpu=8000 --parsable --oversubscribe <<'__FASTQ__' -#!/bin/bash - -set -x -e -o pipefail -cd "$illumina_dir" - -$unaligned_command - -__FASTQ__ -) - -if [[ -n \$bcl_jobid ]]; then - bcl_dependency=\$(echo \$bcl_jobid | sed -e 's/^/--dependency=afterok:/g') -fi +$submit_bcl2fastq_cmd sbatch --export=ALL -J queuedemux-$flowcell -o "queuedemux-$flowcell.o%A" -e "queuedemux-$flowcell.e%A" \$bcl_dependency --partition $queue --ntasks=1 --cpus-per-task=1 --mem-per-cpu=1000 --parsable --oversubscribe <<__PART2__ #!/bin/bash From 3dceeee6b835cf565a76aabfa7e409734b5bc647 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 13 Jun 2024 11:33:36 -0700 Subject: [PATCH 142/172] Remove TODO that's already to-done --- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index e8d297ee..25aa3514 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -301,7 +301,7 @@ read -d '' novaseq_submit_command <<_NOVA_SUBMIT_CMD_ PROCESSING= for samplesheet in SampleSheet.withmask*csv ; do for lane in {1..8} ; do - # TODO: Skip submission if lane not in this samplesheet + # Skip submission if lane not in this samplesheet if ! 
(cut -d, -f1 \$samplesheet | sort -u | grep -q \$lane) ; then echo "Lane \$lane not in samplesheet \$samplesheet, skipping" continue From fa094044736e6c4e7dfb116179fe26d3c3810815 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 27 Jun 2024 11:56:25 -0700 Subject: [PATCH 143/172] setup.sh - fix copy for FC with no samples This should fix the copy error for flowcells that have only LibraryPools, and no traditional Samples to copy. --- scripts/flowcells/setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 25aa3514..369db55f 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -804,6 +804,7 @@ rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" ( cd "$copy_from_dir" for dir in Project*/Sample* ; do + [[ -d "\$dir" ]] || continue samp_number=\$(sed 's/.*DS\([0-9]*\).*/\1/' <<< "\$dir") [[ -n "\$samp_number" ]] destination=\$(jq -c -r ".libraries[] | select(.sample == \$samp_number) | .project_share_directory" ../processing.json) From 7be986db8f1ad3922c9716bc814580b666158b18 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 27 Jun 2024 12:01:26 -0700 Subject: [PATCH 144/172] Revert "setup.sh - fix copy for FC with no samples" This reverts commit fa094044736e6c4e7dfb116179fe26d3c3810815. Shouldn't have pushed directly to main, reverting in favor of PR. --- scripts/flowcells/setup.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 369db55f..25aa3514 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -804,7 +804,6 @@ rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" ( cd "$copy_from_dir" for dir in Project*/Sample* ; do - [[ -d "\$dir" ]] || continue samp_number=\$(sed 's/.*DS\([0-9]*\).*/\1/' <<< "\$dir") [[ -n "\$samp_number" ]] destination=\$(jq -c -r ".libraries[] | select(.sample == \$samp_number) | .project_share_directory" ../processing.json) From 385f655ff65c78241596402dca57108d64913f31 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 27 Jun 2024 12:04:59 -0700 Subject: [PATCH 145/172] Fix for copy for Flowcells without samples Some flowcells have only LibraryPools, and no traditional samples. This change should fix processing for those flowcells so that they no longer fail to copy. --- scripts/flowcells/setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 25aa3514..9503753e 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -804,6 +804,7 @@ rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" ( cd "$copy_from_dir" for dir in Project*/Sample* ; do + [[ -d \$dir ]] || continue samp_number=\$(sed 's/.*DS\([0-9]*\).*/\1/' <<< "\$dir") [[ -n "\$samp_number" ]] destination=\$(jq -c -r ".libraries[] | select(.sample == \$samp_number) | .project_share_directory" ../processing.json) From 5bcb116cc79fe9d0294f21ac9c42df1e6793d228 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 7 Jul 2024 06:59:59 -0700 Subject: [PATCH 146/172] Update collate wait to include LPs Thanks to Richard for finding this! 
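
For context: collate jobs are submitted with the .collatefq qsub prefix, and
pool collations carry an LP name where sample collations carry a DS name, so
the old DS-only pattern could let this loop stop waiting while pool jobs were
still queued. A quick manual sanity check of what is still running for a
flowcell (the label and job names here are illustrative, and the grep below
deliberately uses a looser '.*' than the pattern in the diff):

    # e.g. job names like .collatefqDS12345-FCABC123 or .collatefqLP678-FCABC123
    flowcell=ABC123          # illustrative label
    squeue -o "%j" | grep "collatefq.*$flowcell"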
--- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 9503753e..88c42297 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -890,7 +890,7 @@ python3 "$STAMPIPES/scripts/apilaneprocess.py" \ bash collate.bash # Wait for collation jobs to finish -while ( squeue -o "%j" | grep -q '^.collatefqDS.*$flowcell') ; do +while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do sleep 60 done From b683211c222380addc9cbaab3d181e26753f924b Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 7 Jul 2024 07:01:22 -0700 Subject: [PATCH 147/172] Remove duplicate collate/fastqc-script creation This block is repeated at the beginning of the collation step, and so doesn't need to be run here, at the end of the copy script. --- scripts/flowcells/setup.sh | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 88c42297..d85fc299 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -829,30 +829,6 @@ rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" done ) - -# create fastqc and collation scripts -cd "$analysis_dir" -# Remove existing scripts if they exist (to avoid appending) -rm -f fastqc.bash collate.bash run.bash - -# Create fastqc scripts -python3 "$STAMPIPES/scripts/apilaneprocess.py" \ - --script_template "$STAMPIPES/processes/fastq/fastqc.bash" \ - --qsub-prefix .fq \ - --queue $queue \ - --sample-script-basename fastqc.bash \ - --flowcell_label "$flowcell" \ - --outfile fastqc.bash - -# Create collation scripts -python3 "$STAMPIPES/scripts/apilaneprocess.py" \ - --script_template "$STAMPIPES/processes/fastq/collate_fastq.bash" \ - --qsub-prefix .collatefq \ - --queue $queue \ - --sample-script-basename "collate.bash" \ - --flowcell_label "$flowcell" \ - --outfile collate.bash - __COPY__ ) From 92f60590f6101eca17f41f8d130142e5a2ed5b88 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 10:22:18 -0700 Subject: [PATCH 148/172] style: Format python codebase with ruff --- processes/altcode/bin/mtx_to_h5.py | 17 +- processes/altcode/bin/summarize_stats.py | 51 +- processes/altseq/bin/analyze.py | 20 +- processes/altseq/bin/generate_counts_json.py | 33 +- processes/bwa/aggregate/plot_footprints.py | 118 +-- scripts/aggregatecollate.py | 248 ++++-- scripts/aggregateprocess.py | 419 ++++++--- scripts/alignprocess.py | 401 ++++++--- scripts/altcode/upload_fastq.py | 71 +- scripts/altcode/upload_stats.py | 74 +- scripts/altseq/upload_data.py | 45 +- scripts/apilaneprocess.py | 278 ++++-- scripts/bam/bamfaiordercompare.py | 12 +- scripts/bam/mark_dups.py | 21 +- scripts/bam/move_umt_to_tag.py | 35 +- scripts/bam/random_reads.py | 18 +- .../browser/make_trackhubs_for_flowcell.py | 597 +++++++++---- .../browser/make_trackhubs_for_projects.py | 328 ++++--- .../make_browser_load.py | 678 ++++++++++----- scripts/browser/parse_all_projects.py | 44 +- scripts/bwa/aggregate/basic/sparse_motifs.py | 6 +- scripts/bwa/bamcounts.py | 145 ++-- scripts/bwa/filter_reads.py | 144 ++-- scripts/bwa/fix_bam_pairing.py | 122 +-- scripts/cluster/monitor_alignments.py | 122 ++- scripts/copy_notify.py | 75 +- scripts/create_processing.py | 325 ++++--- scripts/fastq/takara_umt.py | 34 +- scripts/flowcells/barcode_check.py | 78 +- .../barcode_count_from_stats_file.py | 88 +- scripts/flowcells/barcode_masks.py | 46 +- scripts/flowcells/barcode_report.py | 153 ++-- 
scripts/flowcells/demux_fastq.py | 186 ++-- scripts/flowcells/link_nextseq.py | 59 +- scripts/flowcells/link_rapidrun.py | 100 ++- scripts/flowcells/make_samplesheets.py | 132 ++- scripts/flowcells/max_mismatch.py | 102 ++- scripts/flowcells/test_barcode_masks.py | 54 +- scripts/helpers/expand_multiple_alignments.py | 92 +- scripts/laneprocess.py | 227 +++-- scripts/lims/aggregation/get_files.py | 143 ++-- scripts/lims/alignment/get_files.py | 117 ++- scripts/lims/create_altseq_sample_config.py | 57 +- scripts/lims/get_processing.py | 169 ++-- scripts/lims/movetag.py | 115 ++- scripts/lims/upload_aggregation_stats.py | 64 +- scripts/lims/upload_data.py | 800 +++++++++++------- scripts/poolprocess.py | 601 ++++++++----- scripts/umi/extract_umt.py | 48 +- scripts/umi/fastq_umi_add.py | 19 +- scripts/utility/lorentz.py | 7 +- scripts/utility/md5check.py | 14 +- scripts/utility/movesymlinks.py | 74 +- scripts/utility/picard_inserts_process.py | 42 +- scripts/versions.py | 5 +- 55 files changed, 5232 insertions(+), 2841 deletions(-) diff --git a/processes/altcode/bin/mtx_to_h5.py b/processes/altcode/bin/mtx_to_h5.py index cf3e8d0b..9cfe7120 100755 --- a/processes/altcode/bin/mtx_to_h5.py +++ b/processes/altcode/bin/mtx_to_h5.py @@ -12,8 +12,9 @@ def parser_setup(): parser.add_argument("mtx_directory", help="the directory containing the mtx files") parser.add_argument("output", help="the name of the output file") parser.add_argument("--metadata", help="A JSON-formatted file of metadata to add") - parser.add_argument("--compress", action="store_true", - help="Compress output with gzip") + parser.add_argument( + "--compress", action="store_true", help="Compress output with gzip" + ) return parser @@ -21,15 +22,13 @@ def lists_to_dicts(data): # Recursively converts lists to dicts # Required because of this issue https://github.com/scverse/anndata/issues/708 if isinstance(data, list): - return { - f"_{idx}": lists_to_dicts(elem) - for idx, elem in enumerate(data) - } + return {f"_{idx}": lists_to_dicts(elem) for idx, elem in enumerate(data)} if isinstance(data, dict): for key in list(data.keys()): data[key] = lists_to_dicts(data[key]) return data + def convert(input_dir, output_file, compress=False, metadata=None): data = sc.read_10x_mtx(input_dir, cache=False) if metadata is not None: @@ -39,6 +38,7 @@ def convert(input_dir, output_file, compress=False, metadata=None): comp_method = "gzip" if compress else None data.write(filename=output_file, compression=comp_method) + def main(): poptions = parser_setup().parse_args() if not poptions.output.endswith("h5ad"): @@ -52,7 +52,10 @@ def main(): else: metadata = None - convert(poptions.mtx_directory, poptions.output, poptions.compress, metadata=metadata) + convert( + poptions.mtx_directory, poptions.output, poptions.compress, metadata=metadata + ) + if __name__ == "__main__": main() diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index 8f29c38b..64ffa9e5 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -12,6 +12,7 @@ from collections import defaultdict + def parse_args(): parser = argparse.ArgumentParser( prog="summarize_stats.py", @@ -33,7 +34,7 @@ def parse_cellreads(filename): with open(filename) as f: data = [] for row in csv.DictReader(f, delimiter="\t"): - for (k, v) in row.items(): + for k, v in row.items(): try: row[k] = int(v) except: @@ -41,6 +42,7 @@ def parse_cellreads(filename): data.append(row) return data + def 
parse_summary_stats(filename): with open(filename) as f: data = {} @@ -61,6 +63,7 @@ def parse_summary_stats(filename): data[key] = val return data + def parse_barcode_stats(filename): with open(filename) as f: data = {} @@ -70,12 +73,16 @@ def parse_barcode_stats(filename): data[key] = val return data + REVCOM = {"A": "T", "T": "A", "C": "G", "G": "C"} + + def revcom(bc): if bc is None: return None return "".join(REVCOM[x] for x in reversed(bc)) + def summarize_by_library(pool_info, stats): """ Stats is a list of observed cell barcodes & how many we saw / how well they @@ -85,22 +92,25 @@ def summarize_by_library(pool_info, stats): For example, a library with barcode2 = "TTTAAGCG" will contain all cells that end with "_CGCTTAAA" (the reverse complement) """ + def build_barcode_to_library_lookup(pool_info, stats): barcode_to_library = {} for lib in pool_info["libraries"]: - #bc = revcom(lib["barcode2"]) + # bc = revcom(lib["barcode2"]) # Some old backward-compatibility # Newer stuff is at the top of the list if "sample_barcode" in lib: bc = lib["sample_barcode"] elif "barcode2" in lib: bc = revcom(lib["sample_barcode"]) - elif "additional_information" in lib and "barcode2" in lib["additional_information"]: + elif ( + "additional_information" in lib + and "barcode2" in lib["additional_information"] + ): bc = revcom(lib["additional_information"]["sample_barcode"]) barcode_to_library[bc] = lib["LN#"] return barcode_to_library - # Stub out keys data = { "barcode_mapping": {}, @@ -122,20 +132,24 @@ def build_barcode_to_library_lookup(pool_info, stats): (_, _1, bc) = total_bc.split("_") if bc not in libraries: libraries[bc] = defaultdict(int) - for (k, v) in cell.items(): + for k, v in cell.items(): if k == "CB": continue libraries[bc][k] += int(v) # Convert back to strings (ew) for bc in libraries: - for (k, v) in libraries[bc].items(): + for k, v in libraries[bc].items(): libraries[bc][k] = str(v) pool_set = set(lib["library_pool"] for lib in pool_info["libraries"]) assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set data["pool"] = pool_set.pop() - flowcell_set = set(lib["additional_information"]["flowcell"] for lib in pool_info["libraries"]) - assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + flowcell_set = set( + lib["additional_information"]["flowcell"] for lib in pool_info["libraries"] + ) + assert len(flowcell_set) == 1, ( + "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + ) data["flowcell_label"] = flowcell_set.pop()[2:] data["barcode_mapping"] = bc_to_library @@ -143,6 +157,7 @@ def build_barcode_to_library_lookup(pool_info, stats): return data + def summarize_by_sample(pool_info, stats): """ Stats is a list of observed cell barcodes & how many we saw / how well they @@ -152,6 +167,7 @@ def summarize_by_sample(pool_info, stats): For example, a library with barcode2 = "TTTAAGCG" will contain all cells that end with "_CGCTTAAA" (the reverse complement) """ + def build_barcode_to_sample_lookup(pool_info, stats): barcode_to_sample = {} for lib in pool_info["libraries"]: @@ -159,7 +175,6 @@ def build_barcode_to_sample_lookup(pool_info, stats): barcode_to_sample[bc] = lib["sample"] return barcode_to_sample - # Stub out keys data = { "barcode_mapping": {}, @@ -181,20 +196,24 @@ def build_barcode_to_sample_lookup(pool_info, stats): (_, _1, bc) = total_bc.split("_") if bc not in samples: samples[bc] = defaultdict(int) - for (k, v) in cell.items(): + for k, v in cell.items(): if k == "CB": continue 
samples[bc][k] += int(v) # Convert back to strings (ew) for bc in samples: - for (k, v) in samples[bc].items(): + for k, v in samples[bc].items(): samples[bc][k] = str(v) pool_set = set(lib["library_pool"] for lib in pool_info["libraries"]) assert len(pool_set) == 1, "Should have exactly 1 pool, instead: %s" % pool_set data["pool"] = pool_set.pop() - flowcell_set = set(lib["additional_information"]["flowcell"] for lib in pool_info["libraries"]) - assert len(flowcell_set) == 1, "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + flowcell_set = set( + lib["additional_information"]["flowcell"] for lib in pool_info["libraries"] + ) + assert len(flowcell_set) == 1, ( + "Pool should have exactly 1 flowcell, instead %s" % flowcell_set + ) data["flowcell_label"] = flowcell_set.pop()[2:] data["barcode_mapping"] = bc_to_sample @@ -202,6 +221,7 @@ def build_barcode_to_sample_lookup(pool_info, stats): return data + def main(): opts = parse_args() cfg = parse_pool_info(opts.pool_info_file) @@ -213,9 +233,12 @@ def main(): data = summarize_by_library(cfg, samples) data["summary_stats"] = parse_summary_stats(os.path.join(gene_dir, "Summary.csv")) - data["barcode_stats"] = parse_barcode_stats(os.path.join(opts.solo_dir, "Barcodes.stats")) + data["barcode_stats"] = parse_barcode_stats( + os.path.join(opts.solo_dir, "Barcodes.stats") + ) print(json.dumps(data)) + if __name__ == "__main__": main() diff --git a/processes/altseq/bin/analyze.py b/processes/altseq/bin/analyze.py index 31bd661f..f585c01b 100755 --- a/processes/altseq/bin/analyze.py +++ b/processes/altseq/bin/analyze.py @@ -6,6 +6,7 @@ import pathlib import pprint + def parse_args(): parser = argparse.ArgumentParser( prog="analyze.py", @@ -25,26 +26,32 @@ def parse_barcode_config(filename): cfg[barcode] = name return cfg + def parse_cellreads(filename): with open(filename) as f: return [*csv.DictReader(f, delimiter="\t")] + def write_sample(output_directory, sample): if not sample.get("name", None): # Skip barcodes not in our list return output = os.path.join(output_directory, ("%s.stats.txt" % sample["name"])) output_keys = [ - "cbMatch", "cbPerfect", - "exonic", "intronic", + "cbMatch", + "cbPerfect", + "exonic", + "intronic", "mito", - "genomeU", "genomeM", - "featureU", "featureM", + "genomeU", + "genomeM", + "featureU", + "featureM", "nGenesUnique", "exonicAS", "intronicAS", ] - with open(output, 'w') as f: + with open(output, "w") as f: for key in output_keys: if key in sample: f.write("%s\t%s\n" % (key, sample[key])) @@ -55,11 +62,12 @@ def main(): cfg = parse_barcode_config(opts.barcode_config_file) samples = parse_cellreads(opts.cellreads) for sample in samples: - sample['name'] = cfg.get(sample['CB'], None) + sample["name"] = cfg.get(sample["CB"], None) pathlib.Path(opts.output_directory).mkdir(parents=True, exist_ok=True) for sample in samples: write_sample(opts.output_directory, sample) + if __name__ == "__main__": main() diff --git a/processes/altseq/bin/generate_counts_json.py b/processes/altseq/bin/generate_counts_json.py index f92ff012..7b092470 100755 --- a/processes/altseq/bin/generate_counts_json.py +++ b/processes/altseq/bin/generate_counts_json.py @@ -4,9 +4,11 @@ import csv import os import pathlib + # import pprint import json + def parse_args(): parser = argparse.ArgumentParser( prog="generate_counts_json.py", @@ -17,11 +19,13 @@ def parse_args(): parser.add_argument("pool_name") return parser.parse_args() + def parse_tsv(filename): """Parses a TSV with header, return list of dicts""" with open(filename) as 
f: return [*csv.DictReader(f, delimiter="\t")] + def parse_linewise_stats(filename): """Parses a file with a name-value pair on each line, separated by whitespace""" d = {} @@ -31,6 +35,7 @@ def parse_linewise_stats(filename): d[key] = value return d + def parse_linewise_csv_stats(filename): """Parses a file with a name-value pair on each line, separated by comma""" d = {} @@ -51,8 +56,9 @@ def parse_barcode_config(filename): cfg[cell_barcode] = sample_name return cfg + def modify_sample_info(info): - """ Rewrite the sample stats a bit """ + """Rewrite the sample stats a bit""" # Keys to delete from the table deletes = [ "CB", @@ -69,6 +75,7 @@ def modify_sample_info(info): del out[old] return out + def get_sample_stats(opts): """ Gets per-sample stats from the CellReads.stats file @@ -78,34 +85,39 @@ def get_sample_stats(opts): sample_counts = parse_tsv(cellreads_path) sample_stats = { - #cfg.get(info['CB']): modify_sample_info(info) - info['CB']: modify_sample_info(info) + # cfg.get(info['CB']): modify_sample_info(info) + info["CB"]: modify_sample_info(info) for info in sample_counts - if info['CB'] in cfg + if info["CB"] in cfg } - #del sample_stats[None] + # del sample_stats[None] return sample_stats + def get_barcode_stats(opts): - """ Gets the stats about barcode mapping """ + """Gets the stats about barcode mapping""" barcode_path = os.path.join(opts.cellranger_directory, "..", "Barcodes.stats") return parse_linewise_stats(barcode_path) + def get_summary_stats(opts): - """ Gets the Summary stats produced by StarSOLO """ + """Gets the Summary stats produced by StarSOLO""" barcode_path = os.path.join(opts.cellranger_directory, "Summary.csv") return parse_linewise_csv_stats(barcode_path) + def get_library_pool_info(opts): - """ Gets the metadata about the library and pool """ + """Gets the metadata about the library and pool""" (flowcell, pool) = opts.pool_name.split("_") return {"flowcell_label": flowcell, "pool": pool} + def get_barcode_mapping(opts): - """ Returns the mapping of barcodes to sample names """ + """Returns the mapping of barcodes to sample names""" cfg = parse_barcode_config(opts.barcode_config_file) return cfg + def get_all_stats(opts): """ Return all the stats and metadata that this script gathers @@ -123,10 +135,11 @@ def get_all_stats(opts): def main(): - """ Run it all and write to stdout """ + """Run it all and write to stdout""" opts = parse_args() data = get_all_stats(opts) print(json.dumps(data)) + if __name__ == "__main__": main() diff --git a/processes/bwa/aggregate/plot_footprints.py b/processes/bwa/aggregate/plot_footprints.py index d4c89b00..45cb598c 100755 --- a/processes/bwa/aggregate/plot_footprints.py +++ b/processes/bwa/aggregate/plot_footprints.py @@ -16,18 +16,20 @@ # In[3]: -#get_ipython().magic(u'matplotlib inline') +# get_ipython().magic(u'matplotlib inline') import matplotlib -matplotlib.use('agg') + +matplotlib.use("agg") import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator import matplotlib.gridspec as gridspec from pylab import rcParams -rcParams['pdf.fonttype'] = 42 -#plt.switch_backend('agg') +rcParams["pdf.fonttype"] = 42 + +# plt.switch_backend('agg') # In[4]: @@ -45,52 +47,52 @@ r = np.array([dm.r[x] for x in xx]) p = np.array([dm.p[x] for x in xx]) -mu = p*r/(1.0-p) +mu = p * r / (1.0 - p) fit_mu = np.array([dm.fit_mu(x) for x in xx]) fit_r = np.array([dm.fit_r(x) for x in xx]) fig = plt.figure() -gs = gridspec.GridSpec(1, 2, wspace = 0.5) +gs = gridspec.GridSpec(1, 2, wspace=0.5) ax = fig.add_subplot(gs[0, 0]) 
-ax.plot(xx, mu, label = "mle fit") -#ax.plot(xx, fit_mu, label = "smoothed parameters fit") -ax.plot([0, 100], [0, 100], color = 'grey', ls = '--', zorder=-10) +ax.plot(xx, mu, label="mle fit") +# ax.plot(xx, fit_mu, label = "smoothed parameters fit") +ax.plot([0, 100], [0, 100], color="grey", ls="--", zorder=-10) ax.set_xlabel("Expected cleavages") ax.set_ylabel("Mean (mu) observed cleavages") [ax.spines[loc].set_color("none") for loc in ["top", "right"]] ax.xaxis.set_ticks_position("bottom") -ax.xaxis.set_tick_params(direction = "out") -ax.xaxis.set(major_locator = MaxNLocator(4)) +ax.xaxis.set_tick_params(direction="out") +ax.xaxis.set(major_locator=MaxNLocator(4)) ax.yaxis.set_ticks_position("left") -ax.yaxis.set_tick_params(direction = "out") -ax.yaxis.set(major_locator = MaxNLocator(4)) +ax.yaxis.set_tick_params(direction="out") +ax.yaxis.set(major_locator=MaxNLocator(4)) ax.legend() ax = fig.add_subplot(gs[0, 1]) -ax.plot(xx[1:], 1/r[1:]) -ax.plot(xx[1:], 1/fit_r[1:]) +ax.plot(xx[1:], 1 / r[1:]) +ax.plot(xx[1:], 1 / fit_r[1:]) ax.set_xlabel("Expected cleavages") ax.set_ylabel("1/r") [ax.spines[loc].set_color("none") for loc in ["top", "right"]] ax.xaxis.set_ticks_position("bottom") -ax.xaxis.set_tick_params(direction = "out") -ax.xaxis.set(major_locator = MaxNLocator(4)) +ax.xaxis.set_tick_params(direction="out") +ax.xaxis.set(major_locator=MaxNLocator(4)) ax.yaxis.set_ticks_position("left") -ax.yaxis.set_tick_params(direction = "out") -ax.yaxis.set(major_locator = MaxNLocator(4)) +ax.yaxis.set_tick_params(direction="out") +ax.yaxis.set(major_locator=MaxNLocator(4)) fig.set_size_inches(6, 2.5) @@ -100,59 +102,65 @@ # In[6]: -def step(arr, xaxis = False, interval = 0): +def step(arr, xaxis=False, interval=0): if xaxis and interval == 0: interval = abs(arr[1] - arr[0]) / 2.0 newarr = np.array(zip(arr - interval, arr + interval)).ravel() return newarr + def fill_between(arr, ax, **kwargs): - ax.fill_between(step(np.arange(arr.shape[0]), xaxis = True), step(np.zeros(arr.shape[0])), step(arr), **kwargs) - - -def make_density_fit_plot(i, dm, ax, lo = 0, hi = 125, include_poisson = False): - + ax.fill_between( + step(np.arange(arr.shape[0]), xaxis=True), + step(np.zeros(arr.shape[0])), + step(arr), + **kwargs, + ) + + +def make_density_fit_plot(i, dm, ax, lo=0, hi=125, include_poisson=False): xx = np.arange(lo, hi) - + mu = dm.fit_mu(i) r = dm.fit_r(i) - p = r/(r+mu) + p = r / (r + mu) - #ax.step(xx, dm.h[i, xx]/np.sum(dm.h[i,:]), label = "Observed cleavages") - fill_between(dm.h[i, xx]/np.sum(dm.h[i,:]), ax, facecolor='lightgrey', edgecolor='none') - - ax.plot(xx, scipy.stats.nbinom.pmf(xx, r, p), label = "Negative binomial fit") + # ax.step(xx, dm.h[i, xx]/np.sum(dm.h[i,:]), label = "Observed cleavages") + fill_between( + dm.h[i, xx] / np.sum(dm.h[i, :]), ax, facecolor="lightgrey", edgecolor="none" + ) + + ax.plot(xx, scipy.stats.nbinom.pmf(xx, r, p), label="Negative binomial fit") if include_poisson: - ax.plot(xx, scipy.stats.poisson.pmf(xx, i), label = "Poisson (lambda = %d)" % i) - + ax.plot(xx, scipy.stats.poisson.pmf(xx, i), label="Poisson (lambda = %d)" % i) + ax.set_xlabel("Cleavages") ax.set_ylabel("Density") [ax.spines[loc].set_color("none") for loc in ["top", "right"]] ax.xaxis.set_ticks_position("bottom") - ax.xaxis.set_tick_params(direction = "out") - ax.xaxis.set(major_locator = MaxNLocator(4)) - + ax.xaxis.set_tick_params(direction="out") + ax.xaxis.set(major_locator=MaxNLocator(4)) + ax.yaxis.set_ticks_position("left") - ax.yaxis.set_tick_params(direction = "out") - 
ax.yaxis.set(major_locator = MaxNLocator(4)) - + ax.yaxis.set_tick_params(direction="out") + ax.yaxis.set(major_locator=MaxNLocator(4)) # In[9]: fig = plt.figure() -gs = gridspec.GridSpec(1, 4, wspace = 0.5) +gs = gridspec.GridSpec(1, 4, wspace=0.5) for i, j in enumerate([5, 15, 25, 65]): - ax = fig.add_subplot(gs[0,i]) + ax = fig.add_subplot(gs[0, i]) make_density_fit_plot(j, dm, ax, include_poisson=True) ax.set_xlim(right=125) - -ax.legend() + +ax.legend() fig.set_size_inches(12, 2) @@ -164,38 +172,34 @@ def make_density_fit_plot(i, dm, ax, lo = 0, hi = 125, include_poisson = False): fig, ax = plt.subplots() -deltas = np.arange(0.5, 0, step = -0.1) +deltas = np.arange(0.5, 0, step=-0.1) -xx = np.arange(250, dtype = np.float, step = 10) +xx = np.arange(250, dtype=np.float, step=10) for delta in deltas: - y = dm.p_values(xx, xx*delta) - plt.plot(xx, np.log10(y), label = delta) + y = dm.p_values(xx, xx * delta) + plt.plot(xx, np.log10(y), label=delta) ax.set_xlabel("Expected cleavage") ax.set_ylabel("Significance (-log10 p-value) of depletion") [ax.spines[loc].set_color("none") for loc in ["top", "right"]] ax.xaxis.set_ticks_position("bottom") -ax.xaxis.set_tick_params(direction = "out") -ax.xaxis.set(major_locator = MaxNLocator(6)) +ax.xaxis.set_tick_params(direction="out") +ax.xaxis.set(major_locator=MaxNLocator(6)) ax.yaxis.set_ticks_position("left") -ax.yaxis.set_tick_params(direction = "out") -ax.yaxis.set(major_locator = MaxNLocator(4)) +ax.yaxis.set_tick_params(direction="out") +ax.yaxis.set(major_locator=MaxNLocator(4)) -ax.grid(axis = 'y') +ax.grid(axis="y") fig.set_size_inches(6, 3) -#ax.legend(bbox_to_anchor=(0., 1.05, 1., .105), loc=3, +# ax.legend(bbox_to_anchor=(0., 1.05, 1., .105), loc=3, # ncol=5, mode="expand", borderaxespad=0., title = "Cleavage depletion ratio (obs/exp) at nucleotide") plt.savefig("dispersion.power.analysis.pdf") # In[ ]: - - - - diff --git a/scripts/aggregatecollate.py b/scripts/aggregatecollate.py index b47c6450..fa77ff39 100644 --- a/scripts/aggregatecollate.py +++ b/scripts/aggregatecollate.py @@ -6,7 +6,7 @@ import requests import subprocess -sys.path.append('/home/audrakj/stamlims_api') +sys.path.append("/home/audrakj/stamlims_api") print(sys.path) from stamlims_api import rest @@ -15,7 +15,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_options = { "quiet": False, @@ -30,56 +30,105 @@ "dry_run": False, "aggregation_base_directory": None, "aggregation_directory": None, - "script_template": os.path.join(STAMPIPES, 'processes/fastq', 'collate_aggregation_fastq.bash'), + "script_template": os.path.join( + STAMPIPES, "processes/fastq", "collate_aggregation_fastq.bash" + ), } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS. Required if not in the environment under LIMS_API_URL") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. 
Required if not in the environment under LIMS_API_TOKEN") - parser.add_argument("--aggregation_base_directory", dest="aggregation_base_directory", - help="The base directory to put aggregations in. Can get from environment AGGREGATIONS variable") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") - parser.add_argument("--overwrite", dest="overwrite", action="store_true", - help="Create a new outfile instead of appending commands.") - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - parser.add_argument("--aggregation_directory", dest="aggregation_directory", - help="The directory for the aggregation. Will deduce if not given.") - parser.add_argument("-b", "--script_basename", dest="script_name", - help="Name of the script that goes after the sample name.") - - parser.add_argument("--tag", dest="tag", - help="Run for alignments tagged here.") - parser.add_argument("--aggregation", dest="aggregation_ids", type=int, action="append", - help="Run for these aggregations (can be used more than once).") - - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.") - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS. Required if not in the environment under LIMS_API_URL", + ) + parser.add_argument( + "-t", + "--token", + dest="token", + help="Your authentication token. Required if not in the environment under LIMS_API_TOKEN", + ) + parser.add_argument( + "--aggregation_base_directory", + dest="aggregation_base_directory", + help="The base directory to put aggregations in. Can get from environment AGGREGATIONS variable", + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this alignment to this file.", + ) + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + help="Create a new outfile instead of appending commands.", + ) + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + parser.add_argument( + "--aggregation_directory", + dest="aggregation_directory", + help="The directory for the aggregation. Will deduce if not given.", + ) + parser.add_argument( + "-b", + "--script_basename", + dest="script_name", + help="Name of the script that goes after the sample name.", + ) + + parser.add_argument("--tag", dest="tag", help="Run for alignments tagged here.") + parser.add_argument( + "--aggregation", + dest="aggregation_ids", + type=int, + action="append", + help="Run for these aggregations (can be used more than once).", + ) + + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.", + ) + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - - def __init__(self, args, aggregation_base_directory): - + def __init__(self, args, aggregation_base_directory): self.api = rest.setup_api() self.qsub_scriptname = args.script_name self.qsub_prefix = args.qsub_prefix @@ -91,17 +140,20 @@ def __init__(self, args, aggregation_base_directory): self.overwrite = args.overwrite def get_aggregation_info(self, aggregation_id): - results = self.api.single_result("aggregation/%d" % aggregation_id) if not results: - logging.error("Could not find information for aggregation %d" % aggregation_id) + logging.error( + "Could not find information for aggregation %d" % aggregation_id + ) return None return results def get_aggregation_lanes(self, aggregation_id): - results = self.api.list_result("aggregation_lane/?aggregation=%d&include=True" % aggregation_id) + results = self.api.list_result( + "aggregation_lane/?aggregation=%d&include=True" % aggregation_id + ) if not results: logging.error("Could not find lanes for aggregation %d" % aggregation_id) @@ -110,16 +162,27 @@ def get_aggregation_lanes(self, aggregation_id): return results def get_lane_fastq_file(self, aggregation_id, lane_id, file_purpose): - logging.info("Fetching files for alignment %d (Aggregation %d)" % (lane_id, aggregation_id)) - - results = files.get_object_files(self.api, object_id=lane_id, object_content_type=40, purpose_shorthand=file_purpose) + logging.info( + "Fetching files for alignment %d (Aggregation %d)" + % (lane_id, aggregation_id) + ) + + results = files.get_object_files( + self.api, + object_id=lane_id, + object_content_type=40, + purpose_shorthand=file_purpose, + ) if not results: logging.error("Improperly executed file query") return None if len(results) != 1: - logging.error("Found %d files for alignment %d, require 1 (Aggregation %d)" % (len(results), lane_id, aggregation_id)) + logging.error( + "Found %d files for alignment %d, require 1 (Aggregation %d)" + % (len(results), lane_id, aggregation_id) + ) logging.error(results) return None file_info = results[0] @@ -129,14 +192,16 @@ def get_lane_fastq_file(self, aggregation_id, lane_id, file_purpose): def get_library_info(self, aggregation_info): library_info = self.api.single_result(url=aggregation_info["library"]) if not library_info: - logging.critical("Cannot proceed without library! Could not get info from %s (Aggregation %d)" % (aggregation_info["library"], aggregation_info["id"])) + logging.critical( + "Cannot proceed without library! 
Could not get info from %s (Aggregation %d)" + % (aggregation_info["library"], aggregation_info["id"]) + ) sys.exit(1) return library_info def get_script_template(self, script_template): - logging.info("Using script template %s" % script_template) - return open(script_template, 'r').read() + return open(script_template, "r").read() def get_example_flowcell(self, aggregation_id, aggregation_lanes): included = None @@ -148,18 +213,23 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): lane = self.api.single_result(url=aggregation_lane["lane"]) if not lane: - logging.critical("Was not able to fetch lane %s (Aggregation %d)" % (aggregation_lane["lane"], aggregation_id)) + logging.critical( + "Was not able to fetch lane %s (Aggregation %d)" + % (aggregation_lane["lane"], aggregation_id) + ) sys.exit(1) flowcell = self.api.single_result(url=lane["flowcell"]) if not flowcell: - logging.critical("Could not get flowcell at %d (Aggregation %d)" % (lane["flowcell"], aggregation_id)) + logging.critical( + "Could not get flowcell at %d (Aggregation %d)" + % (lane["flowcell"], aggregation_id) + ) sys.exit(1) return flowcell def add_script(self, aggregation_id, aggregation_folder, library_number): - if self.overwrite: mode = "w" else: @@ -167,20 +237,34 @@ def add_script(self, aggregation_id, aggregation_folder, library_number): with open(self.outfile, mode) as runfile: runfile.write("cd %s && " % aggregation_folder) - runfile.write("qsub -V -cwd -S /bin/bash -N \"%sLN%d_AGG#%d\" %s\n\n" % (self.qsub_prefix, library_number, aggregation_id, self.qsub_scriptname)) + runfile.write( + 'qsub -V -cwd -S /bin/bash -N "%sLN%d_AGG#%d" %s\n\n' + % ( + self.qsub_prefix, + library_number, + aggregation_id, + self.qsub_scriptname, + ) + ) def get_aggregation_directory(self, aggregation): - return files.get_object_directories(self.api, object=aggregation, purpose_shorthand='aggregation-directory')[0]["path"] + return files.get_object_directories( + self.api, object=aggregation, purpose_shorthand="aggregation-directory" + )[0]["path"] def setup_tag(self, tag_slug): - query_arguments = {'content_type': 126, 'tag__slug': tag_slug} + query_arguments = {"content_type": 126, "tag__slug": tag_slug} - aggregation_tags = self.api.list_result(url_addition="tagged_object/", query_arguments=query_arguments) + aggregation_tags = self.api.list_result( + url_addition="tagged_object/", query_arguments=query_arguments + ) - [self.setup_aggregation(aggregation_tag["object_id"]) for aggregation_tag in aggregation_tags] + [ + self.setup_aggregation(aggregation_tag["object_id"]) + for aggregation_tag in aggregation_tags + ] def setup_aggregation(self, aggregation_id): - aggregation = self.get_aggregation_info(aggregation_id) if not aggregation: @@ -205,21 +289,34 @@ def setup_aggregation(self, aggregation_id): for aggregation_lane in aggregation_lanes: lane_id = int(aggregation_lane["lane"].strip("/").split("/")[-1]) if not aggregation_lane["include"]: - logging.info("Not including lane %s (Aggregation %d)" % (lane_id, aggregation_id)) + logging.info( + "Not including lane %s (Aggregation %d)" % (lane_id, aggregation_id) + ) continue alignment_endpoint = aggregation_lane["alignment"] if not alignment_endpoint: - logging.info("Not including lane %s because no alignment set (Aggregation %d)" % (lane_id, aggregation_id)) + logging.info( + "Not including lane %s because no alignment set (Aggregation %d)" + % (lane_id, aggregation_id) + ) alignment_id = int(alignment_endpoint.strip("/").split("/")[-1]) - r1_fastq = 
self.get_lane_fastq_file(aggregation_id, lane_id, 'r1-fastq') - r2_fastq = self.get_lane_fastq_file(aggregation_id, lane_id, 'r2-fastq') + r1_fastq = self.get_lane_fastq_file(aggregation_id, lane_id, "r1-fastq") + r2_fastq = self.get_lane_fastq_file(aggregation_id, lane_id, "r2-fastq") if not r1_fastq or not r2_fastq: - logging.critical("Missing either R1: %s or R2: %s for alignment %s for lane %s, skipping (Aggregation %d)" % - (str(r1_fastq), str(r2_fastq), alignment_endpoint, lane_id, aggregation_id)) + logging.critical( + "Missing either R1: %s or R2: %s for alignment %s for lane %s, skipping (Aggregation %d)" + % ( + str(r1_fastq), + str(r2_fastq), + alignment_endpoint, + lane_id, + aggregation_id, + ) + ) missing = True continue else: @@ -228,7 +325,8 @@ def setup_aggregation(self, aggregation_id): r1_files.append(r1_fastq) r2_files.append(r2_fastq) - if missing: return False + if missing: + return False script_contents = self.get_script_template(self.script_template) @@ -253,8 +351,12 @@ def setup_aggregation(self, aggregation_id): script.write("export AGGREGATION_ID=%d\n" % aggregation_id) script.write("export LIBRARY=%d\n" % library_info["number"]) script.write("export LIBRARY_NAME=LN%d\n" % library_info["number"]) - script.write("export R1_FILES=\"%s\"\n" % " ".join([r1_file[0] for r1_file in r1_files])) - script.write("export R2_FILES=\"%s\"\n" % " ".join([r2_file[0] for r2_file in r2_files])) + script.write( + 'export R1_FILES="%s"\n' % " ".join([r1_file[0] for r1_file in r1_files]) + ) + script.write( + 'export R2_FILES="%s"\n' % " ".join([r2_file[0] for r2_file in r2_files]) + ) script.write("export AGGREGATION_FOLDER=%s\n" % aggregation_folder) script.write(script_contents) @@ -263,9 +365,10 @@ def setup_aggregation(self, aggregation_id): self.add_script(aggregation_id, aggregation_folder, library_info["number"]) -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -295,6 +398,7 @@ def main(args = sys.argv): if poptions.tag: process.setup_tag(poptions.tag) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index 94c3d1ad..f2bb1fe1 100644 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -5,6 +5,7 @@ import logging import requests from collections import OrderedDict + try: from concurrent.futures import ThreadPoolExecutor except ImportError: @@ -12,7 +13,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_options = { "quiet": False, @@ -31,60 +32,115 @@ "script_template": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS. 
Required if not in the environment under LIMS_API_URL") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required if not in the environment under LIMS_API_TOKEN") - parser.add_argument("--aggregation_base_directory", dest="aggregation_base_directory", - help="The base directory to put aggregations in. Can get from environment AGGREGATIONS variable") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this aggregation to this file.") - parser.add_argument("--overwrite", dest="overwrite", action="store_true", - help="Create a new outfile instead of appending commands.") - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - parser.add_argument("--aggregation_directory", dest="aggregation_directory", - help="The directory for the aggregation. Will deduce if not given.") - parser.add_argument("-b", "--script_basename", dest="script_name", - help="Name of the script that goes after the sample name.") - - parser.add_argument("--tag", dest="tag", - help="Run for aggregations tagged here.") - parser.add_argument("--aggregation", dest="aggregation_ids", type=int, action="append", - help="Run for these aggregations (can be used more than once).") - parser.add_argument("--project", dest="project", - help="Run for aggregations in this project.") - parser.add_argument("--flowcell", dest="flowcell", - help="Run for aggregations in this flowcell.") - - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.") - parser.add_argument("--qsub-queue", dest="qsub_queue", - help="Name of the SLURM partition to use.") - parser.add_argument("--listout", dest="simple_output", help="Write only a list of alignments to run, rather than a script to submit them", action="store_true") - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS. Required if not in the environment under LIMS_API_URL", + ) + parser.add_argument( + "-t", + "--token", + dest="token", + help="Your authentication token. Required if not in the environment under LIMS_API_TOKEN", + ) + parser.add_argument( + "--aggregation_base_directory", + dest="aggregation_base_directory", + help="The base directory to put aggregations in. Can get from environment AGGREGATIONS variable", + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this aggregation to this file.", + ) + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + help="Create a new outfile instead of appending commands.", + ) + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + parser.add_argument( + "--aggregation_directory", + dest="aggregation_directory", + help="The directory for the aggregation. 
Will deduce if not given.", + ) + parser.add_argument( + "-b", + "--script_basename", + dest="script_name", + help="Name of the script that goes after the sample name.", + ) + + parser.add_argument("--tag", dest="tag", help="Run for aggregations tagged here.") + parser.add_argument( + "--aggregation", + dest="aggregation_ids", + type=int, + action="append", + help="Run for these aggregations (can be used more than once).", + ) + parser.add_argument( + "--project", dest="project", help="Run for aggregations in this project." + ) + parser.add_argument( + "--flowcell", dest="flowcell", help="Run for aggregations in this flowcell." + ) + + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.", + ) + parser.add_argument( + "--qsub-queue", dest="qsub_queue", help="Name of the SLURM partition to use." + ) + parser.add_argument( + "--listout", + dest="simple_output", + help="Write only a list of alignments to run, rather than a script to submit them", + action="store_true", + ) + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - def __init__(self, args, api_url, token, aggregation_base_directory): - self.token = token self.api_url = api_url self.qsub_scriptname = args.script_name @@ -98,7 +154,7 @@ def __init__(self, args, api_url, token, aggregation_base_directory): self.overwrite = args.overwrite self.session = requests.Session() - self.session.headers.update({'Authorization': "Token %s" % self.token}) + self.session.headers.update({"Authorization": "Token %s" % self.token}) self.simple_output = args.simple_output self.pool = ThreadPoolExecutor(max_workers=10) @@ -107,9 +163,8 @@ def __init__(self, args, api_url, token, aggregation_base_directory): os.remove(self.outfile) def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = self.session.get(url) @@ -122,7 +177,6 @@ def api_single_result(self, url_addition=None, url=None): return None def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -130,7 +184,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = self.session.get(url) @@ -148,17 +201,20 @@ def api_list_result(self, url_addition=None, url=None): return results def get_aggregation_info(self, aggregation_id): - results = self.api_single_result("aggregation/%d" % aggregation_id) if not results: - logging.error("Could not find information for aggregation %d" % aggregation_id) + logging.error( + "Could not find information for aggregation %d" % aggregation_id + ) return None return results def set_aggregation_folder(self, aggregation_info, library_info): - dir_name = os.path.join("LN%d" % library_info["number"], "aggregation-%d" % aggregation_info["id"]) + dir_name = os.path.join( + "LN%d" % library_info["number"], "aggregation-%d" % aggregation_info["id"] + ) share_dir = aggregation_info.get("project_share_directory") if share_dir: return os.path.join(share_dir, "aggregations", dir_name) @@ -166,16 +222,23 @@ def set_aggregation_folder(self, aggregation_info, library_info): if 
self.aggregation_base_directory: return os.path.join(self.aggregation_base_directory, dir_name) - url = "directory/?purpose__slug=all-alignments-bam&content_type=%d&object_id=%d" % (aggregation_info["object_content_type"], aggregation_info["id"]) + url = ( + "directory/?purpose__slug=all-alignments-bam&content_type=%d&object_id=%d" + % (aggregation_info["object_content_type"], aggregation_info["id"]) + ) results = self.api_list_result(url) if len(results) > 1: - logging.error("Found %d folders for aggregation %d, require 1" % (len(results), aggregation_info["id"])) + logging.error( + "Found %d folders for aggregation %d, require 1" + % (len(results), aggregation_info["id"]) + ) if len(results) == 0: - if not self.aggregation_base_directory: - logging.critical("Connot proceed, no directory set and no base aggregation directory given.") + logging.critical( + "Connot proceed, no directory set and no base aggregation directory given." + ) sys.exit(1) purpose_url = "file_purpose/?slug=all-alignments-bam" @@ -192,7 +255,8 @@ def set_aggregation_folder(self, aggregation_info, library_info): logging.info("Setting aggregation folder to %s" % path) data = { - "content_type": "%s/content_type/%d/" % (self.api_url, aggregation_info["object_content_type"]), + "content_type": "%s/content_type/%d/" + % (self.api_url, aggregation_info["object_content_type"]), "object_id": aggregation_info["id"], "path": path, "purpose": file_purpose, @@ -202,7 +266,10 @@ def set_aggregation_folder(self, aggregation_info, library_info): if not new_result.ok: logging.critical(new_result) - logging.critical("Could not upload new aggregation folder path to LIMS: %s" % json.dumps(data)) + logging.critical( + "Could not upload new aggregation folder path to LIMS: %s" + % json.dumps(data) + ) sys.exit(1) return path @@ -210,7 +277,9 @@ def set_aggregation_folder(self, aggregation_info, library_info): return results[0]["path"] def get_aggregation_lanes(self, aggregation_id): - results = self.api_list_result("aggregation_lane/?aggregation=%d&include=True" % aggregation_id) + results = self.api_list_result( + "aggregation_lane/?aggregation=%d&include=True" % aggregation_id + ) if not results: logging.error("Could not find lanes for aggregation %d" % aggregation_id) @@ -219,11 +288,16 @@ def get_aggregation_lanes(self, aggregation_id): return results def get_lane_alignments_file(self, aggregation_id, alignment_id): - - results = self.api_list_result("file/?purpose__slug=all-alignments-bam&content_type=47&object_id=%d" % alignment_id) + results = self.api_list_result( + "file/?purpose__slug=all-alignments-bam&content_type=47&object_id=%d" + % alignment_id + ) if len(results) != 1: - logging.error("Found %d files for alignment %d, require 1 (Aggregation %d)" % (len(results), alignment_id, aggregation_id)) + logging.error( + "Found %d files for alignment %d, require 1 (Aggregation %d)" + % (len(results), alignment_id, aggregation_id) + ) logging.error(results) return None file_info = results[0] @@ -231,22 +305,32 @@ def get_lane_alignments_file(self, aggregation_id, alignment_id): return (file_info["path"], file_info["md5sum"]) def get_trimmed_fastq_r1(self, aggregation_id, alignment_id): - - results = self.api_list_result("file/?purpose__slug=r1-fastq-trimmed&filetype__slug=gzipped-fastq&content_type=47&object_id=%d" % alignment_id) + results = self.api_list_result( + "file/?purpose__slug=r1-fastq-trimmed&filetype__slug=gzipped-fastq&content_type=47&object_id=%d" + % alignment_id + ) if len(results) != 1: - logging.error("Found %d 
trimmed FQ files for alignment %d, require 1 (Aggregation %d)" % (len(results), alignment_id, aggregation_id)) + logging.error( + "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)" + % (len(results), alignment_id, aggregation_id) + ) logging.error(results) return None file_info = results[0] return (file_info["path"], file_info["md5sum"]) def get_trimmed_fastq_r2(self, aggregation_id, alignment_id): - - results= self.api_list_result("file/?purpose__slug=r2-fastq-trimmed&filetype__slug=gzipped-fastq&content_type=47&object_id=%d" % alignment_id) + results = self.api_list_result( + "file/?purpose__slug=r2-fastq-trimmed&filetype__slug=gzipped-fastq&content_type=47&object_id=%d" + % alignment_id + ) if len(results) != 1: - logging.error("Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)" % (len(results), alignment_id, aggregation_id)) + logging.error( + "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)" + % (len(results), alignment_id, aggregation_id) + ) logging.error(results) return None file_info = results[0] @@ -255,21 +339,31 @@ def get_trimmed_fastq_r2(self, aggregation_id, alignment_id): def get_library_info(self, aggregation_info): library_info = self.api_single_result(url=aggregation_info["library"]) if not library_info: - logging.critical("Cannot proceed without library! Could not get info from %s (Aggregation %d)" % (aggregation_info["library"], aggregation_info["id"])) + logging.critical( + "Cannot proceed without library! Could not get info from %s (Aggregation %d)" + % (aggregation_info["library"], aggregation_info["id"]) + ) sys.exit(1) return library_info def get_sample_info(self, aggregation_info): - sample_info = self.api_single_result(url=aggregation_info['library_details']["sample"]) + sample_info = self.api_single_result( + url=aggregation_info["library_details"]["sample"] + ) if not sample_info: - logging.critical("Cannot proceed without sample! Could not get info from %s (Aggregation %d)" % (aggregation_info["sample"], aggregation_info["id"])) + logging.critical( + "Cannot proceed without sample! Could not get info from %s (Aggregation %d)" + % (aggregation_info["sample"], aggregation_info["id"]) + ) sys.exit(1) return sample_info def get_genome_index(self, aggregation_info): genome_info = self.api_single_result(url=aggregation_info["genome_index"]) if not genome_info: - logging.critical("Could not get genome info! (Aggregation %d)" % aggregation_info["id"]) + logging.critical( + "Could not get genome info! (Aggregation %d)" % aggregation_info["id"] + ) sys.exit(1) return genome_info @@ -277,34 +371,50 @@ def get_genome_index_location(self, aggregation_id, aggregation_lanes): included = None for aggregation_lane in aggregation_lanes: if aggregation_lane["include"]: - included = aggregation_lane - break + included = aggregation_lane + break if not "alignment" in aggregation_lane or not aggregation_lane["alignment"]: - logging.critical("No alignment set for included aggregation lane %s" % str(aggregation_lane)) + logging.critical( + "No alignment set for included aggregation lane %s" + % str(aggregation_lane) + ) sys.exit(1) alignment = self.api_single_result(url=aggregation_lane["alignment"]) if not alignment: - logging.critical("Was not able to fetch alignment %s! (Aggregation %d)" % (aggregation_lane["alignment"], aggregation_id)) + logging.critical( + "Was not able to fetch alignment %s! 
(Aggregation %d)" + % (aggregation_lane["alignment"], aggregation_id) + ) sys.exit(1) genome_location = self.api_single_result(url=alignment["genome_index_location"]) if not genome_location: - logging.critical("Could not get genome location from alignment %d! (Aggregation %d)" % (included["id"], aggregation_id)) + logging.critical( + "Could not get genome location from alignment %d! (Aggregation %d)" + % (included["id"], aggregation_id) + ) sys.exit(1) - return os.path.join(genome_location["base_path"], genome_location["directory"], genome_location["filename"]) - - def get_script_template(self, aggregation_id, process_template_url, script_template=None): + return os.path.join( + genome_location["base_path"], + genome_location["directory"], + genome_location["filename"], + ) + def get_script_template( + self, aggregation_id, process_template_url, script_template=None + ): if script_template: logging.info("Using script template %s" % script_template) - return (open(script_template, 'r').read(), None) + return (open(script_template, "r").read(), None) if not process_template_url: - logging.critical("No process template for aggregation %d\n" % aggregation_id) + logging.critical( + "No process template for aggregation %d\n" % aggregation_id + ) return None logging.info("Getting process template %s" % process_template_url) @@ -312,11 +422,15 @@ def get_script_template(self, aggregation_id, process_template_url, script_templ process_template = self.api_single_result(url=process_template_url) if not process_template: - logging.critical("Could not find processing template for %s\n" % process_template_url) + logging.critical( + "Could not find processing template for %s\n" % process_template_url + ) return None - script_path = os.path.expandvars(process_template["process_version"]["script_location"]) - return (open(script_path, 'r').read(), process_template) + script_path = os.path.expandvars( + process_template["process_version"]["script_location"] + ) + return (open(script_path, "r").read(), process_template) def get_example_flowcell(self, aggregation_id, aggregation_lanes): included = None @@ -328,12 +442,18 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): lane = self.api_single_result(url=aggregation_lane["lane"]) if not lane: - logging.critical("Was not able to fetch lane %s (Aggregation %d)" % (aggregation_lane["lane"], aggregation_id)) + logging.critical( + "Was not able to fetch lane %s (Aggregation %d)" + % (aggregation_lane["lane"], aggregation_id) + ) sys.exit(1) flowcell = self.api_single_result(url=lane["flowcell"]) if not flowcell: - logging.critical("Could not get flowcell at %s (Aggregation %d)" % (lane["flowcell"], aggregation_id)) + logging.critical( + "Could not get flowcell at %s (Aggregation %d)" + % (lane["flowcell"], aggregation_id) + ) sys.exit(1) return flowcell @@ -344,11 +464,17 @@ def get_all_flowcell_paired(self, aggregation_id, aggregation_lanes): if aggregation_lane["include"]: lane = self.api_single_result(url=aggregation_lane["lane"]) if not lane: - logging.critical("Was not able to fetch lane %s (Aggregation %d)" % (aggregation_lane["lane"], aggregation_id)) + logging.critical( + "Was not able to fetch lane %s (Aggregation %d)" + % (aggregation_lane["lane"], aggregation_id) + ) sys.exit(1) flowcell = self.api_single_result(url=lane["flowcell"]) if not flowcell: - logging.critical("Could not get flowcell at %s (Aggregation %d)" % (lane["flowcell"], aggregation_id)) + logging.critical( + "Could not get flowcell at %s (Aggregation %d)" + % 
(lane["flowcell"], aggregation_id) + ) sys.exit(1) if not flowcell["paired_end"]: paired_ended = None @@ -369,26 +495,46 @@ def add_script(self, aggregation_id, aggregation_folder, library_number): runfile.write("%s/%s\n" % (aggregation_folder, self.qsub_scriptname)) else: runfile.write("cd %s && " % aggregation_folder) - fullname = "%sLN%d_AGG#%d" % (self.qsub_prefix, library_number, aggregation_id) - runfile.write("sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=2000 --parsable --oversubscribe <<__AGG__\n#!/bin/bash\nbash %s\n__AGG__\n\n" % (fullname, fullname, fullname, self.qsub_queue, self.qsub_scriptname)) + fullname = "%sLN%d_AGG#%d" % ( + self.qsub_prefix, + library_number, + aggregation_id, + ) + runfile.write( + "sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=2000 --parsable --oversubscribe <<__AGG__\n#!/bin/bash\nbash %s\n__AGG__\n\n" + % ( + fullname, + fullname, + fullname, + self.qsub_queue, + self.qsub_scriptname, + ) + ) def setup_tag(self, tag_slug): + aggregation_tags = self.api_list_result( + "tagged_object?content_type=126&tag__slug=%s" % tag_slug + ) - aggregation_tags = self.api_list_result("tagged_object?content_type=126&tag__slug=%s" % tag_slug) - - self.setup_aggregations([aggregation_tag["object_id"] for aggregation_tag in aggregation_tags]) + self.setup_aggregations( + [aggregation_tag["object_id"] for aggregation_tag in aggregation_tags] + ) def setup_project(self, project_id): logging.info("Setting up project #%s" % project_id) - aggregations = self.api_list_result("aggregation/?library__sample__project=%s" % project_id) - self.setup_aggregations([a['id'] for a in aggregations]) + aggregations = self.api_list_result( + "aggregation/?library__sample__project=%s" % project_id + ) + self.setup_aggregations([a["id"] for a in aggregations]) def setup_flowcell(self, flowcell_label): logging.info("Setting up flowcell %s" % flowcell_label) - aggregations = self.api_list_result("aggregation/?in_flowcell=%s" % flowcell_label) + aggregations = self.api_list_result( + "aggregation/?in_flowcell=%s" % flowcell_label + ) if not aggregations: - logging.error("%s has no aggregations" % flowcell_label) - self.setup_aggregations([a['id'] for a in aggregations]) + logging.error("%s has no aggregations" % flowcell_label) + self.setup_aggregations([a["id"] for a in aggregations]) def setup_aggregations(self, aggregation_ids): # Deduplicate aggregations so we don't write the same one out twice @@ -402,20 +548,19 @@ def try_setup(agg_id): self.setup_aggregation(agg_id) except Exception: logging.exception("Something went wrong for AG%d" % agg_id) + list(self.pool.map(try_setup, aggregation_ids)) def setup_aggregation(self, aggregation_id): - aggregation = self.get_aggregation_info(aggregation_id) if not aggregation: return False - if aggregation['locked']: + if aggregation["locked"]: logging.warn("Refusing to set up locked aggregation %d" % (aggregation_id)) return False - aggregation_lanes = self.get_aggregation_lanes(aggregation_id) if not aggregation_lanes: @@ -425,7 +570,9 @@ def setup_aggregation(self, aggregation_id): sample_info = self.get_sample_info(aggregation) aggregation_folder = self.set_aggregation_folder(aggregation, library_info) genome_index = self.get_genome_index(aggregation) - genome_index_location = self.get_genome_index_location(aggregation_id, aggregation_lanes) + genome_index_location = self.get_genome_index_location( + aggregation_id, 
aggregation_lanes + ) flowcell = self.get_example_flowcell(aggregation_id, aggregation_lanes) paired = self.get_all_flowcell_paired(aggregation_id, aggregation_lanes) @@ -439,12 +586,18 @@ def setup_aggregation(self, aggregation_id): files = [] for aggregation_lane in aggregation_lanes: if not aggregation_lane["include"]: - logging.info("Not including lane %s (Aggregation %d)" % (aggregation_lane["lane"], aggregation_id)) + logging.info( + "Not including lane %s (Aggregation %d)" + % (aggregation_lane["lane"], aggregation_id) + ) continue alignment_endpoint = aggregation_lane["alignment"] if not alignment_endpoint: - logging.info("Not including lane %s because no alignment set (Aggregation %d)" % (aggregation_lane["lane"], aggregation_id)) + logging.info( + "Not including lane %s because no alignment set (Aggregation %d)" + % (aggregation_lane["lane"], aggregation_id) + ) missing = True continue @@ -453,7 +606,10 @@ def setup_aggregation(self, aggregation_id): bamfile = self.get_lane_alignments_file(aggregation_id, alignment_id) if not bamfile: - logging.critical("No BAM alignment file for alignment %s for lane %s, skipping (Aggregation %d)" % (alignment_endpoint, aggregation_lane["lane"], aggregation_id)) + logging.critical( + "No BAM alignment file for alignment %s for lane %s, skipping (Aggregation %d)" + % (alignment_endpoint, aggregation_lane["lane"], aggregation_id) + ) missing = True continue else: @@ -463,13 +619,17 @@ def setup_aggregation(self, aggregation_id): if missing: return False - (script_contents, process_template) = self.get_script_template(aggregation_id, aggregation["aggregation_process_template"], self.script_template) + (script_contents, process_template) = self.get_script_template( + aggregation_id, + aggregation["aggregation_process_template"], + self.script_template, + ) if not script_contents: logging.critical("No script contents") return - kit_method = self.api_single_result(url=library_info['kit_method']) + kit_method = self.api_single_result(url=library_info["kit_method"]) env_vars = OrderedDict() @@ -492,23 +652,32 @@ def setup_aggregation(self, aggregation_id): env_vars["UMI"] = None # Set process template env var overrides - if process_template and 'process_variables' in process_template and process_template['process_variables']: + if ( + process_template + and "process_variables" in process_template + and process_template["process_variables"] + ): try: - process_template_variables = json.loads(process_template['process_variables'], - object_pairs_hook=OrderedDict) + process_template_variables = json.loads( + process_template["process_variables"], object_pairs_hook=OrderedDict + ) for var, value in process_template_variables.items(): env_vars[var] = value except ValueError as e: - logging.error("Could not parse process variables for aggregation %d (template %d): '%s'" % - ( - aggregation_id, - self.script_template['id'], - self.script_template['process_variables'] - )) + logging.error( + "Could not parse process variables for aggregation %d (template %d): '%s'" + % ( + aggregation_id, + self.script_template["id"], + self.script_template["process_variables"], + ) + ) return False - logging.debug("Environment Variables:\n%s" % - "\n".join([ "\t%s=%s" % (e,env_vars[e]) for e in env_vars])) + logging.debug( + "Environment Variables:\n%s" + % "\n".join(["\t%s=%s" % (e, env_vars[e]) for e in env_vars]) + ) script_file = os.path.join(aggregation_folder, self.qsub_scriptname) if self.dry_run: @@ -531,7 +700,7 @@ def setup_aggregation(self, aggregation_id): # Set 
env vars for var, value in env_vars.items(): if value is not None: - script.write("export %s=\"%s\"\n" % (var, value)) + script.write('export %s="%s"\n' % (var, value)) else: script.write("unset %s\n" % var) @@ -547,9 +716,10 @@ def setup_aggregation(self, aggregation_id): return True -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -600,6 +770,7 @@ def main(args = sys.argv): if poptions.flowcell: process.setup_flowcell(poptions.flowcell) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 82f40b71..1ec5cb24 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -6,6 +6,7 @@ import requests import textwrap from collections import OrderedDict + try: from concurrent.futures import ThreadPoolExecutor except ImportError: @@ -13,7 +14,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_options = { "quiet": False, @@ -35,66 +36,129 @@ "auto_aggregate": False, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - - parser.add_argument("--alignment", dest="align_ids", type=int, action="append", - help="Run for this particular alignment.") - parser.add_argument("--flowcell", dest="flowcell_label", - help="Run for this particular flowcell label.") - parser.add_argument("--tag", dest="tag", - help="Run for alignments tagged here.") - parser.add_argument("--project", dest="project", - help="Run for alignments in this project.") - - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - parser.add_argument("--qsub_priority", dest="qsub_priority", type=int, - help="The priority to give scripts we are submitting.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") - parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.") - parser.add_argument("--qsub-queue", dest="qsub_queue", - help="Name of the SLURM partition") - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="Don't use any barcode mask.") - parser.add_argument("--bases_mask", dest="bases_mask", - help="Set a bases mask.") - parser.add_argument("--redo_completed", dest="redo_completed", help="Redo alignments marked as completed.", - action="store_true") - parser.add_argument("--auto_aggregate", dest="auto_aggregate", help="Script created will also run auto-aggregations after alignments finished.", - action="store_true") - parser.add_argument("--align_base_dir", dest="align_base_dir", help="Create the alignment directory in this directory") - - parser.add_argument("--listout", dest="simple_output", help="Write only a list of alignments to run, rather than a script to submit them", action="store_true") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + + parser.add_argument( + "--alignment", + dest="align_ids", + type=int, + action="append", + help="Run for this particular alignment.", + ) + parser.add_argument( + "--flowcell", + dest="flowcell_label", + help="Run for this particular flowcell label.", + ) + parser.add_argument("--tag", dest="tag", help="Run for alignments tagged here.") + parser.add_argument( + "--project", dest="project", help="Run for alignments in this project." + ) + + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + parser.add_argument( + "--qsub_priority", + dest="qsub_priority", + type=int, + help="The priority to give scripts we are submitting.", + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this alignment to this file.", + ) + parser.add_argument( + "-b", + "--sample-script-basename", + dest="sample_script_basename", + help="Name of the script that goes after the sample name.", + ) + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.", + ) + parser.add_argument( + "--qsub-queue", dest="qsub_queue", help="Name of the SLURM partition" + ) + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + parser.add_argument( + "--no-mask", + dest="no_mask", + action="store_true", + help="Don't use any barcode mask.", + ) + parser.add_argument("--bases_mask", dest="bases_mask", help="Set a bases mask.") + parser.add_argument( + "--redo_completed", + dest="redo_completed", + help="Redo alignments marked as completed.", + action="store_true", + ) + parser.add_argument( + "--auto_aggregate", + dest="auto_aggregate", + help="Script created will also run auto-aggregations after alignments finished.", + action="store_true", + ) + parser.add_argument( + "--align_base_dir", + dest="align_base_dir", + help="Create the alignment directory in this directory", + ) + + parser.add_argument( + "--listout", + dest="simple_output", + help="Write only a list of alignments to run, rather than a script to submit them", + action="store_true", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - def __init__(self, args, api_url, token): - self.token = token self.api_url = api_url self.qsub_scriptname = args.sample_script_basename @@ -112,14 +176,13 @@ def __init__(self, args, api_url, token): self.simple_output = args.simple_output self.session = requests.Session() - self.session.headers.update({'Authorization': "Token %s" % self.token}) + self.session.headers.update({"Authorization": "Token %s" % self.token}) self.pool = ThreadPoolExecutor(max_workers=10) def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = self.session.get(url) @@ -132,7 +195,6 @@ def api_single_result(self, url_addition=None, url=None): return None def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -140,7 +202,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = self.session.get(url) @@ -158,18 +219,20 @@ def api_list_result(self, url_addition=None, url=None): return results def get_align_process_info(self, alignment_id): - - process_info = self.api_single_result("flowcell_lane_alignment/%d/processing_information/" % alignment_id) + process_info = self.api_single_result( + "flowcell_lane_alignment/%d/processing_information/" % alignment_id + ) if not process_info: - logging.critical("Could not find processing info for alignment %d\n" % alignment_id) + logging.critical( + "Could not find processing info for alignment %d\n" % alignment_id + ) logging.critical(process_info) sys.exit(1) return process_info def get_process_template(self, align_id, process_template_id): - if not process_template_id: logging.critical("No process template for alignment %d\n" % align_id) return None @@ -177,7 +240,9 @@ def get_process_template(self, align_id, process_template_id): info = self.api_single_result("process_template/%d/" % (process_template_id)) if not info: - logging.critical("Could not find processing template for ID %d\n" % process_template_id) + logging.critical( + "Could not find processing template for ID %d\n" % process_template_id + ) sys.exit(1) return info @@ -191,12 +256,13 @@ def 
setup_alignments(self, align_ids): logging.debug("ALN%d result received, OK" % id) def setup_alignment(self, align_id): - try: processing_info = self.get_align_process_info(align_id) - alignment = self.api_single_result("flowcell_lane_alignment/%d/" % (align_id)) + alignment = self.api_single_result( + "flowcell_lane_alignment/%d/" % (align_id) + ) - if self.redo_completed or not alignment['complete_time']: + if self.redo_completed or not alignment["complete_time"]: self.create_script(processing_info, alignment["id"]) return (align_id, None) else: @@ -207,7 +273,9 @@ def setup_alignment(self, align_id): return (align_id, e) def get_lane_file(self, lane_id, purpose): - candidates = self.api_list_result("file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id)) + candidates = self.api_list_result( + "file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id) + ) if not candidates: return None @@ -217,27 +285,33 @@ def get_lane_file(self, lane_id, purpose): return candidates[0] def setup_tag(self, tag_slug): - - align_tags = self.api_list_result("tagged_object/?content_type=47&tag__slug=%s" % tag_slug) + align_tags = self.api_list_result( + "tagged_object/?content_type=47&tag__slug=%s" % tag_slug + ) self.setup_alignments([align_tag["object_id"] for align_tag in align_tags]) def setup_project(self, project_id): logging.info("Setting up project #%s" % project_id) - alignments = self.api_list_result("flowcell_lane_alignment/?lane__sample__project=%s" % project_id) + alignments = self.api_list_result( + "flowcell_lane_alignment/?lane__sample__project=%s" % project_id + ) self.setup_alignments([alignment["id"] for alignment in alignments]) def setup_flowcell(self, flowcell_label): logging.info("Setting up flowcell for %s" % flowcell_label) - alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + alignments = self.api_list_result( + "flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" + % flowcell_label + ) if self.auto_aggregate: for alignment in alignments: self.setup_alignment(alignment["id"]) - self.auto_aggregation_script(flowcell_label,alignments) + self.auto_aggregation_script(flowcell_label, alignments) else: self.setup_alignments([alignment["id"] for alignment in alignments]) - def auto_aggregation_script(self,flowcell_label,alignments): + def auto_aggregation_script(self, flowcell_label, alignments): aaname_sentinel = "auto_agg_sentinel.%s" % (flowcell_label) if not self.outfile: @@ -245,9 +319,10 @@ def auto_aggregation_script(self,flowcell_label,alignments): outfile = sys.stdout else: logging.debug("Logging script to %s" % self.outfile) - outfile = open(self.outfile, 'a') + outfile = open(self.outfile, "a") - contents = textwrap.dedent("""\ + contents = textwrap.dedent( + """\ cd "$FLOWCELLS"/FC{label}_* sentinel_dependencies=$(echo $PROCESSING | sed -e 's/,/,afterany:/g' | sed -e 's/^,afterany/--dependency=afterany/g') sbatch --export=ALL -J {job_name} -o {job_name}.o%A -e {job_name}.e%A --partition={queue} --cpus-per-task=1 --ntasks=1 $sentinel_dependencies --mem-per-cpu=1000 --parsable --oversubscribe <<__AUTOAGG1__ @@ -256,16 +331,18 @@ def auto_aggregation_script(self,flowcell_label,alignments): python $STAMPIPES/scripts/aggregateprocess.py --flowcell {label} --outfile run_aggregations.bash --qsub-queue {qqueue} bash run_aggregations.bash __AUTOAGG1__ - """.format(label=flowcell_label, - job_name=aaname_sentinel, - queue=self.qsub_queue, - qqueue=self.qsub_queue)) + 
""".format( + label=flowcell_label, + job_name=aaname_sentinel, + queue=self.qsub_queue, + qqueue=self.qsub_queue, + ) + ) outfile.write(contents) outfile.close() def add_script(self, align_id, processing_info, script_file, sample_name): - ram_megabytes = 2000 if not self.outfile: @@ -273,26 +350,41 @@ def add_script(self, align_id, processing_info, script_file, sample_name): outfile = sys.stdout else: logging.debug("Logging script to %s" % self.outfile) - outfile = open(self.outfile, 'a') + outfile = open(self.outfile, "a") if self.simple_output: outfile.write(script_file + "\n") else: outfile.write("cd %s && " % os.path.dirname(script_file)) - fullname = "%s%s-%s-ALIGN#%d" % (self.qsub_prefix,sample_name,processing_info['flowcell']['label'],align_id) - outfile.write("jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING=\"$PROCESSING,$jobid\"\n\n" % (fullname, fullname, fullname, self.qsub_queue, ram_megabytes, script_file)) + fullname = "%s%s-%s-ALIGN#%d" % ( + self.qsub_prefix, + sample_name, + processing_info["flowcell"]["label"], + align_id, + ) + outfile.write( + 'jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING="$PROCESSING,$jobid"\n\n' + % ( + fullname, + fullname, + fullname, + self.qsub_queue, + ram_megabytes, + script_file, + ) + ) outfile.close() def get_script_template(self, process_template): - if self.script_template: script_path = self.script_template else: - script_path = os.path.expandvars(process_template["process_version"]["script_location"]) - return open(script_path, 'r').read() + script_path = os.path.expandvars( + process_template["process_version"]["script_location"] + ) + return open(script_path, "r").read() def create_script(self, processing_info, align_id): - lane = processing_info["libraries"][0] alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] @@ -300,29 +392,46 @@ def create_script(self, processing_info, align_id): logging.error("Alignment %d has no process template" % align_id) return False - process_template = self.get_process_template(align_id, alignment["process_template"]) + process_template = self.get_process_template( + align_id, alignment["process_template"] + ) if not process_template: return False - flowcell_directory = processing_info['flowcell']['directory'] + flowcell_directory = processing_info["flowcell"]["directory"] share_dir = lane.get("project_share_directory") if share_dir: flowcell_directory = os.path.join(share_dir, "alignments") if not flowcell_directory: - logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) + logging.error( + "Alignment %d has no flowcell directory for flowcell %s" + % (align_id, processing_info["flowcell"]["label"]) + ) return False - fastq_directory = os.path.join(flowcell_directory, "Project_%s" % lane['project'], "Sample_%s" % lane['samplesheet_name']) + fastq_directory = os.path.join( + flowcell_directory, + "Project_%s" % lane["project"], + "Sample_%s" % lane["samplesheet_name"], + ) # Reset the alignment's sample name if we decied not to use the barcode index mask if self.no_mask: - alignment['sample_name'] = "%s_%s_L00%d" % (lane['samplesheet_name'], lane['barcode_index'], lane['lane']) - - align_dir = 
"align_%d_%s_%s" % (alignment['id'], alignment['genome_index'], alignment['aligner']) - if alignment['aligner_version']: - align_dir = "%s-%s" % (align_dir, alignment['aligner_version']) + alignment["sample_name"] = "%s_%s_L00%d" % ( + lane["samplesheet_name"], + lane["barcode_index"], + lane["lane"], + ) + + align_dir = "align_%d_%s_%s" % ( + alignment["id"], + alignment["genome_index"], + alignment["aligner"], + ) + if alignment["aligner_version"]: + align_dir = "%s-%s" % (align_dir, alignment["aligner_version"]) script_directory = os.path.join(fastq_directory, align_dir) if self.align_base_dir: @@ -331,83 +440,105 @@ def create_script(self, processing_info, align_id): r1_fastq = self.get_lane_file(lane["id"], "r1-fastq") if not r1_fastq: - logging.error("Missing r1-fastq for lane %d (alignment %d) - check dir %s" % (lane["id"], alignment["id"], fastq_directory)) + logging.error( + "Missing r1-fastq for lane %d (alignment %d) - check dir %s" + % (lane["id"], alignment["id"], fastq_directory) + ) return False - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: r2_fastq = self.get_lane_file(lane["id"], "r2-fastq") if not r2_fastq: - logging.error("Missing r2-fastq for lane %d (alignment %d)" % (lane["id"], alignment["id"])) + logging.error( + "Missing r2-fastq for lane %d (alignment %d)" + % (lane["id"], alignment["id"]) + ) return False - script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) + script_file = os.path.join( + script_directory, "%s-%s" % (alignment["sample_name"], self.qsub_scriptname) + ) logging.info("Will write to %s" % script_file) - # Set up & add environment variables env_vars = OrderedDict() - env_vars["SAMPLE_NAME"] = alignment['sample_name'] - env_vars["BWAINDEX"] = alignment['genome_index_location'] - env_vars["GENOME"] = alignment['genome_index'] - env_vars["ASSAY"] = lane['assay'] - env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] - if processing_info['libraries'] and processing_info['libraries'][0] and processing_info['libraries'][0].get('library_kit_method'): - env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' + env_vars["SAMPLE_NAME"] = alignment["sample_name"] + env_vars["BWAINDEX"] = alignment["genome_index_location"] + env_vars["GENOME"] = alignment["genome_index"] + env_vars["ASSAY"] = lane["assay"] + env_vars["READLENGTH"] = processing_info["flowcell"]["read_length"] + if ( + processing_info["libraries"] + and processing_info["libraries"][0] + and processing_info["libraries"][0].get("library_kit_method") + ): + env_vars["LIBRARY_KIT"] = ( + '"' + processing_info["libraries"][0]["library_kit_method"] + '"' + ) else: env_vars["LIBRARY_KIT"] = None - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: env_vars["PAIRED"] = "True" else: env_vars["PAIRED"] = None - env_vars["FLOWCELL_LANE_ID"] = lane['id'] - env_vars["ALIGNMENT_ID"] = alignment['id'] - env_vars["ALIGN_DIR"] = os.path.join(fastq_directory, align_dir) - env_vars["R1_FASTQ"] = r1_fastq["path"] + env_vars["FLOWCELL_LANE_ID"] = lane["id"] + env_vars["ALIGNMENT_ID"] = alignment["id"] + env_vars["ALIGN_DIR"] = os.path.join(fastq_directory, align_dir) + env_vars["R1_FASTQ"] = r1_fastq["path"] - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: env_vars["R2_FASTQ"] = r2_fastq["path"] env_vars["FASTQ_DIR"] = fastq_directory - env_vars["FLOWCELL"] = 
processing_info['flowcell']['label'] + env_vars["FLOWCELL"] = processing_info["flowcell"]["label"] if "barcode1" in lane and lane["barcode1"]: - p7_adapter = lane['barcode1']['adapter7'] - p5_adapter = lane['barcode1']['adapter5'] - if "barcode2" in lane and lane['barcode2']: + p7_adapter = lane["barcode1"]["adapter7"] + p5_adapter = lane["barcode1"]["adapter5"] + if "barcode2" in lane and lane["barcode2"]: # Override the "default" end adapter from barcode1 - p5_adapter = lane['barcode2']['adapter5_reverse_complement'] + p5_adapter = lane["barcode2"]["adapter5_reverse_complement"] if not p7_adapter or not p5_adapter: - logging.warn("Alignment %d missing adapters, some processes might not work" % alignment['id']) + logging.warn( + "Alignment %d missing adapters, some processes might not work" + % alignment["id"] + ) env_vars["ADAPTER_P7"] = p7_adapter env_vars["ADAPTER_P5"] = p5_adapter # Process with UMI if the barcode has one and this is a dual index # flowcell - if lane['barcode1']['umi'] and processing_info['flowcell']['dual_index']: + if lane["barcode1"]["umi"] and processing_info["flowcell"]["dual_index"]: env_vars["UMI"] = "True" else: env_vars["UMI"] = None - env_vars["UMI_METHOD"] = lane['barcode1']['umi_method'] + env_vars["UMI_METHOD"] = lane["barcode1"]["umi_method"] # Set process template env var overrides - if 'process_variables' in process_template and process_template['process_variables']: + if ( + "process_variables" in process_template + and process_template["process_variables"] + ): try: - process_template_variables = json.loads(process_template['process_variables'], - object_pairs_hook=OrderedDict) + process_template_variables = json.loads( + process_template["process_variables"], object_pairs_hook=OrderedDict + ) for var, value in process_template_variables.items(): env_vars[var] = value except ValueError as e: - logging.error("Could not parse process variables for align %d (template %d): '%s'" % - ( - alignment['id'], - process_template['id'], - process_template['process_variables'] - )) + logging.error( + "Could not parse process variables for align %d (template %d): '%s'" + % ( + alignment["id"], + process_template["id"], + process_template["process_variables"], + ) + ) return False if self.dry_run: @@ -420,10 +551,12 @@ def create_script(self, processing_info, align_id): os.makedirs(script_directory) # Append to master script - self.add_script(align_id, processing_info, script_file, alignment['sample_name']) + self.add_script( + align_id, processing_info, script_file, alignment["sample_name"] + ) # Write file - outfile = open(script_file, 'w') + outfile = open(script_file, "w") outfile.write("set -e -o pipefail\n") # Set env vars @@ -440,9 +573,9 @@ def create_script(self, processing_info, align_id): outfile.close() -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index b37199d2..40f06cfe 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -83,17 +83,17 @@ def parser_setup(): "-t", "--token", dest="token", help="Your authentication token." 
) - #run_opts.add_argument("sample_config", help="The sample_config.tsv file") - #run_opts.add_argument("processing_json", help="The processing.json file") - #run_opts.add_argument( - #"--output_file_directory", - #default=".", - #help="The output directory files are stored in. Defaults to cwd.", - #) + # run_opts.add_argument("sample_config", help="The sample_config.tsv file") + # run_opts.add_argument("processing_json", help="The processing.json file") + # run_opts.add_argument( + # "--output_file_directory", + # default=".", + # help="The output directory files are stored in. Defaults to cwd.", + # ) run_opts.add_argument("--r1", dest="r1_fastq", help="the r1 file to upload") run_opts.add_argument("--r2", dest="r2_fastq", help="the r2 file to upload") run_opts.add_argument("--lane", dest="lane_id", help="the ID of the lane") - #run_opts.add_argument("--flowcell", dest="flowcell_name", help="the name of the flowcell") + # run_opts.add_argument("--flowcell", dest="flowcell_name", help="the name of the flowcell") run_opts.add_argument( "--skip_md5", @@ -127,7 +127,7 @@ def md5sum_file(path): return md5sum.hexdigest() -#def parse_counts_file(counts_file: str): +# def parse_counts_file(counts_file: str): # """ # Given a file name, reads a stats file # format: one stat per line: `name value` (separated by whitespace) @@ -145,7 +145,7 @@ def md5sum_file(path): # return stats # # -#def build_counts(alignment_id, counts_file): +# def build_counts(alignment_id, counts_file): # """ # Convert stats into a form ready to be uploaded to LIMS with the # bulk-stat-create endpoint @@ -173,7 +173,7 @@ def __init__(self, api_url, token, dry_run=False, skip_md5=False): { rest.LIMS_URL_OPT_VAR: api_url, rest.LIMS_TOKEN_OPT_VAR: token, - #rest.RAISE_ON_ERROR_VAR: True, + # rest.RAISE_ON_ERROR_VAR: True, } ) self.dry_run = dry_run @@ -343,8 +343,20 @@ def upload_directory_attachment( def upload_files(self, r1, r2, lane_id): lane_ids = self.get_lane_ids(lane_id) - self.upload_file(r1, "SequencingData.flowcelllane", lane_ids, file_purpose="r1-fastq", file_type="gzipped-fastq") - self.upload_file(r2, "SequencingData.flowcelllane", lane_ids, file_purpose="r2-fastq", file_type="gzipped-fastq") + self.upload_file( + r1, + "SequencingData.flowcelllane", + lane_ids, + file_purpose="r1-fastq", + file_type="gzipped-fastq", + ) + self.upload_file( + r2, + "SequencingData.flowcelllane", + lane_ids, + file_purpose="r2-fastq", + file_type="gzipped-fastq", + ) def upload_file( self, path, contenttype_name, object_ids, file_purpose=None, file_type=None @@ -394,7 +406,12 @@ def upload_file( ) result = self.put(url=exists["url"], data=upload_data) else: - LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + LOG.info( + "Uploading information for file %s: lane %d, data=%s", + path, + object_id, + upload_data, + ) result = self.post("file/", data=upload_data) if not result: @@ -461,20 +478,22 @@ def get_library(self, library_id): """Gets the library by ID (NOT library number)""" return self.get_by_id("library", library_id) - # gets the other lane ids for this lane/pool def get_lane_ids(self, lane_id): def extract_id_from_url(url): - return re.sub(r'[^\d]', "", url) + return re.sub(r"[^\d]", "", url) + lane_info = self.get_by_id("flowcell_lane", int(lane_id)) logging.info("lane %s info:\n%s", lane_id, lane_info) - assert lane_info["library_pool"] is not None, "library_pool for lane %s must not be None" % lane_id + assert lane_info["library_pool"] is not None, ( + "library_pool for lane %s 
must not be None" % lane_id + ) pool_info = self.api.get_single_result(url=lane_info["library_pool"]) lib_ids = [] flowcell_id = extract_id_from_url(lane_info["flowcell"]) for lib_url in pool_info["libraries"]: - lib_id = extract_id_from_url(lib_url) - lib_ids.append(lib_id) + lib_id = extract_id_from_url(lib_url) + lib_ids.append(lib_id) lanes_query = "flowcell_lane/?flowcell=%s&lane=%d&page_size=1000" % ( flowcell_id, @@ -490,8 +509,7 @@ def extract_id_from_url(url): lanes_in_pool.add(l["id"]) return list(lanes_in_pool) - - #def upload_flowcell_report(self, data): + # def upload_flowcell_report(self, data): # flowcell_labels = set(pool["flowcell_label"] for pool in data) # assert len(flowcell_labels) == 1 # flowcell_label = flowcell_labels.pop() @@ -533,8 +551,7 @@ def extract_id_from_url(url): # LOG.critical("Too many JSON reports exist") # raise "Too many JSON reports exist, exiting" - - #def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): + # def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): # """ # Main function for this script. # Given paths to the sample_config file, processing_dict, and outdir, @@ -660,13 +677,13 @@ def main(): uploader.upload_files(poptions.r1_fastq, poptions.r2_fastq, poptions.lane_id) - #with open(poptions.sample_config) as f: + # with open(poptions.sample_config) as f: # sample_config = list(csv.DictReader(f, delimiter="\t")) - #with open(poptions.processing_json) as f: + # with open(poptions.processing_json) as f: # processing = json.loads(f.read()) - #uploader.upload_altseq_flowcell( + # uploader.upload_altseq_flowcell( # sample_config, processing, poptions.output_file_directory - #) + # ) # This is the main body of the program that only runs when running this script diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py index 27d51aa0..b1a58a44 100644 --- a/scripts/altcode/upload_stats.py +++ b/scripts/altcode/upload_stats.py @@ -88,20 +88,20 @@ def parser_setup(): # TODO: Should we allow registering each align dir by itself? run_opts.add_argument("flowcell_dir", help="The flowcell directory") - #run_opts.add_argument("sample_config", help="The sample_config.tsv file") - #run_opts.add_argument("processing_json", help="The processing.json file") - #run_opts.add_argument( + # run_opts.add_argument("sample_config", help="The sample_config.tsv file") + # run_opts.add_argument("processing_json", help="The processing.json file") + # run_opts.add_argument( # "--output_file_directory", # default=".", # help="The output directory files are stored in. 
Defaults to cwd.", - #) + # ) - #run_opts.add_argument( + # run_opts.add_argument( # "--skip_md5", # dest="skip_md5", # action="store_true", # help="Don't calculate md5sum (debug/dev only)", - #) + # ) run_opts.add_argument( "-n", @@ -174,7 +174,7 @@ def __init__(self, api_url, token, dry_run=False, skip_md5=False): { rest.LIMS_URL_OPT_VAR: api_url, rest.LIMS_TOKEN_OPT_VAR: token, - #rest.RAISE_ON_ERROR_VAR: True, + # rest.RAISE_ON_ERROR_VAR: True, } ) self.dry_run = dry_run @@ -390,7 +390,12 @@ def upload_file( ) result = self.put(url=exists["url"], data=upload_data) else: - LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + LOG.info( + "Uploading information for file %s: lane %d, data=%s", + path, + object_id, + upload_data, + ) result = self.post("file/", data=upload_data) if not result: @@ -465,20 +470,25 @@ def upload_flowcell_report(self, data): report_name = "Alt-code stats: FC%s" % flowcell_label flowcell_lims_info = self.get_single_result( - "flowcell_run/?label=%s" % flowcell_label) - content_type_id = flowcell_lims_info['object_content_type'] + "flowcell_run/?label=%s" % flowcell_label + ) + content_type_id = flowcell_lims_info["object_content_type"] content_type = self.get_by_id("content_type", content_type_id) - object_id = flowcell_lims_info['id'] + object_id = flowcell_lims_info["id"] json_report_class = self.get_single_result( - "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) + "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG} + ) # See if report already exists - existing_reports = self.get_list_result("json_report/", query={ - "object_id": object_id, - "content_type": content_type["id"], - "report_class": json_report_class["id"], - "page_size": 2, - }) + existing_reports = self.get_list_result( + "json_report/", + query={ + "object_id": object_id, + "content_type": content_type["id"], + "report_class": json_report_class["id"], + "page_size": 2, + }, + ) data_to_send = { "object_id": object_id, @@ -499,7 +509,6 @@ def upload_flowcell_report(self, data): LOG.critical("Too many JSON reports exist") raise "Too many JSON reports exist, exiting" - def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): """ Main function for this script. @@ -547,7 +556,7 @@ def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) # Upload files. - for ((path, purpose), lane_ids) in files_to_upload.items(): + for (path, purpose), lane_ids in files_to_upload.items(): # print(path, purpose, len(lane_ids)) self.upload_file( path, @@ -581,6 +590,7 @@ def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): flowcell_data = json.loads(json_file.read()) self.upload_flowcell_report(flowcell_data) + def find_stat_files_in_dir(flowcell_directory): """ Given a directory to search, finds the newest alt-code stats files @@ -598,7 +608,10 @@ def find_stat_files_in_dir(flowcell_directory): (?P[a-z]+)? [_-]? (?P[0-9.]*) - """, re.VERBOSE | re.IGNORECASE) + """, + re.VERBOSE | re.IGNORECASE, + ) + def sortkey(a): # Logic: there are up to 3 parts # 1) Regular version, like "2.0" or "3.2.1". 
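# Illustrative sketch of the sort key used just above to pick the newest stats
# file: it orders candidates by numeric version, then greek suffix (a missing
# suffix sorts last, so a final release outranks its alphas/betas), then the
# trailing number, preferring shorter names. This is a standalone example, not
# a hunk of this patch; the file-name parts below are hypothetical and
# LooseVersion is assumed to be distutils.version.LooseVersion as used here.
from distutils.version import LooseVersion

def example_key(versn, greek, teeny, name):
    return (
        LooseVersion(versn or "0"),   # numeric version, e.g. "3.2"
        greek or "zzzzzz",            # alpha/beta/rc tag; absent sorts last
        LooseVersion(teeny or "0"),   # number following the greek tag
        -len(name),                   # prefer shorter file names
    )

# "3.2" final beats "3.2 alpha 1"; both beat "2.0"
assert example_key("3.2", "alpha", "1", "f") < example_key("3.2", None, None, "f")
assert example_key("2.0", None, None, "f") < example_key("3.2", "alpha", "1", "f")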
@@ -606,9 +619,11 @@ def sortkey(a): # 3) teeny suffix after alpha, like "3" or "1" match = version_regex.match(os.path.basename(a)) versn = match.group("versn") if match and match.group("versn") else 0 - greek = match.group("greek") if match and match.group("greek") else "zzzzzz" # last + greek = ( + match.group("greek") if match and match.group("greek") else "zzzzzz" + ) # last teeny = match.group("teeny") if match and match.group("teeny") else 0 - length = -1 * len(a) # Prefer shorter names to longer + length = -1 * len(a) # Prefer shorter names to longer return (LooseVersion(versn), greek, LooseVersion(teeny), length) for align_dir in align_dirs: @@ -664,7 +679,9 @@ def main(): sys.exit(1) uploader = UploadLIMS( - api_url, token, dry_run=poptions.dry_run, + api_url, + token, + dry_run=poptions.dry_run, ) stats_files = find_stat_files_in_dir(poptions.flowcell_dir) @@ -680,14 +697,13 @@ def main(): uploader.upload_flowcell_report(all_stats) - - #with open(poptions.sample_config) as f: + # with open(poptions.sample_config) as f: # sample_config = list(csv.DictReader(f, delimiter="\t")) - #with open(poptions.processing_json) as f: + # with open(poptions.processing_json) as f: # processing = json.loads(f.read()) - #uploader.upload_altcode_flowcell( + # uploader.upload_altcode_flowcell( # sample_config, processing, poptions.output_file_directory - #) + # ) # This is the main body of the program that only runs when running this script diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index a29e64a3..eb2bbfec 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -385,7 +385,12 @@ def upload_file( ) result = self.put(url=exists["url"], data=upload_data) else: - LOG.info("Uploading information for file %s: lane %d, data=%s", path, object_id, upload_data) + LOG.info( + "Uploading information for file %s: lane %d, data=%s", + path, + object_id, + upload_data, + ) result = self.post("file/", data=upload_data) if not result: @@ -460,27 +465,32 @@ def upload_flowcell_report(self, data): report_name = "Alt-seq stats: FC%s" % flowcell_label flowcell_lims_info = self.get_single_result( - "flowcell_run/?label=%s" % flowcell_label) - content_type_id = flowcell_lims_info['object_content_type'] + "flowcell_run/?label=%s" % flowcell_label + ) + content_type_id = flowcell_lims_info["object_content_type"] content_type = self.get_by_id("content_type", content_type_id) - object_id = flowcell_lims_info['id'] + object_id = flowcell_lims_info["id"] json_report_class = self.get_single_result( - "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG}) + "json_report_class/", query={"slug": JSON_REPORT_CLASS_SLUG} + ) # See if report already exists - existing_reports = self.get_list_result("json_report/", query={ - "object_id": object_id, - "content_type": content_type["id"], - "report_class": json_report_class["id"], - "page_size": 2, - }) + existing_reports = self.get_list_result( + "json_report/", + query={ + "object_id": object_id, + "content_type": content_type["id"], + "report_class": json_report_class["id"], + "page_size": 2, + }, + ) data_to_send = { - "object_id": object_id, - "content_type": content_type["url"], - "report_class": json_report_class["url"], - "name": report_name, - "json_content": json.dumps(data), + "object_id": object_id, + "content_type": content_type["url"], + "report_class": json_report_class["url"], + "name": report_name, + "json_content": json.dumps(data), } if len(existing_reports) == 0: self.post("json_report/", 
data=data_to_send) @@ -494,7 +504,6 @@ def upload_flowcell_report(self, data): LOG.critical("Too many JSON reports exist") raise "Too many JSON reports exist, exiting" - def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): """ Main function for this script. @@ -542,7 +551,7 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): files_to_upload[(r2_file, "r2-fastq")].extend(lane_ids) # Upload files. - for ((path, purpose), lane_ids) in files_to_upload.items(): + for (path, purpose), lane_ids in files_to_upload.items(): # print(path, purpose, len(lane_ids)) self.upload_file( path, diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index d2be1164..2c15e376 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -19,7 +19,7 @@ POOL_INFO = {} SCRIPTS_WRITTEN = set() -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_options = { "quiet": False, @@ -36,50 +36,80 @@ "tag_slug": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") - parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") - parser.add_argument("--lane", dest="lane_ids", type=int, action="append", - help="Lane ID") - - parser.add_argument("--flowcell_label", dest="flowcell_label", help="Flowcell Label") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this alignment to this file.", + ) + parser.add_argument( + "-b", + "--sample-script-basename", + dest="sample_script_basename", + help="Name of the script that goes after the sample name.", + ) + parser.add_argument( + "--lane", dest="lane_ids", type=int, action="append", help="Lane ID" + ) + + parser.add_argument( + "--flowcell_label", dest="flowcell_label", help="Flowcell Label" + ) parser.add_argument("--tag", dest="tag", help="Lanes tagged by") - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.") - parser.add_argument("--queue", dest="queue", - help="SLURM partition for jobs.") - - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="Don't use any barcode mask.") - parser.add_argument("--bases_mask", dest="bases_mask", - help="Set a bases mask.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.", + ) + parser.add_argument("--queue", dest="queue", help="SLURM partition for jobs.") + + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + parser.add_argument( + "--no-mask", + dest="no_mask", + action="store_true", + help="Don't use any barcode mask.", + ) + parser.add_argument("--bases_mask", dest="bases_mask", help="Set a bases mask.") + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - - def __init__(self, args, api): - + def __init__(self, args, api): self.api = api self.qsub_scriptname = args.sample_script_basename self.qsub_prefix = args.qsub_prefix @@ -92,8 +122,9 @@ def __init__(self, args, api): self.pool = ThreadPoolExecutor(max_workers=6) def get_lane_process_info(self, lane_id): - - info = self.api.get_single_result(url_addition="flowcell_lane/%d/processing_information" % (lane_id)) + info = self.api.get_single_result( + url_addition="flowcell_lane/%d/processing_information" % (lane_id) + ) if info: return info @@ -102,35 +133,53 @@ def get_lane_process_info(self, lane_id): sys.exit(1) def get_process_template(self, process_template_id): - if not process_template_id: - logging.critical("No process template for alignment %d\n" % self.alignment_id) + logging.critical( + "No process template for alignment %d\n" % self.alignment_id + ) sys.exit(1) - info = self.api.get_single_result(url_addition="process_template/%d" % (process_template_id)) + info = self.api.get_single_result( + url_addition="process_template/%d" % (process_template_id) + ) if info: return info else: - logging.error("Could not find processing template for ID %d\n" % process_template_id) + logging.error( + "Could not find processing template for ID %d\n" % process_template_id + ) sys.exit(1) def setup_flowcell(self, flowcell_label): - - lanes = self.api.get_list_result(url_addition="flowcell_lane/", query_arguments={"flowcell__label": flowcell_label}, page_size=1000, item_limit=50000) + lanes = self.api.get_list_result( + url_addition="flowcell_lane/", + query_arguments={"flowcell__label": flowcell_label}, + page_size=1000, + item_limit=50000, + ) if not lanes: logging.error("Flowcell %s has no lanes" % flowcell_label) return - logging.debug("Setting up flowcell %s with %d lanes" % (flowcell_label, len(lanes))) + logging.debug( + "Setting up flowcell %s with %d lanes" % (flowcell_label, len(lanes)) + ) self.setup_lanes([lane["id"] for lane in lanes]) def setup_tag(self, tag_slug): - - flowcelllane_contenttype = content_types.contenttype_from_model_name(self.api, model_name="FlowcellLane") - lane_tags = self.api.get_list_result(url_addition="tagged_object", query_arguments={"content_type": flowcelllane_contenttype["id"], "tag__slug": tag_slug}) + 
flowcelllane_contenttype = content_types.contenttype_from_model_name( + self.api, model_name="FlowcellLane" + ) + lane_tags = self.api.get_list_result( + url_addition="tagged_object", + query_arguments={ + "content_type": flowcelllane_contenttype["id"], + "tag__slug": tag_slug, + }, + ) if not lane_tags: logging.error("Tag %s has no lanes" % lane_tags) @@ -143,30 +192,41 @@ def setup_lanes(self, lane_ids): logging.debug("Setting up lane IDs %s" % str(lane_ids)) if len(lane_ids) != len(set(lane_ids)): - logging.warning("Duplicate lane IDs! %s " % [item for item, count in collections.Counter(lane_ids).items() if count > 1]) - - #self.pool.map(self.setup_lane, lane_ids) + logging.warning( + "Duplicate lane IDs! %s " + % [ + item + for item, count in collections.Counter(lane_ids).items() + if count > 1 + ] + ) + + # self.pool.map(self.setup_lane, lane_ids) for lane_id in lane_ids: self.setup_lane(lane_id) def setup_lane(self, lane_id): - logging.debug("Setting up lane %d" % lane_id) processing_info = self.get_lane_process_info(lane_id) pool_name = None - if (len(processing_info.get("libraries", [])) == 1 - and processing_info["libraries"][0].get("samplesheet_name",'').startswith("LP")): - pool_name = processing_info['libraries'][0]['samplesheet_name'] - pool_number = int(pool_name[2:]) # remove leading LP - pool_data = self.api.get_single_result(url_addition="library_pool/?number=%d" % pool_number)["results"][0] + if len(processing_info.get("libraries", [])) == 1 and processing_info[ + "libraries" + ][0].get("samplesheet_name", "").startswith("LP"): + pool_name = processing_info["libraries"][0]["samplesheet_name"] + pool_number = int(pool_name[2:]) # remove leading LP + pool_data = self.api.get_single_result( + url_addition="library_pool/?number=%d" % pool_number + )["results"][0] pool_id = pool_data["id"] else: try: lib_number = processing_info["libraries"][0]["library"] - library_info = self.api.get_single_result(url_addition="library/?number=%d" % lib_number)["results"][0] + library_info = self.api.get_single_result( + url_addition="library/?number=%d" % lib_number + )["results"][0] logging.debug("Info is %s", library_info) pools = library_info["librarypools"] if pools: @@ -180,20 +240,25 @@ def setup_lane(self, lane_id): global POOL_INFO if pool_name and pool_name not in POOL_INFO: - pool_data = self.api.get_single_result(url_addition="library_pool/%d/" % pool_id) + pool_data = self.api.get_single_result( + url_addition="library_pool/%d/" % pool_id + ) bc1 = None bc2 = None if pool_data["barcode1"]: - bc1 = self.api.get_single_result(url=pool_data["barcode1"])["reverse_sequence"] + bc1 = self.api.get_single_result(url=pool_data["barcode1"])[ + "reverse_sequence" + ] if pool_data["barcode2"]: - bc2 = self.api.get_single_result(url=pool_data["barcode2"])["reverse_sequence"] + bc2 = self.api.get_single_result(url=pool_data["barcode2"])[ + "reverse_sequence" + ] barcode = "-".join(bc for bc in [bc1, bc2] if bc) POOL_INFO[pool_name] = {"barcode": barcode} self.create_script(processing_info, pool_name) def add_script(self, script_file, lane_id, flowcell_label, sample_name): - # Hacks to deduplicate files written for library pools global SCRIPTS_WRITTEN if script_file in SCRIPTS_WRITTEN: @@ -205,19 +270,25 @@ def add_script(self, script_file, lane_id, flowcell_label, sample_name): outfile = sys.stdout else: logging.debug("Logging script to %s" % self.outfile) - outfile = open(self.outfile, 'a') + outfile = open(self.outfile, "a") outfile.write("cd %s && " % os.path.dirname(script_file)) - 
fullname = "%s%s-%s-Lane#%d" % (self.qsub_prefix, sample_name, flowcell_label, lane_id) - outfile.write("sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=8000 --parsable --oversubscribe <<__LANEPROC__\n#!/bin/bash\nbash %s\n__LANEPROC__\n\n" % (fullname, fullname, fullname, self.queue, script_file)) + fullname = "%s%s-%s-Lane#%d" % ( + self.qsub_prefix, + sample_name, + flowcell_label, + lane_id, + ) + outfile.write( + "sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=8000 --parsable --oversubscribe <<__LANEPROC__\n#!/bin/bash\nbash %s\n__LANEPROC__\n\n" + % (fullname, fullname, fullname, self.queue, script_file) + ) outfile.close() def get_script_template(self): - - return open(self.script_template, 'r').read() + return open(self.script_template, "r").read() def create_script(self, processing_info, pool=None): - lane = processing_info["libraries"][0] if not "directory" in lane: @@ -226,75 +297,103 @@ def create_script(self, processing_info, pool=None): fastq_directory = lane["directory"] alt_dir = lane.get("project_share_directory", "") if alt_dir: - fastq_directory = os.path.join(alt_dir, "fastq", "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"]) + fastq_directory = os.path.join( + alt_dir, + "fastq", + "Project_%s" % lane["project"], + "Sample_%s" % lane["samplesheet_name"], + ) if pool: flowcell_dir = re.sub(r"/Project.*", "", lane["directory"]) if alt_dir: - flowcell_dir=alt_dir - fastq_directory = os.path.join(flowcell_dir, "Project_%s" % lane["project"], "LibraryPool_%s" % pool) + flowcell_dir = alt_dir + fastq_directory = os.path.join( + flowcell_dir, "Project_%s" % lane["project"], "LibraryPool_%s" % pool + ) - barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index'] + barcode = "NoIndex" if lane["barcode_index"] is None else lane["barcode_index"] try: # Preferred name - spreadsheet_name = lane['alignments'][0]['sample_name'] + spreadsheet_name = lane["alignments"][0]["sample_name"] except (KeyError, IndexError): # Fallback method, doesn't always have the same barcode string - spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane']) - logging.warning("No alignment sample_name for lane, using %s instead" % spreadsheet_name) + spreadsheet_name = "%s_%s_L00%d" % ( + lane["samplesheet_name"], + barcode, + lane["lane"], + ) + logging.warning( + "No alignment sample_name for lane, using %s instead" % spreadsheet_name + ) if pool: global POOL_INFO barcode = POOL_INFO[pool]["barcode"] - spreadsheet_name = "%s_%s_L00%d" % (pool, barcode, lane['lane']) - #print("DBG:", pool, spreadsheet_name, POOL_INFO) + spreadsheet_name = "%s_%s_L00%d" % (pool, barcode, lane["lane"]) + # print("DBG:", pool, spreadsheet_name, POOL_INFO) if not os.path.exists(fastq_directory): - logging.critical("fastq directory %s does not exist, cannot continue" % fastq_directory) + logging.critical( + "fastq directory %s does not exist, cannot continue" % fastq_directory + ) return False - script_file = os.path.join( fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname) ) + script_file = os.path.join( + fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname) + ) if self.dry_run: logging.info("Dry run, would have created: %s" % script_file) return True try: - outfile = open(script_file, 'w') + outfile = open(script_file, "w") except FileNotFoundError: logging.critical("Could not create script file %s" % 
script_file) return False - self.add_script(script_file, lane["id"], processing_info["flowcell"]["label"], spreadsheet_name) + self.add_script( + script_file, + lane["id"], + processing_info["flowcell"]["label"], + spreadsheet_name, + ) outfile.write("set -e -o pipefail\n") outfile.write("export SAMPLE_NAME=%s\n" % spreadsheet_name) - outfile.write("export ASSAY=%s\n" % lane['assay']) - outfile.write("export READLENGTH=%s\n" % processing_info['flowcell']['read_length']) - if processing_info['flowcell']['paired_end']: + outfile.write("export ASSAY=%s\n" % lane["assay"]) + outfile.write( + "export READLENGTH=%s\n" % processing_info["flowcell"]["read_length"] + ) + if processing_info["flowcell"]["paired_end"]: outfile.write("export PAIRED=True\n") else: outfile.write("unset PAIRED\n") # Process with UMI if the barcode has one and this is a dual index # flowcell - if lane['barcode1'] and lane['barcode1']['umi'] and processing_info['flowcell']['dual_index']: + if ( + lane["barcode1"] + and lane["barcode1"]["umi"] + and processing_info["flowcell"]["dual_index"] + ): outfile.write("export UMI=True\n") else: outfile.write("unset UMI\n") - outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane['id']) + outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane["id"]) outfile.write("export FASTQ_DIR=%s\n" % fastq_directory) - outfile.write("export FLOWCELL=%s\n" % processing_info['flowcell']['label']) + outfile.write("export FLOWCELL=%s\n" % processing_info["flowcell"]["label"]) outfile.write("\n") outfile.write(self.get_script_template()) outfile.close() -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -309,7 +408,7 @@ def main(args = sys.argv): logging.basicConfig(level=logging.INFO, format=log_format) logging.getLogger("requests").setLevel(logging.WARNING) - api = rest.setup_api() + api = rest.setup_api() process = ProcessSetUp(poptions, api) @@ -322,6 +421,7 @@ def main(args = sys.argv): if poptions.tag: process.setup_tag(poptions.tag) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/bam/bamfaiordercompare.py b/scripts/bam/bamfaiordercompare.py index 093be238..a521eff2 100644 --- a/scripts/bam/bamfaiordercompare.py +++ b/scripts/bam/bamfaiordercompare.py @@ -9,21 +9,22 @@ import pysam + def compare_bam_order(faifile, bamfile): - fai = open(faifile, 'r').read() + fai = open(faifile, "r").read() - bam = pysam.Samfile( bamfile, "rb") + bam = pysam.Samfile(bamfile, "rb") bamorder = list(bam.references) - faiorder = [line.split('\t')[0] for line in fai.split('\n') if line] + faiorder = [line.split("\t")[0] for line in fai.split("\n") if line] if bamorder == faiorder: return True return False -def main(args = sys.argv): +def main(args=sys.argv): if len(sys.argv) < 3: - print "USAGE: %s FAIFILE BAMFILE" % sys.argv[0] + print("USAGE: %s FAIFILE BAMFILE" % sys.argv[0]) sys.exit(0) faifile = sys.argv[1] @@ -44,5 +45,6 @@ def main(args = sys.argv): else: sys.stdout.write("UNORDERED\n") + if __name__ == "__main__": main() diff --git a/scripts/bam/mark_dups.py b/scripts/bam/mark_dups.py index d25c06e2..578dcd53 100755 --- a/scripts/bam/mark_dups.py +++ b/scripts/bam/mark_dups.py @@ -9,7 +9,6 @@ def parser_setup(): - script_options = { "infile": 
"/dev/stdin", "outfile": "/dev/stdout", @@ -18,13 +17,12 @@ def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("--hist", dest="histfile", - help="Write histogram of duplicates to this file") + parser.add_argument( + "--hist", dest="histfile", help="Write histogram of duplicates to this file" + ) - parser.add_argument("-i", "--infile", dest="infile", - help="Read from this file") - parser.add_argument("-o", "--outfile", dest="outfile", - help="Write to this file") + parser.add_argument("-i", "--infile", dest="infile", help="Read from this file") + parser.add_argument("-o", "--outfile", dest="outfile", help="Write to this file") parser.set_defaults(**script_options) parser.set_defaults(quiet=False, debug=False) @@ -68,8 +66,7 @@ def set_dup(read, is_dup): return read -class DupMarker(): - +class DupMarker: read_histo = defaultdict(int) pair_map = {} input = None @@ -138,9 +135,9 @@ def main(args=sys.argv): parser = parser_setup() poptions = parser.parse_args() - input = pysam.AlignmentFile(poptions.infile, 'r') - output = pysam.AlignmentFile(poptions.outfile, 'wb0', template=input) - histo = open(poptions.histfile, 'w') if poptions.histfile else None + input = pysam.AlignmentFile(poptions.infile, "r") + output = pysam.AlignmentFile(poptions.outfile, "wb0", template=input) + histo = open(poptions.histfile, "w") if poptions.histfile else None try: dupmarker = DupMarker(input=input, output=output, histo=histo) diff --git a/scripts/bam/move_umt_to_tag.py b/scripts/bam/move_umt_to_tag.py index 9bdf23e8..b19a0b7b 100755 --- a/scripts/bam/move_umt_to_tag.py +++ b/scripts/bam/move_umt_to_tag.py @@ -11,36 +11,39 @@ def move_umi(read, tag): - ''' + """ Looks for the UMI embeded in the read name, places it in a tag and trims the read name - ''' - umi_loc = read.query_name.find('#') + """ + umi_loc = read.query_name.find("#") if umi_loc > -1: - read.set_tag(tag, read.query_name[umi_loc+1:]) + read.set_tag(tag, read.query_name[umi_loc + 1 :]) read.query_name = read.query_name[:umi_loc] return read def main(): - """ Sets up parsing and runs the program """ + """Sets up parsing and runs the program""" # Parsing parser = argparse.ArgumentParser( prog="move_umt_to_tag", - description="Moves the UMT/UMI out of a read name and into a BAM tag") - parser.add_argument("input_alignment", - type=str, - help="Input alignment file (with UMT in name)") - parser.add_argument("output_alignment", - type=str, - help="Output alignment file (with UMT in tag)") - parser.add_argument("--tagname", type=str, default="RX", - help="Name of tag to store UMI info int") + description="Moves the UMT/UMI out of a read name and into a BAM tag", + ) + parser.add_argument( + "input_alignment", type=str, help="Input alignment file (with UMT in name)" + ) + parser.add_argument( + "output_alignment", type=str, help="Output alignment file (with UMT in tag)" + ) + parser.add_argument( + "--tagname", type=str, default="RX", help="Name of tag to store UMI info int" + ) args = parser.parse_args() input_alignment = pysam.AlignmentFile(args.input_alignment, "rb") - output_alignment = pysam.AlignmentFile(args.output_alignment, "wb", - template=input_alignment) + output_alignment = pysam.AlignmentFile( + args.output_alignment, "wb", template=input_alignment + ) reads = input_alignment.fetch(until_eof=True) # Do the work diff --git a/scripts/bam/random_reads.py b/scripts/bam/random_reads.py index 8bd20a03..1d02c038 100644 --- a/scripts/bam/random_reads.py +++ b/scripts/bam/random_reads.py @@ -1,5 +1,4 @@ def main(): - import 
argparse import random import pysam @@ -28,20 +27,22 @@ def main(): shutil.copyfile(infile, outfile) return - sorted_read_indexes = random.sample(range(paired_reads_count), paired_reads_count_to_select) + sorted_read_indexes = random.sample( + range(paired_reads_count), paired_reads_count_to_select + ) sorted_read_indexes.sort() - print('Selecting %d read pairs' % len(sorted_read_indexes)) + print("Selecting %d read pairs" % len(sorted_read_indexes)) if paired_reads_count_to_select > 100: - print('First 100 indexes to be selected', sorted_read_indexes[:100]) + print("First 100 indexes to be selected", sorted_read_indexes[:100]) else: - print('Indexes to be selected', sorted_read_indexes) + print("Indexes to be selected", sorted_read_indexes) # input pysam file - in_alignment_file = pysam.AlignmentFile(infile, 'rb') + in_alignment_file = pysam.AlignmentFile(infile, "rb") # output pysam file - out_alignment_file = pysam.AlignmentFile(outfile, 'wb', template=in_alignment_file) + out_alignment_file = pysam.AlignmentFile(outfile, "wb", template=in_alignment_file) # current index of the next paired read in the input file current_index_in_file = 0 # current index of random indexes to select @@ -92,5 +93,6 @@ def main(): out_alignment_file.close() in_alignment_file.close() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index 20add679..7d5c40fe 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -18,11 +18,13 @@ "process_config": "processing.json", "trackhub_config": None, "priority": None, - "api_url": os.getenv('LIMS_API_URL'), - "api_token": os.getenv('LIMS_API_TOKEN'), + "api_url": os.getenv("LIMS_API_URL"), + "api_token": os.getenv("LIMS_API_TOKEN"), } util_log = logging.getLogger("StamPy.util") + + def foldercheck(*args): """Checks to see if the folders exist, creates them if they are not.""" for folder in args: @@ -32,9 +34,12 @@ def foldercheck(*args): util_log.info("Created folder: %s" % folder) except OSError as x: util_log.error("ERROR: Could not create directory: %s" % folder) - util_log.warn("Please make sure all nonexistant parent directories have been created.") + util_log.warn( + "Please make sure all nonexistant parent directories have been created." 
+ ) sys.exit(0) + def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, @@ -42,56 +47,96 @@ def mysql_clean(input): output = re.sub("[^\w$]", "_", input.strip()) return output + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - parser.add_argument("-j", "--json-config", dest="process_config", - help="The process config to work off of.") - parser.add_argument("-c", "--trackhub-config", dest="trackhub_config", - help="The trackhub config to work off of.") - parser.add_argument("-p", "--priority", dest="priority", required=True, - help="The priority of this flowcell") - parser.add_argument("--pre-align-dir", dest="pre_align_dir", action="store_true", - help="This flowcell was made before per-alignment directories") - parser.set_defaults( **options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + parser.add_argument( + "-j", + "--json-config", + dest="process_config", + help="The process config to work off of.", + ) + parser.add_argument( + "-c", + "--trackhub-config", + dest="trackhub_config", + help="The trackhub config to work off of.", + ) + parser.add_argument( + "-p", + "--priority", + dest="priority", + required=True, + help="The priority of this flowcell", + ) + parser.add_argument( + "--pre-align-dir", + dest="pre_align_dir", + action="store_true", + help="This flowcell was made before per-alignment directories", + ) + parser.set_defaults(**options) + parser.set_defaults(quiet=False, debug=False) return parser + class MakeBrowserload(object): genome_organisms = { - "hg19": "human", - "hg38": "human", - "rn5": "rat", - "mm9": "mouse", - "mm10": "mouse", - "TAIR9": "arabidopsis", - "sacCer2": "sacCer", - "sacCer3": "sacCer", - "ce4": "worm", - "cb3": "worm", - "K12": "e.coli", - "NC_000913.2": "e.coli", - "hera1": "butterfly", - "hmel1a": "butterfly", - "panu2a": "baboon", - "felCat5": "cat", - "borrBurg": "bacteria", - "danRer7": "zebrafish", + "hg19": "human", + "hg38": "human", + "rn5": "rat", + "mm9": "mouse", + "mm10": "mouse", + "TAIR9": "arabidopsis", + "sacCer2": "sacCer", + "sacCer3": "sacCer", + "ce4": "worm", + "cb3": "worm", + "K12": "e.coli", + "NC_000913.2": "e.coli", + "hera1": "butterfly", + "hmel1a": "butterfly", + "panu2a": "baboon", + "felCat5": "cat", + "borrBurg": "bacteria", + "danRer7": "zebrafish", } - rna_strands = [ "all", "pos", "neg" ] - - def __init__(self, group_data, trackhubconfig, basedir, outdir, mersize, priority, paired_end, project, label, date): - + rna_strands = ["all", "pos", "neg"] + + def __init__( + self, + group_data, + trackhubconfig, + basedir, + outdir, + mersize, + priority, + paired_end, + project, + label, + date, + ): self.basedir = basedir self.flowcell_date = date self.outdir = outdir self.mersize = mersize - self.win=75 - self.binI=20 + self.win = 75 + self.binI = 20 self.priority = priority self.paired_end = paired_end self.project = project @@ -108,25 +153,28 @@ def __init__(self, group_data, trackhubconfig, basedir, 
outdir, mersize, priorit logging.info("Using project dir: %s" % self.project_dirs[project]) else: for project in self.projects: - self.project_dirs[project] = os.path.join(self.basedir, "Project_" + project) + self.project_dirs[project] = os.path.join( + self.basedir, "Project_" + project + ) self.load_config(trackhubconfig) def load_config(self, trackhubconfig): - import configparser - Config = configparser.ConfigParser() - Config.read(trackhubconfig) - self.trackhubURL = Config.get('browser','trackhub_url') - self.flowcell_link_folder = Config.get('browser', 'flowcell_link_folder') + import configparser + + Config = configparser.ConfigParser() + Config.read(trackhubconfig) + self.trackhubURL = Config.get("browser", "trackhub_url") + self.flowcell_link_folder = Config.get("browser", "flowcell_link_folder") def load(self): self.basedir_name = os.path.basename(self.basedir) foldercheck(self.outdir) if False: - self.main_label = "%s%son%s" % (self.project, self.maintrackname, self.date) - self.flowcell_name = self.maintrackname - self.flowcell_date = self.date + self.main_label = "%s%son%s" % (self.project, self.maintrackname, self.date) + self.flowcell_name = self.maintrackname + self.flowcell_date = self.date else: match = re.search("(FC[A-Z0-9]+)_([0-9]{6})_tag", self.basedir) @@ -136,22 +184,31 @@ def load(self): self.flowcell_name = match.groups()[0] if not self.flowcell_date: - self.flowcell_date = match.groups()[1] + self.flowcell_date = match.groups()[1] logging.info("FLOWCELL DATE: %s" % self.flowcell_date) - self.main_label = "%s%son%s" % (self.project, self.flowcell_name, self.flowcell_date) + self.main_label = "%s%son%s" % ( + self.project, + self.flowcell_name, + self.flowcell_date, + ) logging.info("Main track name: %s" % self.main_label) self.excludes_file = os.path.join(self.outdir, "excludes.%s" % self.main_label) if self.flowcell_link_folder: - logging.debug("link folder: " + self.flowcell_link_folder + " base folder: " + self.basedir_name) + logging.debug( + "link folder: " + + self.flowcell_link_folder + + " base folder: " + + self.basedir_name + ) self.link_dir = os.path.join(self.flowcell_link_folder, self.basedir_name) else: self.link_dir = "" - + self.prepare_tracks() logging.info("Main label: %s" % self.main_label) @@ -163,18 +220,20 @@ def load(self): if key == "GRCh38_no_alts": self.subtrack_sets["hg38"] = self.subtrack_sets.pop("GRCh38_no_alts") if key == "GRCh38_no_alts_sequins": - self.subtrack_sets["hg38"] = self.subtrack_sets.pop("GRCh38_no_alts_sequins") + self.subtrack_sets["hg38"] = self.subtrack_sets.pop( + "GRCh38_no_alts_sequins" + ) self.create_ras() self.create_hubtxt() self.create_genomestxt() self.create_htmls() - # function for creating hub.txt + # function for creating hub.txt def create_hubtxt(self): hubfile = os.path.join(self.outdir, "hub.txt") logging.info("Creating hub.txt file: %s" % hubfile) - hub = open( hubfile, 'w') + hub = open(hubfile, "w") hub.write("hub %s\n" % self.flowcell_name) hub.write("shortLabel %s\n" % self.flowcell_name) hub.write("longLabel Tag sequencing, aligned %s\n" % (self.flowcell_date)) @@ -187,23 +246,23 @@ def create_hubtxt(self): def create_genomestxt(self): genomefile = os.path.join(self.outdir, "genomes.txt") logging.info("Creating genome.txt file: %s" % genomefile) - genomes = open( genomefile, 'w') + genomes = open(genomefile, "w") for hgdb, subtracks in self.subtrack_sets.items(): genomes.write("\ngenome %s\n" % hgdb) - genomes.write("trackDb %s/trackDb.%s.%s.txt\n" % (hgdb,self.project,self.main_label)) + 
genomes.write( + "trackDb %s/trackDb.%s.%s.txt\n" % (hgdb, self.project, self.main_label) + ) genomes.close() # splits tracks up and prepares them writing def prepare_tracks(self): - self.subtrack_sets = {} self.tracks = [] for lane in self.data: - logging.debug("preparing tracks for lane: " + str(lane)) if not "hgdb" in lane: - logging.error("Not using lane %s: no hgdb value" % lane ) + logging.error("Not using lane %s: no hgdb value" % lane) continue if lane["Index"] == "": @@ -225,10 +284,20 @@ def prepare_tracks(self): for track in self.tracks: hgdb = track["hgdb"] - - trackname_suffix = "L%s%s%s%sm%d" % (track["Lane"], track["Index"], track["SampleID"].lower(), track["strand"], self.mersize) - track["tagtrackname"] = mysql_clean("%stag%s" % (self.main_label, trackname_suffix)) - track["dentrackname"] = mysql_clean("%sden%s" % (self.main_label, trackname_suffix)) + + trackname_suffix = "L%s%s%s%sm%d" % ( + track["Lane"], + track["Index"], + track["SampleID"].lower(), + track["strand"], + self.mersize, + ) + track["tagtrackname"] = mysql_clean( + "%stag%s" % (self.main_label, trackname_suffix) + ) + track["dentrackname"] = mysql_clean( + "%sden%s" % (self.main_label, trackname_suffix) + ) logging.debug("tag track name: " + track["tagtrackname"]) logging.debug("den track name: " + track["dentrackname"]) @@ -236,33 +305,50 @@ def prepare_tracks(self): project = track["SampleProject"] if self.link_dir: - track["sampleDir"] = os.path.join("Project_%s" % project, - "Sample_%s" % track["SampleID"], - track["AlignDir"] if not poptions.pre_align_dir else "") + track["sampleDir"] = os.path.join( + "Project_%s" % project, + "Sample_%s" % track["SampleID"], + track["AlignDir"] if not poptions.pre_align_dir else "", + ) track["pathPrefix"] = "%s/%s" % (self.link_dir, track["sampleDir"]) else: - track["sampleDir"] = os.path.join(self.basedir, self.project_dir[project], - "Sample_%s" % track["SampleID"], - track["AlignDir"] if not poptions.pre_align_dir else "") + track["sampleDir"] = os.path.join( + self.basedir, + self.project_dir[project], + "Sample_%s" % track["SampleID"], + track["AlignDir"] if not poptions.pre_align_dir else "", + ) track["pathPrefix"] = track["sampleDir"] if track["aligner"] == "bwa": - track["wigfilename"] = "%s.75_20.%s.wig" % (track["SampleName"], hgdb) - track["bigwigfilename"] = "%s.75_20.%s.bw" % (track["SampleName"], hgdb) - track["bamfilename"] = "%s.uniques.sorted.bam" % (track["SampleName"]) + track["wigfilename"] = "%s.75_20.%s.wig" % (track["SampleName"], hgdb) + track["bigwigfilename"] = "%s.75_20.%s.bw" % (track["SampleName"], hgdb) + track["bamfilename"] = "%s.uniques.sorted.bam" % (track["SampleName"]) elif track["aligner"] == "tophat": - filename_prefix = "%s.%s.%s" % (track["SampleName"], track["strand"], hgdb) - track["wigfilename"] = "%s.wig" % filename_prefix # NYI - track["bigwigfilename"] = "%s.bw" % filename_prefix - track["bamfilename"] = "%s.bam" % filename_prefix + filename_prefix = "%s.%s.%s" % ( + track["SampleName"], + track["strand"], + hgdb, + ) + track["wigfilename"] = "%s.wig" % filename_prefix # NYI + track["bigwigfilename"] = "%s.bw" % filename_prefix + track["bamfilename"] = "%s.bam" % filename_prefix # TODO: Make the RNA pipeline aware of this # this is to deal with the mouse with human hg19 chr11 - if( hgdb == "hg19" and track["hgdb"] == "Mus_musculus" ): - track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % (track["SampleID"], track["Index"], track["Lane"]) - - if( hgdb == "hg19" and track["SampleRef"] == 
"Saccharomyces_cerevisiae" ): - track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % (track["SampleID"], track["Index"], track["Lane"]) + if hgdb == "hg19" and track["hgdb"] == "Mus_musculus": + track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % ( + track["SampleID"], + track["Index"], + track["Lane"], + ) + + if hgdb == "hg19" and track["SampleRef"] == "Saccharomyces_cerevisiae": + track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % ( + track["SampleID"], + track["Index"], + track["Lane"], + ) track["hasTags"] = False track["hasDensities"] = False @@ -270,9 +356,17 @@ def prepare_tracks(self): if "Extra" in track and track["Extra"] is not None: track["Extra"] = track["Extra"].strip() - if os.path.exists(os.path.join(track["sampleDir"], track["wigfilename"])) and not self.bigwig: + if ( + os.path.exists(os.path.join(track["sampleDir"], track["wigfilename"])) + and not self.bigwig + ): track["hasDensities"] = True - if os.path.exists(os.path.join(track["sampleDir"], track["bigwigfilename"])) and self.bigwig: + if ( + os.path.exists( + os.path.join(track["sampleDir"], track["bigwigfilename"]) + ) + and self.bigwig + ): track["hasDensities"] = True if os.path.exists(os.path.join(track["sampleDir"], track["bamfilename"])): track["hasTags"] = True @@ -280,14 +374,23 @@ def prepare_tracks(self): if not track["hasDensities"] or not track["hasTags"]: logging.error("%s does not have all files" % track["SampleID"]) if not track["hasDensities"]: - logging.error( "Missing densities" ) + logging.error("Missing densities") if self.bigwig: - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["bigwigfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["bigwigfilename"]) + ) else: - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["wigfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["wigfilename"]) + ) if not track["hasTags"]: logging.error("Missing tags") - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["bamfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["bamfilename"]) + ) logging.info("%s" % str(track)) if track["hasDensities"] or track["hasTags"]: @@ -297,10 +400,12 @@ def prepare_tracks(self): def create_htmls(self): self.html_files = {} masterhtmlloc = os.path.join(self.outdir, "description.html") - masterhtml = open(masterhtmlloc, 'w') + masterhtml = open(masterhtmlloc, "w") for hgdb, subtracks in self.subtrack_sets.items(): - self.html_files[hgdb] = os.path.join(self.outdir, hgdb, "%s.html" % self.main_label) - html = open( self.html_files[hgdb], 'w') + self.html_files[hgdb] = os.path.join( + self.outdir, hgdb, "%s.html" % self.main_label + ) + html = open(self.html_files[hgdb], "w") self.create_html(hgdb, html) self.create_html(hgdb, masterhtml) html.close() @@ -308,9 +413,23 @@ def create_htmls(self): # writes the genome HTML output to file (we don't really use these) def create_html(self, hgdb, file): - columns = ["Lane", "Index", "SampleID", "SampleRef", "CellType", "Assay", "Factors", "Extra", - "wellmapping", "wellmapping-no-mito", "SPOT"] - file.write("
Total number of lanes from this flowcell for genome %s: %d
\n" % (hgdb, len(self.subtrack_sets[hgdb]))) + columns = [ + "Lane", + "Index", + "SampleID", + "SampleRef", + "CellType", + "Assay", + "Factors", + "Extra", + "wellmapping", + "wellmapping-no-mito", + "SPOT", + ] + file.write( + "
Total number of lanes from this flowcell for genome %s: %d
\n" + % (hgdb, len(self.subtrack_sets[hgdb])) + ) file.write("\n") file.write("\n") file.write("\n") @@ -322,7 +441,7 @@ def create_html(self, hgdb, file): file.write("\n") file.write("\n") file.write("
\n") - + def create_ras(self): self.ra_files = {} @@ -336,16 +455,25 @@ def create_ra(self, hgdb): foldercheck(os.path.join(self.outdir, hgdb)) - self.ra_files[hgdb] = os.path.join(self.outdir, hgdb, "trackDb.%s.%s.txt" % (self.project, self.main_label)) - ra = open( self.ra_files[hgdb], 'w' ) + self.ra_files[hgdb] = os.path.join( + self.outdir, hgdb, "trackDb.%s.%s.txt" % (self.project, self.main_label) + ) + ra = open(self.ra_files[hgdb], "w") samples = set([subtrack["SampleID"] for subtrack in subtracks]) samples = dict() for subtrack in subtracks: if not subtrack["SampleID"] in subtrack: - samples[subtrack["SampleID"]] = "%s %s %s %s" % (subtrack["SampleID"], subtrack["CellType"], subtrack["Assay"], subtrack["Factors"]) - samples[subtrack["SampleID"]] = samples[subtrack["SampleID"]].strip().replace(" ", "_") + samples[subtrack["SampleID"]] = "%s %s %s %s" % ( + subtrack["SampleID"], + subtrack["CellType"], + subtrack["Assay"], + subtrack["Factors"], + ) + samples[subtrack["SampleID"]] = ( + samples[subtrack["SampleID"]].strip().replace(" ", "_") + ) ra.write("track %s\n" % self.main_label) ra.write("compositeTrack on\n") @@ -354,7 +482,12 @@ def create_ra(self, hgdb): ra.write("group %s\n" % self.project) ra.write("priority %s\n" % self.priority) ra.write("subGroup1 view Views TAG=Tags DEN=Density\n") - ra.write("subGroup2 sample Sample %s\n" % " ".join(sorted(['%s=%s' % (id, display) for id, display in samples.items()]))) + ra.write( + "subGroup2 sample Sample %s\n" + % " ".join( + sorted(["%s=%s" % (id, display) for id, display in samples.items()]) + ) + ) ra.write("dimensions dimensionX=view dimensionY=sample\n") ra.write("sortOrder view=+ sample=+\n") ra.write("dragAndDrop subTracks\n") @@ -369,26 +502,58 @@ def create_ra(self, hgdb): for subtrack in subtracks: if not "wellmapping-no-mito" in subtrack: - logging.warn("%s has no wellmapping-no-mito count" % subtrack["dentrackname"] ) + logging.warn( + "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] + ) subtrack["wellmapping-no-mito"] = "N/A" if not "wellmapping" in subtrack: - logging.warn("%s has no wellmapping count" % subtrack["dentrackname"] ) + logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" if not "SPOT" in subtrack: - logging.warn("%s has no SPOT score" % subtrack["dentrackname"] ) - subtrack["SPOT"] = "N/A"; + logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) + subtrack["SPOT"] = "N/A" for subtrack in subtracks: ra.write("\t\ttrack %s\n" % subtrack["tagtrackname"]) - ra.write("\t\tbigDataUrl %s%s/%s/%s\n" % (self.trackhubURL,self.label,subtrack["sampleDir"],subtrack["bamfilename"])) + ra.write( + "\t\tbigDataUrl %s%s/%s/%s\n" + % ( + self.trackhubURL, + self.label, + subtrack["sampleDir"], + subtrack["bamfilename"], + ) + ) ra.write("\t\tsubTrack %stag\n" % self.main_label) - ra.write("\t\tshortLabel %s %s:%s %s tags\n" % (subtrack["SampleID"], subtrack["Lane"], subtrack["Index"], subtrack["strand"])) + ra.write( + "\t\tshortLabel %s %s:%s %s tags\n" + % ( + subtrack["SampleID"], + subtrack["Lane"], + subtrack["Index"], + subtrack["strand"], + ) + ) ra.write("\t\tsubGroups view=TAG sample=%s\n" % subtrack["SampleID"]) ra.write("\t\tbamColorMode strand\n") - ra.write("\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" % ( - subtrack["CellType"], subtrack["SampleID"], self.flowcell_name, subtrack["Lane"], - subtrack["Index"], self.mersize, subtrack["Assay"], subtrack["Factors"], subtrack["Extra"], subtrack["strand"], 
subtrack["wellmapping"], - subtrack["wellmapping-no-mito"], subtrack["SPOT"])) + ra.write( + "\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" + % ( + subtrack["CellType"], + subtrack["SampleID"], + self.flowcell_name, + subtrack["Lane"], + subtrack["Index"], + self.mersize, + subtrack["Assay"], + subtrack["Factors"], + subtrack["Extra"], + subtrack["strand"], + subtrack["wellmapping"], + subtrack["wellmapping-no-mito"], + subtrack["SPOT"], + ) + ) if self.paired_end: ra.write("\t\tpairEndsByName .\n") ra.write("\t\ttype bam\n\n") @@ -406,20 +571,49 @@ def create_ra(self, hgdb): for subtrack in subtracks: ra.write("\t\ttrack %s\n" % subtrack["dentrackname"]) - ra.write("\t\tbigDataUrl %s%s/%s/%s\n" % (self.trackhubURL,self.label,subtrack["sampleDir"],subtrack["bigwigfilename"])) + ra.write( + "\t\tbigDataUrl %s%s/%s/%s\n" + % ( + self.trackhubURL, + self.label, + subtrack["sampleDir"], + subtrack["bigwigfilename"], + ) + ) ra.write("\t\tsubTrack %sden\n" % self.main_label) ra.write("\t\tsubGroups view=DEN sample=%s\n" % subtrack["SampleID"]) - ra.write("\t\tshortLabel %s %s:%s density\n" % (subtrack["SampleID"], subtrack["Lane"], subtrack["Index"],)) - ra.write("\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" % ( - subtrack["CellType"], subtrack["SampleID"], self.flowcell_name, subtrack["Lane"], - subtrack["Index"], self.mersize, subtrack["Assay"], subtrack["Factors"], subtrack["Extra"], subtrack["strand"], subtrack["wellmapping"], - subtrack["wellmapping-no-mito"], subtrack["SPOT"])) + ra.write( + "\t\tshortLabel %s %s:%s density\n" + % ( + subtrack["SampleID"], + subtrack["Lane"], + subtrack["Index"], + ) + ) + ra.write( + "\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" + % ( + subtrack["CellType"], + subtrack["SampleID"], + self.flowcell_name, + subtrack["Lane"], + subtrack["Index"], + self.mersize, + subtrack["Assay"], + subtrack["Factors"], + subtrack["Extra"], + subtrack["strand"], + subtrack["wellmapping"], + subtrack["wellmapping-no-mito"], + subtrack["SPOT"], + ) + ) ra.write("\t\tgroup %s\n" % self.project) if self.bigwig: ra.write("\t\ttype bigWig\n\n") else: ra.write("\t\ttype wig 1.00 10000\n\n") - + ra.close() def get_sample_dir(self, lane): @@ -427,6 +621,7 @@ def get_sample_dir(self, lane): project = lane["SampleProject"] return os.path.join(self.project_dir[project], "Sample_" + sample) + class LimsQuery(object): def __init__(self, api_url, api_token): self.api_url = api_url @@ -434,22 +629,24 @@ def __init__(self, api_url, api_token): self.cache = dict() self.cache[None] = None - self.count_types = set(['u-pf-n-mm2', 'u-pf-n-mm2-mito']) + self.count_types = set(["u-pf-n-mm2", "u-pf-n-mm2-mito"]) def get(self, query): - return self.get_by_url( "%s/%s" % ( self.api_url, query ) ) + return self.get_by_url("%s/%s" % (self.api_url, query)) def get_by_url(self, url): if not url in self.cache: - self.cache[url] = requests.get(url, headers={'Authorization': "Token %s" % self.api_token}).json() + self.cache[url] = requests.get( + url, headers={"Authorization": "Token %s" % self.api_token} + ).json() return self.cache[url] def get_all(self, query): data = self.get(query) - results = data['results'] - while data['next'] is not None: - data = self.get_by_url(data['next']) - results += data['results'] + results = data["results"] + while data["next"] is not None: + data = self.get_by_url(data["next"]) + results += data["results"] return results def get_counttype_by_codename(self, codename): @@ -458,24 +655,28 @@ 
def get_counttype_by_codename(self, codename): def get_counts_for_alignment(self, alignment): counts = dict() for type in self.count_types: - type_id = self.get_counttype_by_codename(type)['id'] - count_vals = self.get_all("flowcell_lane_count/?alignment=%s&count_type=%d" % (alignment, type_id)) + type_id = self.get_counttype_by_codename(type)["id"] + count_vals = self.get_all( + "flowcell_lane_count/?alignment=%s&count_type=%d" % (alignment, type_id) + ) if count_vals: - counts[type] = count_vals[0]['count'] + counts[type] = count_vals[0]["count"] # Check to see if we got all the types we wanted for count in self.count_types: if count not in counts: - logging.warn("Could not fetch count %s for alignment: %s" % (count, alignment)) - + logging.warn( + "Could not fetch count %s for alignment: %s" % (count, alignment) + ) + return counts def get_rna_metrics_for_alignment(self, alignment): results = self.get("rna_alignment_metrics/?alignment=%s" % alignment) - if not results['results']: + if not results["results"]: logging.warn("Could not fetch RNA metrics for alignment: %s" % alignment) return None - return results['results'][0] + return results["results"][0] def get_alignment(self, id): return self.get("flowcell_lane_alignment/%s/" % id) @@ -483,9 +684,10 @@ def get_alignment(self, id): def get_spot_for_alignment(self, alignment): # TODO: This assumes one spot per alignment. results = self.get("flowcell_lane_spot/?alignment=%s" % alignment) - if not results['results']: + if not results["results"]: return None - return results['results'][0] + return results["results"][0] + def get_alignment_data(library, alignment, lims): # This is mainly a shim. @@ -493,54 +695,61 @@ def get_alignment_data(library, alignment, lims): logging.debug("Fetching data for library: %s" % library) d = dict() - d['project'] = library['project'] - d['hgdb'] = alignment['genome_index'] - d['aligner'] = alignment['aligner'] - d['SampleName'] = alignment['sample_name'] - d['AlignDir'] = alignment['align_dir'] - d['Index'] = library['barcode_index'] - d['SampleID'] = library['samplesheet_name'] + d["project"] = library["project"] + d["hgdb"] = alignment["genome_index"] + d["aligner"] = alignment["aligner"] + d["SampleName"] = alignment["sample_name"] + d["AlignDir"] = alignment["align_dir"] + d["Index"] = library["barcode_index"] + d["SampleID"] = library["samplesheet_name"] # cell_type included for backwards compatibility with older processing files (before Feb 2016) - d['CellType'] = library.get('sample_taxonomy') or library.get('cell_type') - d['Assay'] = library['assay'] - d['Lane'] = library['lane'] - d['SampleProject'] = library['project'] + d["CellType"] = library.get("sample_taxonomy") or library.get("cell_type") + d["Assay"] = library["assay"] + d["Lane"] = library["lane"] + d["SampleProject"] = library["project"] - lims_lane = lims.get("flowcell_lane/%s" % library['id']) - lims_sample = lims.get_by_url( lims_lane['sample'] ) + lims_lane = lims.get("flowcell_lane/%s" % library["id"]) + lims_sample = lims.get_by_url(lims_lane["sample"]) - d['failed_lane'] = lims_lane['failed'] - if d['failed_lane']: - logging.warn("Lane marked as failed, not using: %s" % library['id']) + d["failed_lane"] = lims_lane["failed"] + if d["failed_lane"]: + logging.warn("Lane marked as failed, not using: %s" % library["id"]) return d - if d['aligner'] == 'bwa': - lims_counts = lims.get_counts_for_alignment(alignment['id']) - d['wellmapping'] = lims_counts.get('u-pf-n-mm2', None) - d['wellmapping-no-mito'] = 
lims_counts.get('u-pf-n-mm2-mito', None) + if d["aligner"] == "bwa": + lims_counts = lims.get_counts_for_alignment(alignment["id"]) + d["wellmapping"] = lims_counts.get("u-pf-n-mm2", None) + d["wellmapping-no-mito"] = lims_counts.get("u-pf-n-mm2-mito", None) # RNA doesn't have u-pf-no-mito counts, so we set those properties from the rna metrics - elif d['aligner'] == 'tophat': - r = lims.get_rna_metrics_for_alignment(alignment['id']) + elif d["aligner"] == "tophat": + r = lims.get_rna_metrics_for_alignment(alignment["id"]) if r is not None: # Subtract off ribosomal RNA - d['wellmapping'] = int(r['mapped_reads']) - d['wellmapping-no-mito'] = int(int(r['mapped_reads']) * (1 - (float(r['percent_chrM']) / 100.0))) + d["wellmapping"] = int(r["mapped_reads"]) + d["wellmapping-no-mito"] = int( + int(r["mapped_reads"]) * (1 - (float(r["percent_chrM"]) / 100.0)) + ) - d['Extra'] = lims_lane['extra'] - d['SampleRef'] = "" #NYI + d["Extra"] = lims_lane["extra"] + d["SampleRef"] = "" # NYI if lims_sample is not None: - d['Factors'] = ", ".join([ lims.get_by_url(factor)['display_name'] for factor in lims_sample['factors'] ]) + d["Factors"] = ", ".join( + [ + lims.get_by_url(factor)["display_name"] + for factor in lims_sample["factors"] + ] + ) else: - d['Factors'] = None + d["Factors"] = None - lims_spot = lims.get_spot_for_alignment(alignment['id']) - d['SPOT'] = lims_spot['spot_score'] if lims_spot else "N/A" + lims_spot = lims.get_spot_for_alignment(alignment["id"]) + d["SPOT"] = lims_spot["spot_score"] if lims_spot else "N/A" return d -def main(args = sys.argv): +def main(args=sys.argv): parser = parser_setup() global poptions poptions = parser.parse_args() @@ -553,49 +762,61 @@ def main(args = sys.argv): # Set up the logging levels logging.basicConfig(level=logging.INFO, format=log_format) - data = json.loads(open(poptions.process_config, 'r').read()) + data = json.loads(open(poptions.process_config, "r").read()) trackhubconfig = poptions.trackhub_config - projects = [ d['code_name'] for d in data['projects'] ] + projects = [d["code_name"] for d in data["projects"]] # get basedir - basedir = data['alignment_group']['directory'] - + basedir = data["alignment_group"]["directory"] + # for old flowcells where the LIMS hasn't been updated, update old file prefix to new prefix # (this might be fixed now?) 
- if re.search("monarch",basedir): + if re.search("monarch", basedir): fc_loc = "/net/seq/data/flowcells/" - fc_dirname = basedir.replace("/net/monarch/vol2/tag/stamlab/flowcells/","") - basedir = fc_loc+fc_dirname - + fc_dirname = basedir.replace("/net/monarch/vol2/tag/stamlab/flowcells/", "") + basedir = fc_loc + fc_dirname + # fetch paired endedness - paired_end = data['flowcell']['paired_end'] - label = data['alignment_group']['label'] - mersize = data['flowcell']['read_length'] - date = data['alignment_group']['label'].split('_')[1] + paired_end = data["flowcell"]["paired_end"] + label = data["alignment_group"]["label"] + mersize = data["flowcell"]["read_length"] + date = data["alignment_group"]["label"].split("_")[1] # get browsersheet information - lims = LimsQuery( poptions.api_url, poptions.api_token ) + lims = LimsQuery(poptions.api_url, poptions.api_token) load_groups = dict() - # find projects - for l in data['libraries']: - for a in l['alignments']: + # find projects + for l in data["libraries"]: + for a in l["alignments"]: align_data = get_alignment_data(l, a, lims) - if not align_data['failed_lane']: - p = align_data['project'] + if not align_data["failed_lane"]: + p = align_data["project"] if not p in load_groups: load_groups[p] = [] - load_groups[p].append( align_data ) - + load_groups[p].append(align_data) + for project in load_groups.keys(): lane_group = load_groups[project] logging.info("the basedirectory is: %s" % basedir) - outdir = os.path.join( basedir, "browser-load-%s" % project) - loader = MakeBrowserload(lane_group, trackhubconfig, basedir, outdir, mersize, poptions.priority, paired_end, project, label, date) + outdir = os.path.join(basedir, "browser-load-%s" % project) + loader = MakeBrowserload( + lane_group, + trackhubconfig, + basedir, + outdir, + mersize, + poptions.priority, + paired_end, + project, + label, + date, + ) loader.load() + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index 693fcef9..cc8591a3 100644 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -14,6 +14,7 @@ logging.getLogger("requests").setLevel(logging.WARNING) util_log = logging.getLogger("StamPy.util") + def foldercheck(*args): """Checks to see if the folders exist, creates them if they are not.""" for folder in args: @@ -23,9 +24,12 @@ def foldercheck(*args): util_log.info("Created folder: %s" % folder) except OSError as x: util_log.error("ERROR: Could not create directory: %s" % folder) - util_log.warn("Please make sure all nonexistant parent directories have been created.") + util_log.warn( + "Please make sure all nonexistant parent directories have been created." 
+ ) sys.exit(0) + options = { "quiet": False, "debug": False, @@ -33,10 +37,11 @@ def foldercheck(*args): "trackhub_config": None, "projectname": None, "priority": None, - "base_api_url": os.getenv('LIMS_API_URL'), - "token": os.getenv('LIMS_API_TOKEN'), + "base_api_url": os.getenv("LIMS_API_URL"), + "token": os.getenv("LIMS_API_TOKEN"), } + def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, @@ -44,58 +49,98 @@ def mysql_clean(input): output = re.sub("[^\w$]", "_", input.strip()) return output + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - parser.add_argument("-j", "--json-config", dest="process_config", - help="The process config to work off of.") - parser.add_argument("-c", "--trackhub-config", dest="trackhub_config", - help="The trackhub config to work off of.") - parser.add_argument("-p", "--priority", dest="priority", required=True, - help="The priority of this project") - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - parser.add_argument("-n", "--project-name", dest="projectname", - help="Name of project. Required.") - parser.set_defaults( **options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + parser.add_argument( + "-j", + "--json-config", + dest="process_config", + help="The process config to work off of.", + ) + parser.add_argument( + "-c", + "--trackhub-config", + dest="trackhub_config", + help="The trackhub config to work off of.", + ) + parser.add_argument( + "-p", + "--priority", + dest="priority", + required=True, + help="The priority of this project", + ) + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + parser.add_argument( + "-n", "--project-name", dest="projectname", help="Name of project. Required." 
+ ) + parser.set_defaults(**options) + parser.set_defaults(quiet=False, debug=False) return parser + ############ + class MakeBrowserLoad(object): genome_organisms = { - "hg19": "human", - "hg38": "human", - "rn5": "rat", - "mm9": "mouse", - "mm10": "mouse", - "TAIR9": "arabidopsis", - "sacCer2": "sacCer", - "sacCer3": "sacCer", - "ce4": "worm", - "cb3": "worm", - "K12": "e.coli", - "NC_000913.2": "e.coli", - "hera1": "butterfly", - "hmel1a": "butterfly", - "panu2a": "baboon", - "felCat5": "cat", - "borrBurg": "bacteria", - "danRer7": "zebrafish", + "hg19": "human", + "hg38": "human", + "rn5": "rat", + "mm9": "mouse", + "mm10": "mouse", + "TAIR9": "arabidopsis", + "sacCer2": "sacCer", + "sacCer3": "sacCer", + "ce4": "worm", + "cb3": "worm", + "K12": "e.coli", + "NC_000913.2": "e.coli", + "hera1": "butterfly", + "hmel1a": "butterfly", + "panu2a": "baboon", + "felCat5": "cat", + "borrBurg": "bacteria", + "danRer7": "zebrafish", } - rna_strands = [ "all", "pos", "neg" ] - - def __init__(self, group_data, trackhubconfig, priority, projectname, date, base_api_url, token): - - self.win=75 - self.binI=20 + rna_strands = ["all", "pos", "neg"] + + def __init__( + self, + group_data, + trackhubconfig, + priority, + projectname, + date, + base_api_url, + token, + ): + self.win = 75 + self.binI = 20 self.bigwig = True self.data = group_data @@ -109,13 +154,13 @@ def __init__(self, group_data, trackhubconfig, priority, projectname, date, base def load_config(self, trackhubconfig): import configparser + Config = configparser.ConfigParser() Config.read(trackhubconfig) - self.trackhubURL = Config.get('browser','trackhub_url') - self.aggregation_link_folder = Config.get('browser', 'aggregation_link_folder') + self.trackhubURL = Config.get("browser", "trackhub_url") + self.aggregation_link_folder = Config.get("browser", "aggregation_link_folder") def load(self): - # set up folder logging.info("Checking for trackhub folder: %s" % self.aggregation_link_folder) foldercheck(self.aggregation_link_folder) @@ -126,7 +171,7 @@ def load(self): self.prepare_tracks() self.prepare_genomes() - # write tracks and hub files + # write tracks and hub files self.create_ras() self.create_hubtxt() self.create_genometxt() @@ -134,10 +179,10 @@ def load(self): def create_hubtxt(self): hubfile = os.path.join(self.outdir, "hub.txt") logging.info("Creating hub.txt file: %s" % hubfile) - hub = open( hubfile, 'w') + hub = open(hubfile, "w") hub.write("hub %s\n" % self.projectname) hub.write("shortLabel %s\n" % self.projectname) - hub.write("longLabel %s, %s\n" % (self.projectname,self.date)) + hub.write("longLabel %s, %s\n" % (self.projectname, self.date)) hub.write("genomesFile genomes.txt\n") hub.write("email anishida@altius.org\n") hub.close() @@ -145,74 +190,99 @@ def create_hubtxt(self): def create_genometxt(self): genomefile = os.path.join(self.outdir, "genomes.txt") logging.info("Creating genome.txt file: %s" % genomefile) - genomes = open( genomefile, 'w') + genomes = open(genomefile, "w") for key in self.all_tracks: genomes.write("\ngenome %s\n" % key) - genomes.write("trackDb %s/trackDb.%s.txt\n" % (key,self.projectname)) + genomes.write("trackDb %s/trackDb.%s.txt\n" % (key, self.projectname)) genomes.close() def prepare_tracks(self): - self.all_tracks = {} for agg in self.data: - # skip aggregations that are not completed - if agg['needs_reprocessing'] == True: + if agg["needs_reprocessing"] == True: continue - if agg['processing_completed'] == None: + if agg["processing_completed"] == None: continue tracks = {} - 
logging.debug("Preparing tracks for AGG: %s" % agg['id']) - tracks['agg_id'] = agg['id'] - tracks['agg_ln'] = agg['library_name'] - tracks['agg_taxonomy'] = agg['taxonomy_name'] - tracks['agg_stat'] = "" - - if 'stats' in agg and agg['stats'] is not None: - if 'hotspot1-SPOT' in agg['stats']: - tracks['agg_stat'] = agg['stats']['hotspot1-SPOT'] - + logging.debug("Preparing tracks for AGG: %s" % agg["id"]) + tracks["agg_id"] = agg["id"] + tracks["agg_ln"] = agg["library_name"] + tracks["agg_taxonomy"] = agg["taxonomy_name"] + tracks["agg_stat"] = "" + + if "stats" in agg and agg["stats"] is not None: + if "hotspot1-SPOT" in agg["stats"]: + tracks["agg_stat"] = agg["stats"]["hotspot1-SPOT"] + # get genome name (not in json) - agg_genome_req = requests.get("%s/genome_index/%s" % (self.base_api_url,agg['genome_index_id']), - headers={'Authorization': "Token %s" % self.token}) + agg_genome_req = requests.get( + "%s/genome_index/%s" % (self.base_api_url, agg["genome_index_id"]), + headers={"Authorization": "Token %s" % self.token}, + ) if agg_genome_req.ok: agg_genome_result = agg_genome_req.json() # change sequins to normal - if agg_genome_result['label'] == "GRCh38_no_alts_sequins": - agg_genome_result['label'] = "GRCh38_no_alts" - tracks['agg_genome'] = agg_genome_result['label'] + if agg_genome_result["label"] == "GRCh38_no_alts_sequins": + agg_genome_result["label"] = "GRCh38_no_alts" + tracks["agg_genome"] = agg_genome_result["label"] # output expected is explicit for type of aggregation template used # dna - if agg['aggregation_process_template_id'] == 5 or agg['aggregation_process_template_id'] == 43 or agg['aggregation_process_template_id'] == 44: - - if 'normalized-density-bigwig-windowed' in agg['files'] and 'density-bigwig-windowed' in agg['files'] and 'all-alignments-bam' in agg['files'] and 'cutcounts-bw' in agg['files']: - tracks['dnase_normdens'] = agg['files']['normalized-density-bigwig-windowed'] - tracks['dnase_dens'] = agg['files']['density-bigwig-windowed'] - tracks['dnase_align'] = agg['files']['all-alignments-bam'] - tracks['dnase_cutconts'] = agg['files']['cutcounts-bw'] + if ( + agg["aggregation_process_template_id"] == 5 + or agg["aggregation_process_template_id"] == 43 + or agg["aggregation_process_template_id"] == 44 + ): + if ( + "normalized-density-bigwig-windowed" in agg["files"] + and "density-bigwig-windowed" in agg["files"] + and "all-alignments-bam" in agg["files"] + and "cutcounts-bw" in agg["files"] + ): + tracks["dnase_normdens"] = agg["files"][ + "normalized-density-bigwig-windowed" + ] + tracks["dnase_dens"] = agg["files"]["density-bigwig-windowed"] + tracks["dnase_align"] = agg["files"]["all-alignments-bam"] + tracks["dnase_cutconts"] = agg["files"]["cutcounts-bw"] else: - logging.info("Unable to locate AGG files for: %s" % (agg['id'])) + logging.info("Unable to locate AGG files for: %s" % (agg["id"])) # rna (processes are seperate for each genome) - elif agg['aggregation_process_template_id'] == 30 or agg['aggregation_process_template_id'] == 31 or agg['aggregation_process_template_id'] == 35 or agg['aggregation_process_template_id'] == 37: - if 'all-alignments-bam' in agg['files'] and 'neg-coverage-bigwig' in agg['files'] and 'pos-coverage-bigwig' in agg['files']: - tracks['rna_align'] = agg['files']['all-alignments-bam'] - tracks['rna_poscov'] = agg['files']['pos-coverage-bigwig'] - tracks['rna_negcov'] = agg['files']['neg-coverage-bigwig'] + elif ( + agg["aggregation_process_template_id"] == 30 + or agg["aggregation_process_template_id"] == 31 + or 
agg["aggregation_process_template_id"] == 35 + or agg["aggregation_process_template_id"] == 37 + ): + if ( + "all-alignments-bam" in agg["files"] + and "neg-coverage-bigwig" in agg["files"] + and "pos-coverage-bigwig" in agg["files"] + ): + tracks["rna_align"] = agg["files"]["all-alignments-bam"] + tracks["rna_poscov"] = agg["files"]["pos-coverage-bigwig"] + tracks["rna_negcov"] = agg["files"]["neg-coverage-bigwig"] else: - logging.info("Unable to locate AGG files for: %s" % (agg['id'])) + logging.info("Unable to locate AGG files for: %s" % (agg["id"])) # coverage across both strands still new, seperate from the rest for now - if 'all-coverage-bigwig' in agg['files']: - tracks['rna_bothcov'] = agg['files']['all-coverage-bigwig'] + if "all-coverage-bigwig" in agg["files"]: + tracks["rna_bothcov"] = agg["files"]["all-coverage-bigwig"] else: - logging.info("Unable to locate combined stranded AGG files for: %s" % (agg['id'])) + logging.info( + "Unable to locate combined stranded AGG files for: %s" + % (agg["id"]) + ) else: - logging.info("Unknown template type, %s, for %s" % (agg['aggregation_process_template_id'], agg['id'])) - if not tracks['agg_genome'] in self.all_tracks: - self.all_tracks[tracks['agg_genome']] = [] - self.all_tracks[tracks['agg_genome']].append(tracks) + logging.info( + "Unknown template type, %s, for %s" + % (agg["aggregation_process_template_id"], agg["id"]) + ) + if not tracks["agg_genome"] in self.all_tracks: + self.all_tracks[tracks["agg_genome"]] = [] + self.all_tracks[tracks["agg_genome"]].append(tracks) def prepare_genomes(self): # change genome names to match UCSC info @@ -228,19 +298,18 @@ def create_ras(self): for key in self.all_tracks: self.create_ra(key) - def create_ra(self,genome): - + def create_ra(self, genome): logging.info("Creating RA file for genome, %s" % genome) subtracks = self.all_tracks[genome] # collect unique path types per genome and samples - path_names={} + path_names = {} for agg in subtracks: for info_type in agg: - if re.match('dnase*',info_type) or re.match('rna*',info_type): - path_names[info_type] = 0 - all_samples = set(subtrack['agg_id'] for subtrack in subtracks) + if re.match("dnase*", info_type) or re.match("rna*", info_type): + path_names[info_type] = 0 + all_samples = set(subtrack["agg_id"] for subtrack in subtracks) # write output strings for views and paths view_string = "" @@ -255,10 +324,10 @@ def create_ra(self,genome): # create genome RA file ra_file = os.path.join(self.outdir, genome, "trackDb.%s.txt" % self.projectname) - ra = open(ra_file, 'w' ) + ra = open(ra_file, "w") # get sample list up front - all_samples = set(subtrack['agg_id'] for subtrack in subtracks) + all_samples = set(subtrack["agg_id"] for subtrack in subtracks) # write header ra.write("track %s\n" % self.projectname) @@ -277,9 +346,8 @@ def create_ra(self,genome): # for each path, for each agg for path in path_names: - logging.info("Writing %s to RA for %s", path, genome) - + # hardcoded display settings # change alignments to BAM files # change auto display settings for normalized densities and RNA pos/neg densities @@ -287,7 +355,12 @@ def create_ra(self,genome): visibility = "hide" if path == "dnase_align" or path == "rna_align": file_format = "bam" - if path == "dnase_normdens" or path == "rna_poscov" or path == "rna_negcov" or path == "rna_allcov": + if ( + path == "dnase_normdens" + or path == "rna_poscov" + or path == "rna_negcov" + or path == "rna_allcov" + ): visibility = "full\n\tviewLimits 0:5\n\tautoScale off\n\tmaxHeightPixels 100:32:16" # 
write path header @@ -300,13 +373,30 @@ def create_ra(self,genome): # write aggs for track in subtracks: if path in track: - friendly_path = re.sub('/net/seq/data/',self.trackhubURL,track[path]) - ra.write("\t\ttrack %s_%s_%s\n" % (self.projectname, path, track['agg_id'])) + friendly_path = re.sub( + "/net/seq/data/", self.trackhubURL, track[path] + ) + ra.write( + "\t\ttrack %s_%s_%s\n" + % (self.projectname, path, track["agg_id"]) + ) ra.write("\t\tbigDataUrl %s\n" % friendly_path) ra.write("\t\tsubTrack %s_%s\n" % (self.projectname, path)) - ra.write("\t\tshortLabel AG%s_%s_%s\n" % (track['agg_id'], self.projectname, path)) + ra.write( + "\t\tshortLabel AG%s_%s_%s\n" + % (track["agg_id"], self.projectname, path) + ) ra.write("\t\tsubGroups view=%s show=show\n" % path) - ra.write("\t\tlongLabel AG%s, %s, SPOT1 %s, %s, %s\n" % (track['agg_id'], track['agg_ln'], track['agg_stat'], track['agg_taxonomy'], path)) + ra.write( + "\t\tlongLabel AG%s, %s, SPOT1 %s, %s, %s\n" + % ( + track["agg_id"], + track["agg_ln"], + track["agg_stat"], + track["agg_taxonomy"], + path, + ) + ) ra.write("\t\tgroup %s\n" % self.projectname) if file_format == "bam": ra.write("\t\tpairEndsByName .\n") @@ -315,8 +405,7 @@ def create_ra(self,genome): ra.close() -def main(args = sys.argv): - +def main(args=sys.argv): parser = parser_setup() global poptions poptions = parser.parse_args() @@ -345,14 +434,23 @@ def main(args = sys.argv): logging.error("Could not find LIMS API TOKEN.\n") sys.exit(1) - data = json.loads(open(poptions.process_config, 'r').read()) - dataresults = data['results'] + data = json.loads(open(poptions.process_config, "r").read()) + dataresults = data["results"] dtime = datetime.datetime.now() - date = "%s-%s-%s" % (dtime.year,dtime.month,dtime.day) - - hubwriter = MakeBrowserLoad(dataresults, poptions.trackhub_config, poptions.priority, poptions.projectname, date, poptions.base_api_url, poptions.token) + date = "%s-%s-%s" % (dtime.year, dtime.month, dtime.day) + + hubwriter = MakeBrowserLoad( + dataresults, + poptions.trackhub_config, + poptions.priority, + poptions.projectname, + date, + poptions.base_api_url, + poptions.token, + ) hubwriter.load() + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index b63ab023..61594072 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -17,11 +17,13 @@ "debug": False, "process_config": "processing.json", "priority": None, - "api_url": os.getenv('LIMS_API_URL'), - "api_token": os.getenv('LIMS_API_TOKEN'), + "api_url": os.getenv("LIMS_API_URL"), + "api_token": os.getenv("LIMS_API_TOKEN"), } util_log = logging.getLogger("StamPy.util") + + def foldercheck(*args): """Checks to see if the folders exist, creates them if they are not.""" @@ -32,9 +34,12 @@ def foldercheck(*args): util_log.info("Created folder: %s" % folder) except OSError as x: util_log.error("ERROR: Could not create directory: %s" % folder) - util_log.warn("Please make sure all nonexistant parent directories have been created.") + util_log.warn( + "Please make sure all nonexistant parent directories have been created." 
+ ) sys.exit(0) + def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, @@ -42,58 +47,92 @@ def mysql_clean(input): output = re.sub("[^\w$]", "_", input.strip()) return output + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-j", "--json-config", dest="process_config", - help="The process config to work off of.") - parser.add_argument("-p", "--priority", dest="priority", required=True, - help="The priority of this flowcell") - parser.add_argument("--pre-align-dir", dest="pre_align_dir", action="store_true", - help="This flowcell was made before per-alignment directories") - - parser.set_defaults( **options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-j", + "--json-config", + dest="process_config", + help="The process config to work off of.", + ) + parser.add_argument( + "-p", + "--priority", + dest="priority", + required=True, + help="The priority of this flowcell", + ) + parser.add_argument( + "--pre-align-dir", + dest="pre_align_dir", + action="store_true", + help="This flowcell was made before per-alignment directories", + ) + + parser.set_defaults(**options) + parser.set_defaults(quiet=False, debug=False) return parser + class MakeBrowserload(object): genome_organisms = { - "hg19": "human", - "rn5": "rat", - "mm9": "mouse", - "TAIR9": "arabidopsis", - "sacCer2": "sacCer", - "sacCer3": "sacCer", - "ce4": "worm", - "cb3": "worm", - "K12": "e.coli", - "NC_000913.2": "e.coli", - "hera1": "butterfly", - "hmel1a": "butterfly", - "panu2a": "baboon", - "felCat5": "cat", - "borrBurg": "bacteria", - "danRer7": "zebrafish", + "hg19": "human", + "rn5": "rat", + "mm9": "mouse", + "TAIR9": "arabidopsis", + "sacCer2": "sacCer", + "sacCer3": "sacCer", + "ce4": "worm", + "cb3": "worm", + "K12": "e.coli", + "NC_000913.2": "e.coli", + "hera1": "butterfly", + "hmel1a": "butterfly", + "panu2a": "baboon", + "felCat5": "cat", + "borrBurg": "bacteria", + "danRer7": "zebrafish", } - rna_strands = [ "all", "pos", "neg" ] - - #def __init__(self, browserconfig, browsersheet, basedir, outdir, priority, paired_end, project, project_dir = "", - #maintrackname = None, bigwig = True, date = None): - def __init__(self, group_data, browserconfig, basedir, outdir, priority, paired_end, project, date): - + rna_strands = ["all", "pos", "neg"] + + # def __init__(self, browserconfig, browsersheet, basedir, outdir, priority, paired_end, project, project_dir = "", + # maintrackname = None, bigwig = True, date = None): + def __init__( + self, + group_data, + browserconfig, + basedir, + outdir, + priority, + paired_end, + project, + date, + ): self.basedir = basedir self.flowcell_date = date self.outdir = outdir self.mersize = 36 - self.win=75 - self.binI=20 + self.win = 75 + self.binI = 20 self.priority = priority self.paired_end = paired_end self.bigwig = True @@ -108,32 +147,39 @@ def __init__(self, group_data, browserconfig, basedir, outdir, priority, 
paired_ logging.info("Using project dir: %s" % self.project_dirs[project]) else: for project in self.projects: - self.project_dirs[project] = os.path.join(self.basedir, "Project_" + project) + self.project_dirs[project] = os.path.join( + self.basedir, "Project_" + project + ) self.load_config(browserconfig) def load_config(self, browserconfig): - import configparser - Config = configparser.ConfigParser() - Config.read(browserconfig) - self.server = Config.get("browser", "server") - self.browser_url = Config.get("browser", "browser_url") - self.flowcell_link_folder = Config.get("browser", "flowcell_link_folder") - self.track_basedir = Config.get("browser", "track_basedir") - self.browser_excludes_file = Config.get("browser", "browser_excludes_file") - self.group = Config.get('browser', 'browser_group') - self.file_label = Config.get('browser', 'file_label') + import configparser + + Config = configparser.ConfigParser() + Config.read(browserconfig) + self.server = Config.get("browser", "server") + self.browser_url = Config.get("browser", "browser_url") + self.flowcell_link_folder = Config.get("browser", "flowcell_link_folder") + self.track_basedir = Config.get("browser", "track_basedir") + self.browser_excludes_file = Config.get("browser", "browser_excludes_file") + self.group = Config.get("browser", "browser_group") + self.file_label = Config.get("browser", "file_label") def load(self): - #self.browsersheet = SampleSheet(file=self.browsersheet_file) + # self.browsersheet = SampleSheet(file=self.browsersheet_file) self.basedir_name = os.path.basename(self.basedir) foldercheck(self.outdir) - #if self.maintrackname: + # if self.maintrackname: if False: - self.main_label = "%s%son%s" % (self.file_label, self.maintrackname, self.date) - self.flowcell_name = self.maintrackname - self.flowcell_date = self.date + self.main_label = "%s%son%s" % ( + self.file_label, + self.maintrackname, + self.date, + ) + self.flowcell_name = self.maintrackname + self.flowcell_date = self.date else: match = re.search("(FC[A-Z0-9]+)_([0-9]{6})_tag", self.basedir) @@ -143,18 +189,27 @@ def load(self): self.flowcell_name = match.groups()[0] if not self.flowcell_date: - self.flowcell_date = match.groups()[1] + self.flowcell_date = match.groups()[1] logging.info("FLOWCELL DATE: %s" % self.flowcell_date) - self.main_label = "%s%son%s" % (self.file_label, self.flowcell_name, self.flowcell_date) + self.main_label = "%s%son%s" % ( + self.file_label, + self.flowcell_name, + self.flowcell_date, + ) logging.info("Main track name: %s" % self.main_label) self.excludes_file = os.path.join(self.outdir, "excludes.%s" % self.main_label) if self.flowcell_link_folder: - logging.debug("link folder: " + self.flowcell_link_folder + " base folder: " + self.basedir_name) + logging.debug( + "link folder: " + + self.flowcell_link_folder + + " base folder: " + + self.basedir_name + ) self.link_dir = os.path.join(self.flowcell_link_folder, self.basedir_name) else: self.link_dir = "" @@ -174,10 +229,9 @@ def prepare_tracks(self): self.tracks = [] for lane in self.data: - logging.debug("preparing tracks for lane: " + str(lane)) if not "hgdb" in lane: - logging.error("Not using lane %s: no hgdb value" % lane ) + logging.error("Not using lane %s: no hgdb value" % lane) continue if lane["Index"] == "": @@ -200,9 +254,19 @@ def prepare_tracks(self): for track in self.tracks: hgdb = track["hgdb"] - trackname_suffix = "L%s%s%s%sm%d" % (track["Lane"], track["Index"], track["SampleID"].lower(), track["strand"], self.mersize) - track["tagtrackname"] = 
mysql_clean("%stag%s" % (self.main_label, trackname_suffix)) - track["dentrackname"] = mysql_clean("%sden%s" % (self.main_label, trackname_suffix)) + trackname_suffix = "L%s%s%s%sm%d" % ( + track["Lane"], + track["Index"], + track["SampleID"].lower(), + track["strand"], + self.mersize, + ) + track["tagtrackname"] = mysql_clean( + "%stag%s" % (self.main_label, trackname_suffix) + ) + track["dentrackname"] = mysql_clean( + "%sden%s" % (self.main_label, trackname_suffix) + ) logging.debug("tag track name: " + track["tagtrackname"]) logging.debug("den track name: " + track["dentrackname"]) @@ -210,34 +274,50 @@ def prepare_tracks(self): project = track["SampleProject"] if self.link_dir: - track["sampleDir"] = os.path.join("Project_%s" % project, - "Sample_%s" % track["SampleID"], - track["AlignDir"] if not poptions.pre_align_dir else "") + track["sampleDir"] = os.path.join( + "Project_%s" % project, + "Sample_%s" % track["SampleID"], + track["AlignDir"] if not poptions.pre_align_dir else "", + ) track["pathPrefix"] = "%s/%s" % (self.link_dir, track["sampleDir"]) else: - track["sampleDir"] = os.path.join(self.basedir, self.project_dir[project], - "Sample_%s" % track["SampleID"], - track["AlignDir"] if not poptions.pre_align_dir else "") + track["sampleDir"] = os.path.join( + self.basedir, + self.project_dir[project], + "Sample_%s" % track["SampleID"], + track["AlignDir"] if not poptions.pre_align_dir else "", + ) track["pathPrefix"] = track["sampleDir"] if track["aligner"] == "bwa": - track["wigfilename"] = "%s.75_20.%s.wig" % (track["SampleName"], hgdb) - track["bigwigfilename"] = "%s.75_20.%s.bw" % (track["SampleName"], hgdb) - track["bamfilename"] = "%s.uniques.sorted.bam" % (track["SampleName"]) + track["wigfilename"] = "%s.75_20.%s.wig" % (track["SampleName"], hgdb) + track["bigwigfilename"] = "%s.75_20.%s.bw" % (track["SampleName"], hgdb) + track["bamfilename"] = "%s.uniques.sorted.bam" % (track["SampleName"]) elif track["aligner"] == "tophat": - filename_prefix = "%s.%s.%s" % (track["SampleName"], track["strand"], hgdb) - track["wigfilename"] = "%s.wig" % filename_prefix # NYI - track["bigwigfilename"] = "%s.bw" % filename_prefix - track["bamfilename"] = "%s.bam" % filename_prefix - + filename_prefix = "%s.%s.%s" % ( + track["SampleName"], + track["strand"], + hgdb, + ) + track["wigfilename"] = "%s.wig" % filename_prefix # NYI + track["bigwigfilename"] = "%s.bw" % filename_prefix + track["bamfilename"] = "%s.bam" % filename_prefix # TODO: Make the RNA pipeline aware of this # this is to deal with the mouse with human hg19 chr11 - if( hgdb == "hg19" and track["hgdb"] == "Mus_musculus" ): - track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % (track["SampleID"], track["Index"], track["Lane"]) - - if( hgdb == "hg19" and track["SampleRef"] == "Saccharomyces_cerevisiae" ): - track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % (track["SampleID"], track["Index"], track["Lane"]) + if hgdb == "hg19" and track["hgdb"] == "Mus_musculus": + track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % ( + track["SampleID"], + track["Index"], + track["Lane"], + ) + + if hgdb == "hg19" and track["SampleRef"] == "Saccharomyces_cerevisiae": + track["bamfilename"] = "%s_%s_L00%s.uniques.sorted.hg19.bam" % ( + track["SampleID"], + track["Index"], + track["Lane"], + ) track["hasTags"] = False track["hasDensities"] = False @@ -245,9 +325,17 @@ def prepare_tracks(self): if "Extra" in track and track["Extra"] is not None: track["Extra"] = track["Extra"].strip() - if 
os.path.exists(os.path.join(track["sampleDir"], track["wigfilename"])) and not self.bigwig: + if ( + os.path.exists(os.path.join(track["sampleDir"], track["wigfilename"])) + and not self.bigwig + ): track["hasDensities"] = True - if os.path.exists(os.path.join(track["sampleDir"], track["bigwigfilename"])) and self.bigwig: + if ( + os.path.exists( + os.path.join(track["sampleDir"], track["bigwigfilename"]) + ) + and self.bigwig + ): track["hasDensities"] = True if os.path.exists(os.path.join(track["sampleDir"], track["bamfilename"])): track["hasTags"] = True @@ -255,14 +343,23 @@ def prepare_tracks(self): if not track["hasDensities"] or not track["hasTags"]: logging.error("%s does not have all files" % track["SampleID"]) if not track["hasDensities"]: - logging.error( "Missing densities" ) + logging.error("Missing densities") if self.bigwig: - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["bigwigfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["bigwigfilename"]) + ) else: - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["wigfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["wigfilename"]) + ) if not track["hasTags"]: logging.error("Missing tags") - logging.error("Wanted: " + os.path.join(track["sampleDir"], track["bamfilename"])) + logging.error( + "Wanted: " + + os.path.join(track["sampleDir"], track["bamfilename"]) + ) logging.info("%s" % str(track)) if track["hasDensities"] or track["hasTags"]: @@ -275,14 +372,30 @@ def create_htmls(self): self.create_html(hgdb) def create_html(self, hgdb): - self.html_files[hgdb] = os.path.join(self.outdir, hgdb, "%s.html" % self.main_label) - - html = open( self.html_files[hgdb], 'w') - - columns = ["Lane", "Index", "SampleID", "SampleRef", "CellType", "Assay", "Factors", "Extra", - "wellmapping", "wellmapping-no-mito", "SPOT"] - - html.write("
Total number of lanes from this flowcell for this genome: %d
\n" % len(self.subtrack_sets[hgdb])) + self.html_files[hgdb] = os.path.join( + self.outdir, hgdb, "%s.html" % self.main_label + ) + + html = open(self.html_files[hgdb], "w") + + columns = [ + "Lane", + "Index", + "SampleID", + "SampleRef", + "CellType", + "Assay", + "Factors", + "Extra", + "wellmapping", + "wellmapping-no-mito", + "SPOT", + ] + + html.write( + "
Total number of lanes from this flowcell for this genome: %d
\n" + % len(self.subtrack_sets[hgdb]) + ) html.write("\n") html.write("\n") @@ -309,26 +422,30 @@ def create_ras(self): def create_commands(self): makefile = os.path.join(self.outdir, "make.%s.doc" % self.main_label) logging.info("Makefile: %s" % makefile) - commands = open( makefile, 'w') + commands = open(makefile, "w") commands.write("# %s\n" % makefile) commands.write("# %s\n\n" % ", ".join(self.subtrack_sets.keys())) if self.link_dir: - commands.write(""" + commands.write( + """ if [ ! -e %(link_dir)s ]; then ln -s %(base_dir)s %(link_dir)s else echo %(link_dir)s already exists fi -\n""" % {"base_dir": self.basedir, "link_dir": self.link_dir}) +\n""" + % {"base_dir": self.basedir, "link_dir": self.link_dir} + ) for hgdb, subtracks in self.subtrack_sets.items(): - self.create_genome_commands( hgdb, commands) + self.create_genome_commands(hgdb, commands) # commands.write("\ncat %s >> %s\n" % (self.excludes_file, self.browser_excludes_file)) - commands.write(""" + commands.write( + """ for EXCLUDE_FILE in `cat %(excludes_file)s`; do if ! grep -q "$EXCLUDE_FILE" %(browser_excludes_file)s; then echo "$EXCLUDE_FILE" >> %(browser_excludes_file)s @@ -336,33 +453,60 @@ def create_commands(self): echo "$EXCLUDE_FILE already exists in %(browser_excludes_file)s" fi done -""" % {"excludes_file": self.excludes_file, "browser_excludes_file": self.browser_excludes_file}) +""" + % { + "excludes_file": self.excludes_file, + "browser_excludes_file": self.browser_excludes_file, + } + ) commands.close() def create_subtrack_commands(self, subtrack, commandsout): if subtrack["hasDensities"] and not self.bigwig: - commandsout.write("hgLoadWiggle -pathPrefix=%s %s %s %s/%s\n" % ( - subtrack["pathPrefix"], subtrack["hgdb"], subtrack["dentrackname"], subtrack["pathPrefix"], subtrack["wigfilename"])) + commandsout.write( + "hgLoadWiggle -pathPrefix=%s %s %s %s/%s\n" + % ( + subtrack["pathPrefix"], + subtrack["hgdb"], + subtrack["dentrackname"], + subtrack["pathPrefix"], + subtrack["wigfilename"], + ) + ) # hgLoadWiggle -pathPrefix=/usr/local/UW/flowcell-density/FCB0BLA_110620_tag/005 hg19 STAM_FCB0BLA_110620_IT_DEN_L005_6_DS18466_36_DNaseI /usr/local/UW/flowcell-density/FCB0BLA_110620_tag/005/FCB0BLA_lane6_75_20.wig if subtrack["hasDensities"] and self.bigwig: - commandsout.write("hgBbiDbLink %s %s %s/%s\n" % (subtrack["hgdb"], subtrack["dentrackname"], subtrack["pathPrefix"], subtrack["bigwigfilename"])) + commandsout.write( + "hgBbiDbLink %s %s %s/%s\n" + % ( + subtrack["hgdb"], + subtrack["dentrackname"], + subtrack["pathPrefix"], + subtrack["bigwigfilename"], + ) + ) if subtrack["hasTags"]: hgsqlcommand = "hgsql %s -e '" % subtrack["hgdb"] hgsqlcommand += "drop table if exists %s; " % subtrack["tagtrackname"] - hgsqlcommand += "create table %s (filename varchar(255) not null); " % subtrack["tagtrackname"] + hgsqlcommand += ( + "create table %s (filename varchar(255) not null); " + % subtrack["tagtrackname"] + ) hgsqlcommand += "insert into %s values " % subtrack["tagtrackname"] - hgsqlcommand += "(\"%s/%s\");'\n" % (subtrack["pathPrefix"], subtrack["bamfilename"]) + hgsqlcommand += '("%s/%s");\'\n' % ( + subtrack["pathPrefix"], + subtrack["bamfilename"], + ) commandsout.write(hgsqlcommand) -#ln -s $datafile bam-links/Rudensky/Rudensky_bams/$data.bam -#ln -s $indexfile bam-links/Rudensky/Rudensky_bams/$data.bam.bai -#hgsql $forg -e 'drop table if exists $trackType; create table -#$trackType (fileName varchar(255) not null); insert into $trackType -#values 
(\"/usr/local/UW/bam-links/Rudensky/Rudensky_bams/$data.bam\");'" + # ln -s $datafile bam-links/Rudensky/Rudensky_bams/$data.bam + # ln -s $indexfile bam-links/Rudensky/Rudensky_bams/$data.bam.bai + # hgsql $forg -e 'drop table if exists $trackType; create table + # $trackType (fileName varchar(255) not null); insert into $trackType + # values (\"/usr/local/UW/bam-links/Rudensky/Rudensky_bams/$data.bam\");'" def create_genome_commands(self, hgdb, commandsout): if not hgdb in self.genome_organisms: @@ -376,11 +520,23 @@ def create_genome_commands(self, hgdb, commandsout): for subtrack in self.subtrack_sets[hgdb]: self.create_subtrack_commands(subtrack, commandsout) - commandsout.write("\ncp %s %s/%s/%s\n" % (self.html_files[hgdb], self.track_basedir, organism, hgdb)) - commandsout.write("cp %s %s/%s/%s\n\n" % (self.ra_files[hgdb], self.track_basedir, organism, hgdb)) + commandsout.write( + "\ncp %s %s/%s/%s\n" + % (self.html_files[hgdb], self.track_basedir, organism, hgdb) + ) + commandsout.write( + "cp %s %s/%s/%s\n\n" + % (self.ra_files[hgdb], self.track_basedir, organism, hgdb) + ) include_name = "trackDb.%s.%s.ra" % (self.file_label, self.main_label) - include_file = "%s/%s/%s/trackDb.%s.ra" % (self.track_basedir, organism, hgdb, self.file_label) - commandsout.write(""" + include_file = "%s/%s/%s/trackDb.%s.ra" % ( + self.track_basedir, + organism, + hgdb, + self.file_label, + ) + commandsout.write( + """ if ! grep -q "include %(include_name)s" %(include_file)s then echo "Adding %(include_name)s to %(include_file)s" @@ -388,15 +544,17 @@ def create_genome_commands(self, hgdb, commandsout): else echo "%(include_name)s already in %(include_file)s" fi -""" % { "include_name": include_name, "include_file": include_file }) +""" + % {"include_name": include_name, "include_file": include_file} + ) # commandsout.write('# add line "include trackDb.%s.%s.ra" to %s/%s/%s/trackDb.%s.ra\n\n' % (self.file_label, self.main_label, self.track_basedir, organism, hgdb, self.file_label)) def create_excludes(self): - excludes = open( self.excludes_file, 'w') + excludes = open(self.excludes_file, "w") for subtrack in self.tracks: for suffix in ["frm", "MYD", "MYI"]: - logging.debug( "subtrack contents: " + str(subtrack)) + logging.debug("subtrack contents: " + str(subtrack)) excludes.write("%s.%s\n" % (subtrack["tagtrackname"], suffix)) excludes.write("%s.%s\n" % (subtrack["dentrackname"], suffix)) @@ -408,16 +566,25 @@ def create_ra(self, hgdb): foldercheck(os.path.join(self.outdir, hgdb)) - self.ra_files[hgdb] = os.path.join(self.outdir, hgdb, "trackDb.%s.%s.ra" % (self.file_label, self.main_label)) - ra = open( self.ra_files[hgdb], 'w' ) + self.ra_files[hgdb] = os.path.join( + self.outdir, hgdb, "trackDb.%s.%s.ra" % (self.file_label, self.main_label) + ) + ra = open(self.ra_files[hgdb], "w") samples = set([subtrack["SampleID"] for subtrack in subtracks]) samples = dict() for subtrack in subtracks: if not subtrack["SampleID"] in subtrack: - samples[subtrack["SampleID"]] = "%s %s %s %s" % (subtrack["SampleID"], subtrack["CellType"], subtrack["Assay"], subtrack["Factors"]) - samples[subtrack["SampleID"]] = samples[subtrack["SampleID"]].strip().replace(" ", "_") + samples[subtrack["SampleID"]] = "%s %s %s %s" % ( + subtrack["SampleID"], + subtrack["CellType"], + subtrack["Assay"], + subtrack["Factors"], + ) + samples[subtrack["SampleID"]] = ( + samples[subtrack["SampleID"]].strip().replace(" ", "_") + ) ra.write("track %s\n" % self.main_label) ra.write("compositeTrack on\n") @@ -426,7 +593,12 @@ def 
create_ra(self, hgdb): ra.write("group %s\n" % self.group) ra.write("priority %s\n" % self.priority) ra.write("subGroup1 view Views TAG=Tags DEN=Density\n") - ra.write("subGroup2 sample Sample %s\n" % " ".join(sorted(['%s=%s' % (id, display) for id, display in samples.items()]))) + ra.write( + "subGroup2 sample Sample %s\n" + % " ".join( + sorted(["%s=%s" % (id, display) for id, display in samples.items()]) + ) + ) ra.write("dimensions dimensionX=view dimensionY=sample\n") ra.write("sortOrder view=+ sample=+\n") ra.write("dragAndDrop subTracks\n") @@ -441,16 +613,18 @@ def create_ra(self, hgdb): for subtrack in subtracks: if not "wellmapping-no-mito" in subtrack: - logging.warn("%s has no wellmapping-no-mito count" % subtrack["dentrackname"] ) + logging.warn( + "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] + ) subtrack["wellmapping-no-mito"] = "N/A" if not "wellmapping" in subtrack: - logging.warn("%s has no wellmapping count" % subtrack["dentrackname"] ) + logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" if not "SPOT" in subtrack: - logging.warn("%s has no SPOT score" % subtrack["dentrackname"] ) - subtrack["SPOT"] = "N/A"; + logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) + subtrack["SPOT"] = "N/A" - #track STAM_FC630D3_110711_IT_TAG_L5_DS18900_36_ + # track STAM_FC630D3_110711_IT_TAG_L5_DS18900_36_ # subTrack STAM_FC630D3_110711_IT_TAG # subGroups view=TAG # shortLabel DS18900 5 tags @@ -461,23 +635,45 @@ def create_ra(self, hgdb): for subtrack in subtracks: ra.write("\t\ttrack %s\n" % subtrack["tagtrackname"]) ra.write("\t\tsubTrack %stag\n" % self.main_label) - ra.write("\t\tshortLabel %s %s:%s %s tags\n" % (subtrack["SampleID"], subtrack["Lane"], subtrack["Index"], subtrack["strand"])) + ra.write( + "\t\tshortLabel %s %s:%s %s tags\n" + % ( + subtrack["SampleID"], + subtrack["Lane"], + subtrack["Index"], + subtrack["strand"], + ) + ) ra.write("\t\tsubGroups view=TAG sample=%s\n" % subtrack["SampleID"]) ra.write("\t\tbamColorMode strand\n") - ra.write("\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" % ( - subtrack["CellType"], subtrack["SampleID"], self.flowcell_name, subtrack["Lane"], - subtrack["Index"], self.mersize, subtrack["Assay"], subtrack["Factors"], subtrack["Extra"], subtrack["strand"], subtrack["wellmapping"], - subtrack["wellmapping-no-mito"], subtrack["SPOT"])) + ra.write( + "\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" + % ( + subtrack["CellType"], + subtrack["SampleID"], + self.flowcell_name, + subtrack["Lane"], + subtrack["Index"], + self.mersize, + subtrack["Assay"], + subtrack["Factors"], + subtrack["Extra"], + subtrack["strand"], + subtrack["wellmapping"], + subtrack["wellmapping-no-mito"], + subtrack["SPOT"], + ) + ) if self.paired_end: ra.write("\t\tpairEndsByName .\n") ra.write("\t\ttype bam\n\n") logging.info("DEN SUBTRACK GROUP") - ra.write( "\ttrack %sden\n" % self.main_label) - ra.write( "\tsubTrack %s\n" % self.main_label) -# track STAM_FC630D3_110711_IT_DEN - # subTrack STAM_FC630D3_110711_IT + ra.write("\ttrack %sden\n" % self.main_label) + ra.write("\tsubTrack %s\n" % self.main_label) + # track STAM_FC630D3_110711_IT_DEN + # subTrack STAM_FC630D3_110711_IT ra.write("\tview DEN\n") ra.write("\tshortLabel Density\n") @@ -498,11 +694,32 @@ def create_ra(self, hgdb): ra.write("\t\ttrack %s\n" % subtrack["dentrackname"]) ra.write("\t\tsubTrack %sden\n" % self.main_label) ra.write("\t\tsubGroups view=DEN sample=%s\n" % 
subtrack["SampleID"]) - ra.write("\t\tshortLabel %s %s:%s density\n" % (subtrack["SampleID"], subtrack["Lane"], subtrack["Index"],)) - ra.write("\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" % ( - subtrack["CellType"], subtrack["SampleID"], self.flowcell_name, subtrack["Lane"], - subtrack["Index"], self.mersize, subtrack["Assay"], subtrack["Factors"], subtrack["Extra"], subtrack["strand"], subtrack["wellmapping"], - subtrack["wellmapping-no-mito"], subtrack["SPOT"])) + ra.write( + "\t\tshortLabel %s %s:%s density\n" + % ( + subtrack["SampleID"], + subtrack["Lane"], + subtrack["Index"], + ) + ) + ra.write( + "\t\tlongLabel %s %s %s %s:%s %dm %s %s %s %s tags: %s (%s), spot: %s\n" + % ( + subtrack["CellType"], + subtrack["SampleID"], + self.flowcell_name, + subtrack["Lane"], + subtrack["Index"], + self.mersize, + subtrack["Assay"], + subtrack["Factors"], + subtrack["Extra"], + subtrack["strand"], + subtrack["wellmapping"], + subtrack["wellmapping-no-mito"], + subtrack["SPOT"], + ) + ) ra.write("\t\tgroup %s\n" % self.group) if self.bigwig: ra.write("\t\ttype bigWig\n\n") @@ -525,23 +742,25 @@ def __init__(self, api_url, api_token): self.cache = dict() self.cache[None] = None - self.count_types = set(['u-pf-n-mm2', 'u-pf-n-mm2-mito']) + self.count_types = set(["u-pf-n-mm2", "u-pf-n-mm2-mito"]) def get(self, query): - return self.get_by_url( "%s/%s" % ( self.api_url, query ) ) + return self.get_by_url("%s/%s" % (self.api_url, query)) def get_by_url(self, url): if not url in self.cache: - #print url - self.cache[url] = requests.get(url, headers={'Authorization': "Token %s" % self.api_token}).json() + # print url + self.cache[url] = requests.get( + url, headers={"Authorization": "Token %s" % self.api_token} + ).json() return self.cache[url] def get_all(self, query): data = self.get(query) - results = data['results'] - while data['next'] is not None: - data = self.get_by_url(data['next']) - results += data['results'] + results = data["results"] + while data["next"] is not None: + data = self.get_by_url(data["next"]) + results += data["results"] return results def get_counttype_by_codename(self, codename): @@ -550,24 +769,28 @@ def get_counttype_by_codename(self, codename): def get_counts_for_alignment(self, alignment): counts = dict() for type in self.count_types: - type_id = self.get_counttype_by_codename(type)['id'] - count_vals = self.get_all("flowcell_lane_count/?alignment=%s&count_type=%d" % (alignment, type_id)) + type_id = self.get_counttype_by_codename(type)["id"] + count_vals = self.get_all( + "flowcell_lane_count/?alignment=%s&count_type=%d" % (alignment, type_id) + ) if count_vals: - counts[type] = count_vals[0]['count'] + counts[type] = count_vals[0]["count"] # Check to see if we got all the types we wanted for count in self.count_types: if count not in counts: - logging.warn("Could not fetch count %s for alignment: %s" % (count, alignment)) + logging.warn( + "Could not fetch count %s for alignment: %s" % (count, alignment) + ) return counts def get_rna_metrics_for_alignment(self, alignment): results = self.get("rna_alignment_metrics/?alignment=%s" % alignment) - if not results['results']: + if not results["results"]: logging.warn("Could not fetch RNA metrics for alignment: %s" % alignment) return None - return results['results'][0] + return results["results"][0] def get_alignment(self, id): return self.get("flowcell_lane_alignment/%s/" % id) @@ -575,9 +798,9 @@ def get_alignment(self, id): def get_spot_for_alignment(self, alignment): # TODO: This assumes one 
spot per alignment. results = self.get("flowcell_lane_spot/?alignment=%s" % alignment) - if not results['results']: + if not results["results"]: return None - return results['results'][0] + return results["results"][0] def get_alignment_data(library, alignment, lims): @@ -586,56 +809,63 @@ def get_alignment_data(library, alignment, lims): logging.debug("Fetching data for library: %s" % library) d = dict() - d['project'] = library['project'] - d['hgdb'] = alignment['genome_index'] - d['aligner'] = alignment['aligner'] - d['SampleName'] = alignment['sample_name'] - d['AlignDir'] = alignment['align_dir'] - d['Index'] = library['barcode_index'] - d['SampleID'] = library['samplesheet_name'] + d["project"] = library["project"] + d["hgdb"] = alignment["genome_index"] + d["aligner"] = alignment["aligner"] + d["SampleName"] = alignment["sample_name"] + d["AlignDir"] = alignment["align_dir"] + d["Index"] = library["barcode_index"] + d["SampleID"] = library["samplesheet_name"] # cell_type included for backwards compatibility with older processing files (before Feb 2016) - d['CellType'] = library.get('sample_taxonomy') or library.get('cell_type') - d['Assay'] = library['assay'] - d['Lane'] = library['lane'] - d['SampleProject'] = library['project'] - - lims_lane = lims.get("flowcell_lane/%s" % library['id']) - lims_sample = lims.get_by_url( lims_lane['sample'] ) - #lims_library = lims.get_by_url( lims_lane['library'] ) - - d['failed_lane'] = lims_lane['failed'] - if d['failed_lane']: - logging.warn("Lane marked as failed, not using: %s" % library['id']) + d["CellType"] = library.get("sample_taxonomy") or library.get("cell_type") + d["Assay"] = library["assay"] + d["Lane"] = library["lane"] + d["SampleProject"] = library["project"] + + lims_lane = lims.get("flowcell_lane/%s" % library["id"]) + lims_sample = lims.get_by_url(lims_lane["sample"]) + # lims_library = lims.get_by_url( lims_lane['library'] ) + + d["failed_lane"] = lims_lane["failed"] + if d["failed_lane"]: + logging.warn("Lane marked as failed, not using: %s" % library["id"]) return d - if d['aligner'] == 'bwa': - lims_counts = lims.get_counts_for_alignment(alignment['id']) - d['wellmapping'] = lims_counts.get('u-pf-n-mm2', None) - d['wellmapping-no-mito'] = lims_counts.get('u-pf-n-mm2-mito', None) + if d["aligner"] == "bwa": + lims_counts = lims.get_counts_for_alignment(alignment["id"]) + d["wellmapping"] = lims_counts.get("u-pf-n-mm2", None) + d["wellmapping-no-mito"] = lims_counts.get("u-pf-n-mm2-mito", None) # RNA doesn't have u-pf-no-mito counts # So we set those properties from the rna metrics - elif d['aligner'] == 'tophat': - r = lims.get_rna_metrics_for_alignment(alignment['id']) + elif d["aligner"] == "tophat": + r = lims.get_rna_metrics_for_alignment(alignment["id"]) if r is not None: # Subtract off ribosomal RNA - d['wellmapping'] = int(r['mapped_reads']) - d['wellmapping-no-mito'] = int(int(r['mapped_reads']) * (1 - (float(r['percent_chrM']) / 100.0))) + d["wellmapping"] = int(r["mapped_reads"]) + d["wellmapping-no-mito"] = int( + int(r["mapped_reads"]) * (1 - (float(r["percent_chrM"]) / 100.0)) + ) - d['Extra'] = lims_lane['extra'] - d['SampleRef'] = "" #NYI + d["Extra"] = lims_lane["extra"] + d["SampleRef"] = "" # NYI if lims_sample is not None: - d['Factors'] = ", ".join([ lims.get_by_url(factor)['display_name'] for factor in lims_sample['factors'] ]) + d["Factors"] = ", ".join( + [ + lims.get_by_url(factor)["display_name"] + for factor in lims_sample["factors"] + ] + ) else: - d['Factors'] = None + d["Factors"] = None - - 
lims_spot = lims.get_spot_for_alignment(alignment['id']) - d['SPOT'] = lims_spot['spot_score'] if lims_spot else "N/A" + lims_spot = lims.get_spot_for_alignment(alignment["id"]) + d["SPOT"] = lims_spot["spot_score"] if lims_spot else "N/A" return d -def main(args = sys.argv): + +def main(args=sys.argv): parser = parser_setup() global poptions poptions = parser.parse_args() @@ -648,53 +878,67 @@ def main(args = sys.argv): # Set up the logging levels logging.basicConfig(level=logging.INFO, format=log_format) - data = json.loads(open(poptions.process_config, 'r').read()) + data = json.loads(open(poptions.process_config, "r").read()) - projects = [ d['code_name'] for d in data['projects'] ] + projects = [d["code_name"] for d in data["projects"]] # get basedir - basedir = data['alignment_group']['directory'] + basedir = data["alignment_group"]["directory"] # Fetch paired endedness? - paired_end = data['flowcell']['paired_end'] - date = data['alignment_group']['label'].split('_')[1] + paired_end = data["flowcell"]["paired_end"] + date = data["alignment_group"]["label"].split("_")[1] # get browsersheet information! - lims = LimsQuery( poptions.api_url, poptions.api_token ) + lims = LimsQuery(poptions.api_url, poptions.api_token) browsers = set() load_groups = dict() - for l in data['libraries']: - for a in l['alignments']: - if a['browsers']: # Only process alignments that map to a browser + for l in data["libraries"]: + for a in l["alignments"]: + if a["browsers"]: # Only process alignments that map to a browser align_data = get_alignment_data(l, a, lims) - if not align_data['failed_lane']: - for b in a['browsers']: + if not align_data["failed_lane"]: + for b in a["browsers"]: browsers.add(b) - key = ( align_data['project'], b ) + key = (align_data["project"], b) if not key in load_groups: load_groups[key] = [] - load_groups[key].append( align_data ) - + load_groups[key].append(align_data) for group_key in load_groups.keys(): (project, browser) = group_key lane_group = load_groups[group_key] - browserconfig = os.path.join( os.getenv("STAMPIPES"), "config", "ucsc_browser", "%s-%s.config" % (browser, project) ) + browserconfig = os.path.join( + os.getenv("STAMPIPES"), + "config", + "ucsc_browser", + "%s-%s.config" % (browser, project), + ) if not os.path.isfile(browserconfig): - logging.error("No configuration file '%s' exists, don't know how to load project %s into browser %s" - % (browserconfig, project, browser )) + logging.error( + "No configuration file '%s' exists, don't know how to load project %s into browser %s" + % (browserconfig, project, browser) + ) sys.exit(1) - logging.info("Reading browser configuration from %s" % browserconfig) - outdir = os.path.join( basedir, "browser-load-%s-%s" % (project, browser)) - - loader = MakeBrowserload(lane_group, browserconfig, basedir, outdir, poptions.priority, paired_end, project, date) + outdir = os.path.join(basedir, "browser-load-%s-%s" % (project, browser)) + + loader = MakeBrowserload( + lane_group, + browserconfig, + basedir, + outdir, + poptions.priority, + paired_end, + project, + date, + ) loader.load() diff --git a/scripts/browser/parse_all_projects.py b/scripts/browser/parse_all_projects.py index 0479faa0..d7cf1378 100644 --- a/scripts/browser/parse_all_projects.py +++ b/scripts/browser/parse_all_projects.py @@ -21,28 +21,37 @@ "outfile": "project_list.txt", } + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - 
parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - parser.add_argument("-o", "--outfile", dest="outfile", - help="The outfile to save to.") - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + parser.add_argument( + "-o", "--outfile", dest="outfile", help="The outfile to save to." + ) + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -def get_projects(api_url, token, outfile): - info = requests.get("%s/project/?page_size=1000" % (api_url), - headers={'Authorization': "Token %s" % token}) +def get_projects(api_url, token, outfile): + info = requests.get( + "%s/project/?page_size=1000" % (api_url), + headers={"Authorization": "Token %s" % token}, + ) if info.ok: result = info.json() - out = open(outfile, 'w') - for proj in result['results']: - outstring = "%s\t%s\n" % (proj['id'], proj['slug']) + out = open(outfile, "w") + for proj in result["results"]: + outstring = "%s\t%s\n" % (proj["id"], proj["slug"]) out.write(outstring) else: @@ -50,9 +59,10 @@ def get_projects(api_url, token, outfile): return -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -90,5 +100,3 @@ def main(args = sys.argv): # without automatically running it if __name__ == "__main__": main() - - diff --git a/scripts/bwa/aggregate/basic/sparse_motifs.py b/scripts/bwa/aggregate/basic/sparse_motifs.py index 8a9b9c6d..a7da3741 100644 --- a/scripts/bwa/aggregate/basic/sparse_motifs.py +++ b/scripts/bwa/aggregate/basic/sparse_motifs.py @@ -28,13 +28,13 @@ else: newrow = [0] * len(fimos) rows.append(newrow) -labels = range(0,len(hotspot_names)) +labels = range(0, len(hotspot_names)) # write dump_svmlight_file(rows, y=labels, f=outfile, zero_based=True) -file1 = open(outfile_rows, 'w') +file1 = open(outfile_rows, "w") for item in hotspot_names: file1.write("%s\n" % item) -file2 = open(outfile_cols,'w') +file2 = open(outfile_cols, "w") for item in fimos: file2.write("%s\n" % item) diff --git a/scripts/bwa/bamcounts.py b/scripts/bwa/bamcounts.py index 6661e4d0..0260dd9a 100644 --- a/scripts/bwa/bamcounts.py +++ b/scripts/bwa/bamcounts.py @@ -15,42 +15,60 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" script_options = { - "debug": False, - "quiet": True, + "debug": False, + "quiet": True, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() parser.add_argument("bamfile", help="The BAM file to make counts on.") - parser.add_argument("outfile", - help="The file to write the counts to.") - - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("--min_mapping_quality", dest="min_mapping_quality", type=int, default=10, - help="Minimum mapping quality for filtering.") - parser.add_argument("--max_mismatches", dest="max_mismatches", type=int, default=2, - help="Maximum mismatches for filtering") - - 
parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument("outfile", help="The file to write the counts to.") + + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "--min_mapping_quality", + dest="min_mapping_quality", + type=int, + default=10, + help="Minimum mapping quality for filtering.", + ) + parser.add_argument( + "--max_mismatches", + dest="max_mismatches", + type=int, + default=2, + help="Maximum mismatches for filtering", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -class BAMFilter(object): +class BAMFilter(object): def __init__(self, max_mismatches=2, min_mapping_quality=10): - self.max_mismatches = max_mismatches self.previous_read = None self.min_mapping_quality = min_mapping_quality - self.upfnmm = 'u-pf-n-mm%d' % self.max_mismatches - self.upfnmmmito = 'u-pf-n-mm%d-mito' % self.max_mismatches + self.upfnmm = "u-pf-n-mm%d" % self.max_mismatches + self.upfnmmmito = "u-pf-n-mm%d-mito" % self.max_mismatches def process_flags(self, read, inbam): """ @@ -85,41 +103,40 @@ def process_read_paired(self, read, inbam): # Figure out how many alignments aren't included because of mapq if self.min_mapping_quality > read.mapq: - self.counts['paired-mapq-filter'] += 1 + self.counts["paired-mapq-filter"] += 1 return False # do not use reads with QC fail even if they pass all other checks # 0x512 QC Fail if read.flag & 512: - self.counts['paired-aligned-qcfail'] += 1 + self.counts["paired-aligned-qcfail"] += 1 return False - self.counts['paired-aligned'] += 1 + self.counts["paired-aligned"] += 1 return True def process_read(self, read, inbam): - self.process_flags(read, inbam) # This might not be the most perfect indicator, but it will do for now # Must take place before minimum quality filter--most multiple matching # reads have mapq set to 0 if read.has_tag("XT") and read.get_tag("XT") == "R": - self.counts['mm'] += 1 + self.counts["mm"] += 1 if read.is_qcfail: - self.counts['qc-flagged'] += 1 + self.counts["qc-flagged"] += 1 if read.is_unmapped: - self.counts['nm'] += 1 + self.counts["nm"] += 1 return False else: - self.counts['all-aligned'] += 1 + self.counts["all-aligned"] += 1 # Figure out how many alignments aren't included because of mapq if self.min_mapping_quality > read.mapq: - self.counts['all-mapq-filter'] += 1 + self.counts["all-mapq-filter"] += 1 return chr = inbam.getrname(read.rname) @@ -131,21 +148,21 @@ def process_read(self, read, inbam): autosomal = nuclear and chr not in ("chrX", "chrY", "chrZ", "chrW") if nuclear: - self.counts['nuclear-align'] += 1 + self.counts["nuclear-align"] += 1 if autosomal: - self.counts['autosomal-align'] += 1 + self.counts["autosomal-align"] += 1 if read.is_paired: - self.counts['paired-nuclear-align'] += 1 + self.counts["paired-nuclear-align"] += 1 if autosomal: - self.counts['paired-autosomal-align'] += 1 + self.counts["paired-autosomal-align"] += 1 if read.flag & 1024: - self.counts['duplicate'] += 1 + self.counts["duplicate"] += 1 if nuclear: - self.counts['duplicate-nuclear'] += 1 + self.counts["duplicate-nuclear"] += 1 - self.counts['u'] += 1 + self.counts["u"] += 1 passreadlength = "aligned-readlength-%d" % read.rlen self.readlengthcounts[passreadlength] += 1 @@ -153,42 
+170,51 @@ def process_read(self, read, inbam): if read.is_qcfail: return False - self.counts['u-pf'] += 1 + self.counts["u-pf"] += 1 if "N" in read.seq: return False else: - self.counts['u-pf-n'] += 1 + self.counts["u-pf-n"] += 1 if read.has_tag("NM") and read.get_tag("NM") > self.max_mismatches: return False else: - self.counts['u-pf-n-mm%d' % self.max_mismatches] += 1 + self.counts["u-pf-n-mm%d" % self.max_mismatches] += 1 self.chrcounts[chr] += 1 if not "chrM" == chr: - self.counts['u-pf-n-mm%d-mito' % self.max_mismatches] += 1 - + self.counts["u-pf-n-mm%d-mito" % self.max_mismatches] += 1 return True def write_dict(self, countout, counts): - for count in sorted(counts.keys()): countout.write("%s\t%d\n" % (count, counts[count])) def filter(self, infile, countfile): - - inbam = Samfile(infile, 'rb') - - count_labels = ['u', 'u-pf', 'u-pf-n', 'u-pf-n-mm%d' % self.max_mismatches, - 'u-pf-n-mm%d-mito' % self.max_mismatches, 'mm', 'nm', - 'qc-flagged', 'duplicate', 'duplicate-nuclear', - 'nuclear-align', 'autosomal-align', - 'paired-aligned', 'paired-nuclear-align', - 'paired-autosomal-align', 'all-aligned', - 'all-mapq-filter'] + inbam = Samfile(infile, "rb") + + count_labels = [ + "u", + "u-pf", + "u-pf-n", + "u-pf-n-mm%d" % self.max_mismatches, + "u-pf-n-mm%d-mito" % self.max_mismatches, + "mm", + "nm", + "qc-flagged", + "duplicate", + "duplicate-nuclear", + "nuclear-align", + "autosomal-align", + "paired-aligned", + "paired-nuclear-align", + "paired-autosomal-align", + "all-aligned", + "all-mapq-filter", + ] logging.debug(count_labels) self.counts = dict([(label, 0) for label in count_labels]) @@ -201,7 +227,7 @@ def filter(self, infile, countfile): for read in inbam: self.process_read(read, inbam) - countout = open(countfile, 'a') + countout = open(countfile, "a") self.write_dict(countout, self.counts) self.write_dict(countout, self.chrcounts) @@ -212,9 +238,9 @@ def filter(self, infile, countfile): countout.close() -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() args = parser.parse_args() @@ -230,9 +256,12 @@ def main(args = sys.argv): bamfile = args.bamfile countfile = args.outfile - filter = BAMFilter(max_mismatches=args.max_mismatches, min_mapping_quality=args.min_mapping_quality) + filter = BAMFilter( + max_mismatches=args.max_mismatches, min_mapping_quality=args.min_mapping_quality + ) filter.filter(bamfile, countfile) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/bwa/filter_reads.py b/scripts/bwa/filter_reads.py index 57b91b85..1c491dd5 100755 --- a/scripts/bwa/filter_reads.py +++ b/scripts/bwa/filter_reads.py @@ -15,83 +15,130 @@ import pysam import re -''' +""" Exception when a bad read is found -''' +""" + + class read_exception(Exception): pass -''' + +""" Looks for the UMI embeded in the read name, places it in a tag and trims the read name -''' -def parse_umi(read): +""" + +def parse_umi(read): try: - umi_loc = read.query_name.index('#') + umi_loc = read.query_name.index("#") except: pass else: - read.set_tag("XD", read.query_name[umi_loc+1:]) + read.set_tag("XD", read.query_name[umi_loc + 1 :]) read.query_name = read.query_name[:umi_loc] return read -''' + +""" General function to set the flag field -''' -def set_read_flag(read, flag, 
mark): +""" + +def set_read_flag(read, flag, mark): if mark: - read.flag |= (1< max_mismatches: raise read_exception("Read mismatches > %d" % max_mismatches) +def validate_read(read, min_mapq=1, max_mismatches=2): + if read.mapping_quality < min_mapq: + raise read_exception("Read MAPQ < %d" % min_mapq) + if read.is_unmapped: + raise read_exception("Read not mapped") + if read.get_tag("NM") > max_mismatches: + raise read_exception("Read mismatches > %d" % max_mismatches) return read + import argparse -parser = argparse.ArgumentParser(prog = "filter_reads", description = "manual corrects the flags in a single- or pair-end BAM alignment file") -parser.add_argument("raw_alignment", type = str, help = "Inupt raw alignment file (must be sorted by name") -parser.add_argument("filtered_alignment", type = str, help = "Output filtered alignment file (sorted by name)") -parser.add_argument("nuclear_chr", type = str, help = "List of nuclear chromosomes to use") -parser.add_argument("--min_mapq", action = "store", type = int, default = 10, help = "Reads must have at least this MAPQ to pass filter [%(default)s]") -parser.add_argument("--max_mismatches", action = "store", type = int, default = 2, help = "Maximum mismatches to pass filter [%(default)s]") -parser.add_argument("--max_insert_size", action = "store", type = int, default = 750, help = "Maximum insert size to pass filter [%(default)s]") -parser.add_argument("--verbosity", action = "store", type = int, default = 50, help = "Verbosity (50 = quiet, 0 = loud) [%(default)s]") +parser = argparse.ArgumentParser( + prog="filter_reads", + description="manual corrects the flags in a single- or pair-end BAM alignment file", +) +parser.add_argument( + "raw_alignment", type=str, help="Inupt raw alignment file (must be sorted by name" +) +parser.add_argument( + "filtered_alignment", + type=str, + help="Output filtered alignment file (sorted by name)", +) +parser.add_argument("nuclear_chr", type=str, help="List of nuclear chromosomes to use") +parser.add_argument( + "--min_mapq", + action="store", + type=int, + default=10, + help="Reads must have at least this MAPQ to pass filter [%(default)s]", +) +parser.add_argument( + "--max_mismatches", + action="store", + type=int, + default=2, + help="Maximum mismatches to pass filter [%(default)s]", +) +parser.add_argument( + "--max_insert_size", + action="store", + type=int, + default=750, + help="Maximum insert size to pass filter [%(default)s]", +) +parser.add_argument( + "--verbosity", + action="store", + type=int, + default=50, + help="Verbosity (50 = quiet, 0 = loud) [%(default)s]", +) args = parser.parse_args() -logging.basicConfig(stream = sys.stdout, level = args.verbosity) +logging.basicConfig(stream=sys.stdout, level=args.verbosity) raw_alignment = pysam.AlignmentFile(args.raw_alignment, "rb") -filtered_alignment = pysam.AlignmentFile(args.filtered_alignment, "wbu", template = raw_alignment) -nuclear_chrs = [line.rstrip('\n') for line in open(args.nuclear_chr)] +filtered_alignment = pysam.AlignmentFile( + args.filtered_alignment, "wbu", template=raw_alignment +) +nuclear_chrs = [line.rstrip("\n") for line in open(args.nuclear_chr)] -raw_reads = raw_alignment.fetch(until_eof = True) +raw_reads = raw_alignment.fetch(until_eof=True) read1 = None read2 = None @@ -99,8 +146,7 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): qc_fail = False proper_pair = False -while(1): - +while 1: try: if not read1: read1 = parse_umi(next(raw_reads)) @@ -114,8 +160,11 @@ def validate_read(read, min_mapq = 1, 
max_mismatches = 2): # Continue in pair-end mode if their is two reads that are paired and that they have the same name - if (read1 and read2) and (read1.is_paired and read2.is_paired) and (read1.query_name == read2.query_name): - + if ( + (read1 and read2) + and (read1.is_paired and read2.is_paired) + and (read1.query_name == read2.query_name) + ): (read1, read2) = (read1, read2) if read1.is_read1 else (read2, read1) try: @@ -146,11 +195,13 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): # Insert sizes must less than the maximum - if abs(read1.template_length) > args.max_insert_size or read2.template_length > args.max_insert_size: + if ( + abs(read1.template_length) > args.max_insert_size + or read2.template_length > args.max_insert_size + ): raise read_exception("Insert size > %d!" % args.max_insert_size) except read_exception as e: - # If we get a read exception, then set # QC fail flag, and unset proper pair flag @@ -162,7 +213,6 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): logging.debug(read2) else: - # No exception, then unset # QC fail flag, and unset proper pair flag @@ -170,12 +220,11 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): proper_pair = True finally: - # Set the flags set_qc_fail(read1, qc_fail) set_proper_pair(read1, proper_pair) - + if read1.reference_id != -1 and not read1.reference_name in nuclear_chrs: set_nonnuclear(read1, True) @@ -191,37 +240,29 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): (read1, read2) = (None, None) - #Failed pair-end test -- could be single-end + # Failed pair-end test -- could be single-end else: - try: - validate_read(read1, args.min_mapq, args.max_mismatches) if read1.is_paired: - if not read1.mate_is_unmapped: - raise read_exception("No mate found (incongruent flag)!") else: - raise read_exception("No mate found!") except read_exception as e: - qc_fail = True logging.debug(e) logging.debug(read1) else: - qc_fail = False finally: - set_qc_fail(read1, qc_fail) set_proper_pair(read1, False) @@ -235,4 +276,3 @@ def validate_read(read, min_mapq = 1, max_mismatches = 2): # clean-up and close files raw_alignment.close() filtered_alignment.close() - diff --git a/scripts/bwa/fix_bam_pairing.py b/scripts/bwa/fix_bam_pairing.py index eb83d1aa..8fae86f3 100755 --- a/scripts/bwa/fix_bam_pairing.py +++ b/scripts/bwa/fix_bam_pairing.py @@ -5,87 +5,89 @@ import argparse parser = argparse.ArgumentParser(description="Set read pair status") -parser.add_argument('infile', metavar='IN_BAM', type=str, - help="The BAM file to read from. If '-', read from STDIN") -parser.add_argument('outfile', metavar='OUT_BAM', type=str, - help="The BAM file to write to. If '-', write to STDOUT") -parser.add_argument('--umi', action="store_true", - help="Extract and set UMI adapter information") +parser.add_argument( + "infile", + metavar="IN_BAM", + type=str, + help="The BAM file to read from. If '-', read from STDIN", +) +parser.add_argument( + "outfile", + metavar="OUT_BAM", + type=str, + help="The BAM file to write to. 
If '-', write to STDOUT", +) +parser.add_argument( + "--umi", action="store_true", help="Extract and set UMI adapter information" +) poptions = parser.parse_args() unfiltered_reads = pysam.Samfile(poptions.infile, "rb") -filtered_reads = pysam.Samfile(poptions.outfile, "wbu", template = unfiltered_reads) +filtered_reads = pysam.Samfile(poptions.outfile, "wbu", template=unfiltered_reads) -while(1): +while 1: + # pull the reads - # pull the reads + try: + read1 = unfiltered_reads.next() + read2 = unfiltered_reads.next() + (read1, read2) = (read1, read2) if read1.is_read1 else (read2, read1) - try: + except: + break - read1 = unfiltered_reads.next() - read2 = unfiltered_reads.next() - (read1, read2) = (read1, read2) if read1.is_read1 else (read2, read1) + # strip off the umi, and place it in a custom tag (if it exists) - except: + if poptions.umi: + try: + read1_umi_loc = read1.qname.index("#") + read2_umi_loc = read2.qname.index("#") - break + except: + pass - # strip off the umi, and place it in a custom tag (if it exists) + else: + read1.setTag("XD", read1.qname[read1_umi_loc + 1 :]) + read1.qname = read1.qname[:read1_umi_loc] - if poptions.umi: - try: + read2.setTag("XD", read2.qname[read2_umi_loc + 1 :]) + read2.qname = read2.qname[:read2_umi_loc] - read1_umi_loc = read1.qname.index('#') - read2_umi_loc = read2.qname.index('#') + # filtering code - except: + try: + # must be mapped to opposite strands (F-R conformation) + if read1.is_reverse == read2.is_reverse: + raise - pass + # both reads must have mapq greater than 0 + if read1.mapq == 0 or read2.mapq == 0: + raise - else: + # each read must map to the same contig + if read1.tid != read2.tid: + raise - read1.setTag("XD", read1.qname[read1_umi_loc+1:]) - read1.qname = read1.qname[:read1_umi_loc] + # each read pair must be have an insert length of <=750 nt + if read1.isize > 750 or read2.isize > 750: + raise - read2.setTag("XD", read2.qname[read2_umi_loc+1:]) - read2.qname = read2.qname[:read2_umi_loc] + except: + # failed a test above, not properly paired + read1.flag &= ~(1 << 1) + read2.flag &= ~(1 << 1) - # filtering code + else: + # pass all criteria, properly paired + read1.flag |= 1 << 1 + read2.flag |= 1 << 1 - try: - - # must be mapped to opposite strands (F-R conformation) - if read1.is_reverse == read2.is_reverse: raise - - # both reads must have mapq greater than 0 - if read1.mapq == 0 or read2.mapq == 0: raise - - # each read must map to the same contig - if read1.tid != read2.tid: raise - - # each read pair must be have an insert length of <=750 nt - if read1.isize > 750 or read2.isize > 750: raise - - except: - - # failed a test above, not properly paired - read1.flag &= ~(1<<1) - read2.flag &= ~(1<<1) - - else: - - # pass all criteria, properly paired - read1.flag |= (1<<1) - read2.flag |= (1<<1) - - finally: - - # write to file - filtered_reads.write(read1) - filtered_reads.write(read2) + finally: + # write to file + filtered_reads.write(read1) + filtered_reads.write(read2) # clean-up and close files unfiltered_reads.close() filtered_reads.close() - diff --git a/scripts/cluster/monitor_alignments.py b/scripts/cluster/monitor_alignments.py index 72e44c21..64f67bee 100644 --- a/scripts/cluster/monitor_alignments.py +++ b/scripts/cluster/monitor_alignments.py @@ -14,7 +14,7 @@ base_api_url = None log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -log = logging.getLogger('upload_data.py') +log = logging.getLogger("upload_data.py") script_options = { "base_api_url": None, @@ -26,38 +26,59 @@ ALIGN_REGEX = 
re.compile(r"ALIGN#(\d+)") + def run_command(args): return subprocess.check_output(args, stderr=subprocess.STDOUT).decode("utf-8") -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - - parser.add_argument("-c", "--cluster", dest="cluster", - help="The type of cluster: SGE (default) or SLURM") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + + parser.add_argument( + "-c", + "--cluster", + dest="cluster", + help="The type of cluster: SGE (default) or SLURM", + ) return parser + class SGEChecker(object): QSTAT_COMMAND = ["qstat", "-xml"] QHOST_COMMAND = ["qhost"] def parse_jobnames(self): qstat_xml = run_command(self.QSTAT_COMMAND) - dom=xml.dom.minidom.parseString(qstat_xml) + dom = xml.dom.minidom.parseString(qstat_xml) - jobnames = dom.getElementsByTagName('JB_name') + jobnames = dom.getElementsByTagName("JB_name") alignments = set() @@ -76,9 +97,7 @@ def get_host_info(self): return qhost - class SLURMChecker(object): - SQUEUE_COMMAND = ["squeue", "-o", "%j", "--noheader"] SNODES_COMMAND = ["scontrol", "show", "nodes"] @@ -96,14 +115,14 @@ def get_host_info(self): snodes = run_command(self.SNODES_COMMAND) return snodes -class ClusterMonitor(object): +class ClusterMonitor(object): checker = None def __init__(self, api_url, token, cluster_type=None): self.api_url = api_url self.token = token - self.headers = {'Authorization': "Token %s" % token} + self.headers = {"Authorization": "Token %s" % token} if cluster_type is None: cluster_type = "sge" cluster_type = cluster_type.lower() @@ -115,9 +134,7 @@ def __init__(self, api_url, token, cluster_type=None): log.critical("Invalid cluster type %s", cluster_type) sys.exit(1) - def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -125,7 +142,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = requests.get(url, headers=self.headers) @@ -150,31 +166,41 @@ def run(self): mark_alignments = currently_running - marked_running if mark_alignments: log.info("Marking alignments as processing: %s" % str(mark_alignments)) - [self.update_processing_status(align_id, True) for align_id in mark_alignments] + [ + self.update_processing_status(align_id, True) + for align_id in mark_alignments + ] # What alignments are currently marked but not running? 
finished_alignments = marked_running - currently_running if finished_alignments: log.info("Alignments no longer processing: %s" % str(finished_alignments)) - [self.update_processing_status(align_id, False) for align_id in finished_alignments] + [ + self.update_processing_status(align_id, False) + for align_id in finished_alignments + ] self.update_host_info() def update_processing_status(self, align_id, processing=True): - patch_url = "%s/flowcell_lane_alignment/%d/" % (self.api_url, align_id) - update_result = requests.patch(patch_url, headers = self.headers, data={'currently_processing': processing}) + update_result = requests.patch( + patch_url, headers=self.headers, data={"currently_processing": processing} + ) if not update_result.ok: - log.critical("Could not update alignment %d: %s" % (align_id, str(update_result))) + log.critical( + "Could not update alignment %d: %s" % (align_id, str(update_result)) + ) return False return True def lims_currently_processing(self): - - fetch_results = self.api_list_result("flowcell_lane_alignment/?currently_processing=True") + fetch_results = self.api_list_result( + "flowcell_lane_alignment/?currently_processing=True" + ) lims_process_align_ids = set() if fetch_results == None: @@ -182,11 +208,12 @@ def lims_currently_processing(self): sys.exit(1) for result in fetch_results: - lims_process_align_ids.add(result['id']) - log.info("Currently marked as processing on LIMS: %s" % str(lims_process_align_ids)) + lims_process_align_ids.add(result["id"]) + log.info( + "Currently marked as processing on LIMS: %s" % str(lims_process_align_ids) + ) return lims_process_align_ids - def update_host_info(self): host_usage = self.checker.get_host_info() @@ -195,10 +222,12 @@ def update_host_info(self): url = "%s/key_value/?key=%s" % (self.api_url, key) key_value = self.get_single_result("%s/key_value/?key=%s" % (self.api_url, key)) if not key_value: - log.error("Cannot find \'%s\' key value" % key) + log.error("Cannot find '%s' key value" % key) return - update = requests.patch(key_value["url"], data={"value": host_usage}, headers=self.headers) + update = requests.patch( + key_value["url"], data={"value": host_usage}, headers=self.headers + ) if update.ok: log.info(update.json()) @@ -206,24 +235,24 @@ def update_host_info(self): log.error("Could not update %s usage." % host) log.error(update.text) - - def get_single_result(self, fetch_url, field=None): """ Using a list API url that should bring up a single item, retrieve that single item if it exists. 
""" - fetch_results = requests.get(fetch_url, headers = self.headers) + fetch_results = requests.get(fetch_url, headers=self.headers) if fetch_results.ok: results = fetch_results.json() - if results['count'] > 1: + if results["count"] > 1: log.error("More than one matching item for fetch query: %s" % fetch_url) - elif results['count'] == 0: + elif results["count"] == 0: log.debug("No matching items for fetch query: %s" % fetch_url) else: - result = results['results'][0] - log.debug("Single result fetched from %s: %s" % (fetch_url, str(result))) + result = results["results"][0] + log.debug( + "Single result fetched from %s: %s" % (fetch_url, str(result)) + ) if field: return result[field] return result @@ -232,9 +261,10 @@ def get_single_result(self, fetch_url, field=None): return None -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -260,7 +290,6 @@ def main(args = sys.argv): sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) - if not poptions.token and "LIMS_API_TOKEN" in os.environ: token = os.environ["LIMS_API_TOKEN"] elif poptions.token: @@ -273,6 +302,7 @@ def main(args = sys.argv): monitor.run() + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py index cf654ead..41c0bf0d 100755 --- a/scripts/copy_notify.py +++ b/scripts/copy_notify.py @@ -22,7 +22,9 @@ "send_from": "sendfrom@email.com" } """ -config_filename = os.path.join(os.environ["STAMPIPES_DATA"], "flowcell_notify_config.json") +config_filename = os.path.join( + os.environ["STAMPIPES_DATA"], "flowcell_notify_config.json" +) # This file will be checked for in the folders # will need to append it with the read # @@ -32,18 +34,24 @@ runinfo_filename = "RunInfo.xml" # Format of folders: 090810_SOLEXA-1GA-1_0016_FC82IU -folder_pattern_ga = re.compile("(?P\d{6})_SOLEXA-1GA-[12]_\d{4,5}_FC(?P[A-Z0-9]{5})") +folder_pattern_ga = re.compile( + "(?P\d{6})_SOLEXA-1GA-[12]_\d{4,5}_FC(?P[A-Z0-9]{5})" +) # 140703_SN373_0524_BC6TATACXX # 140710_D00453_0080_AC5PBPANXX -folder_pattern_hiseq = re.compile("(?P\d{6})_(?P(SN|D)\d+)_[0-9]+_(A|B)(?P[A-Z0-9]{5})[A-Z]{2}XX") +folder_pattern_hiseq = re.compile( + "(?P\d{6})_(?P(SN|D)\d+)_[0-9]+_(A|B)(?P[A-Z0-9]{5})[A-Z]{2}XX" +) # 140808_NS500372_0009_AH115HBGXX -folder_pattern_nextseq = re.compile("(?P\d{6})_(?PNS500\d+)_[0-9]+_A(?P[A-Z0-9]{5})[A-Z]{2}XX") +folder_pattern_nextseq = re.compile( + "(?P\d{6})_(?PNS500\d+)_[0-9]+_A(?P[A-Z0-9]{5})[A-Z]{2}XX" +) # To use with datetime.strptime() to translate folder dates folder_datepattern = "%y%m%d" # Waiting period, in seconds -wait = 60 * 60 # every hour +wait = 60 * 60 # every hour config_json = open(config_filename) @@ -52,7 +60,7 @@ config_json.close() # Base folders to check for sequencing run folders -base_folders = config["base_folders"] +base_folders = config["base_folders"] # WARNING: CURRENTLY ONLY STAMLAB.ORG EMAILS WORK @@ -69,24 +77,30 @@ check_folders = dict() flowcell_reads = dict() + def get_sequencer_folders(): """Gets all the sequence folders in the base folders.""" folder_list = list() for base_folder in base_folders.values(): - [folder_list.append("%s/%s" % (base_folder, sequencer_folder)) for sequencer_folder - in 
os.listdir(base_folder) - if os.path.isdir("%s/%s" % (base_folder, sequencer_folder)) and - (folder_pattern_ga.match(sequencer_folder) or \ - folder_pattern_hiseq.match(sequencer_folder) or \ - folder_pattern_nextseq.match(sequencer_folder))] + [ + folder_list.append("%s/%s" % (base_folder, sequencer_folder)) + for sequencer_folder in os.listdir(base_folder) + if os.path.isdir("%s/%s" % (base_folder, sequencer_folder)) + and ( + folder_pattern_ga.match(sequencer_folder) + or folder_pattern_hiseq.match(sequencer_folder) + or folder_pattern_nextseq.match(sequencer_folder) + ) + ] return folder_list + def get_folder_reads(sequencer_folder): - runinfo_file=os.path.join(sequencer_folder, runinfo_filename) + runinfo_file = os.path.join(sequencer_folder, runinfo_filename) try: runinfodoc = minidom.parse(runinfo_file) - return len(runinfodoc.getElementsByTagName('Read')) + return len(runinfodoc.getElementsByTagName("Read")) except IOError: logging.info("Could not read %s" % runinfo_file) return None @@ -94,15 +108,17 @@ def get_folder_reads(sequencer_folder): logging.info("%s is malformatted" % runinfo_file) return None + def check_copy(sequencer_folder): """Checks to see if the given copy filename is present in the sequencer folder""" if flowcell_reads[sequencer_folder]: - #copy_filename = copy_complete_filename % flowcell_reads[sequencer_folder] + # copy_filename = copy_complete_filename % flowcell_reads[sequencer_folder] return os.path.exists("%s/%s" % (sequencer_folder, "CopyComplete.txt")) else: return False + def load_folders(): """This function loads the initial folder states.""" logging.info("Loading folders") @@ -112,9 +128,13 @@ def load_folders(): check_folders[sequencer_folder] = check_copy(sequencer_folder) if flowcell_reads[sequencer_folder]: - logging.info("Initial state of %s: %s" % (sequencer_folder, str(check_folders[sequencer_folder]))) + logging.info( + "Initial state of %s: %s" + % (sequencer_folder, str(check_folders[sequencer_folder])) + ) else: - logging.info("Initial state of %s: does not have reads" % sequencer_folder) + logging.info("Initial state of %s: does not have reads" % sequencer_folder) + def check_folder(sequencer_folder): """Check a sequencer folder and notify for changes""" @@ -132,6 +152,7 @@ def check_folder(sequencer_folder): logging.debug("Number of reads: %s" % str(flowcell_reads[sequencer_folder])) notify_new(sequencer_folder) + def run_check(): """Checks for newly created folders and newly copied folders. 
Notifies if they are found.""" try: @@ -152,6 +173,7 @@ def run_check(): for sequencer_folder in folders: check_folder(sequencer_folder) + def get_folder_info(sequencer_folder): """Given a sequencer folder name, tries to find the server and flowcell name""" info = dict() @@ -175,29 +197,34 @@ def get_folder_info(sequencer_folder): info["folder"] = sequencer_folder return info + def notify_new(sequencer_folder): """Notify the list of emails of a finished copying folder.""" info = get_folder_info(sequencer_folder) send_email(notif_new_title % info, notif_new_body % info, notification_emails) + def notify_copy(sequencer_folder): """Notify the list of emails of a newly created folder.""" info = get_folder_info(sequencer_folder) send_email(notif_copy_title % info, notif_copy_body % info, notification_emails) + def send_email(subject, body, emails): msg = MIMEText(body) - msg['Subject'] = subject - msg['From'] = 'illumina@stamlab.org' - msg['To'] = ", ".join(emails) + msg["Subject"] = subject + msg["From"] = "illumina@stamlab.org" + msg["To"] = ", ".join(emails) - s = smtplib.SMTP('localhost') - s.sendmail('illumina@stamlab.org', emails, msg.as_string()) + s = smtplib.SMTP("localhost") + s.sendmail("illumina@stamlab.org", emails, msg.as_string()) s.quit() -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + ) # loads what's presently there so we don't notify about it load_folders() diff --git a/scripts/create_processing.py b/scripts/create_processing.py index 3d7d6ec4..fa075448 100644 --- a/scripts/create_processing.py +++ b/scripts/create_processing.py @@ -6,7 +6,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_files = { "bwa": "%s/processes/bwa/process_bwa_paired_trimmed.bash" % STAMPIPES, @@ -36,61 +36,138 @@ "umi_filter": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="The master script to run all sample scripts.") - parser.add_argument("-p", "--process-config", dest="process_config", - help="The process config to work off of.") - parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") - parser.add_argument("--project", dest="project_filter", action="append", - help="Run for this particular project. Can be specified multiple times.") - - parser.add_argument("--sample", dest="sample_filter", type=int, action="append", - help="Run for this particular sample. Can be specified multiple times.") - parser.add_argument("--library", dest="library_filter", type=int, action="append", - help="Run for this particular library. Can be specified multiple times.") - parser.add_argument("--alignment", dest="alignment_filter", type=int, action="append", - help="Run for this particular alignment. Can be specified multiple times.") - parser.add_argument("--lane", dest="lanes", type=int, action="append", - help="Run for these given lanes. 
Can be specified multiple times.") - parser.add_argument("--no_umi", dest="umi_filter", action="store_false", - help="Run and don't include any UMI lanes.") - parser.add_argument("--only_umi", dest="umi_filter", action="store_true", - help="Run and only include UMI lanes.") - - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.") - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - parser.add_argument("-t", "--template-script", dest="template_script", - help="Template script to make for each valid library if not defaults") - parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="If this is set to true, remake SAMPLE_NAME with no barcode mask.") - parser.add_argument("--add_flowcell_scripts", dest="add_flowcell_scripts", action="store_true", - help="If this is set to true, add on flowcell level scripts to the end of the run collection.") - - parser.add_argument("-i", "--ignore_failed_lanes", dest="ignore_failed_lanes", action="store_true", - help="Ignore failed lanes") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="The master script to run all sample scripts.", + ) + parser.add_argument( + "-p", + "--process-config", + dest="process_config", + help="The process config to work off of.", + ) + parser.add_argument( + "-b", + "--sample-script-basename", + dest="sample_script_basename", + help="Name of the script that goes after the sample name.", + ) + parser.add_argument( + "--project", + dest="project_filter", + action="append", + help="Run for this particular project. Can be specified multiple times.", + ) + + parser.add_argument( + "--sample", + dest="sample_filter", + type=int, + action="append", + help="Run for this particular sample. Can be specified multiple times.", + ) + parser.add_argument( + "--library", + dest="library_filter", + type=int, + action="append", + help="Run for this particular library. Can be specified multiple times.", + ) + parser.add_argument( + "--alignment", + dest="alignment_filter", + type=int, + action="append", + help="Run for this particular alignment. Can be specified multiple times.", + ) + parser.add_argument( + "--lane", + dest="lanes", + type=int, + action="append", + help="Run for these given lanes. Can be specified multiple times.", + ) + parser.add_argument( + "--no_umi", + dest="umi_filter", + action="store_false", + help="Run and don't include any UMI lanes.", + ) + parser.add_argument( + "--only_umi", + dest="umi_filter", + action="store_true", + help="Run and only include UMI lanes.", + ) + + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.", + ) + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + parser.add_argument( + "-t", + "--template-script", + dest="template_script", + help="Template script to make for each valid library if not defaults", + ) + parser.add_argument( + "--no-mask", + dest="no_mask", + action="store_true", + help="If this is set to true, remake SAMPLE_NAME with no barcode mask.", + ) + parser.add_argument( + "--add_flowcell_scripts", + dest="add_flowcell_scripts", + action="store_true", + help="If this is set to true, add on flowcell level scripts to the end of the run collection.", + ) + + parser.add_argument( + "-i", + "--ignore_failed_lanes", + dest="ignore_failed_lanes", + action="store_true", + help="Ignore failed lanes", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - - def __init__(self, args): - + def __init__(self, args): self.processing_configfile = args.process_config self.qsub_scriptname = args.sample_script_basename self.qsub_prefix = args.qsub_prefix @@ -108,10 +185,9 @@ def __init__(self, args): self.filter_lanes = args.lanes if self.template_script: - self.template_script_content = open(self.template_script, 'r').read() + self.template_script_content = open(self.template_script, "r").read() def include_lane(self, lane): - if self.umi_filter != None: if lane["barcode1"] and lane["barcode1"]["umi"]: umi = True @@ -127,32 +203,47 @@ def include_lane(self, lane): return False if self.ignore_failed_lanes and lane["failed"]: - logging.debug("Skipping %s, failed and we are ignoring failed lanes" % lane["samplesheet_name"]) + logging.debug( + "Skipping %s, failed and we are ignoring failed lanes" + % lane["samplesheet_name"] + ) return False if self.project_filter and not (lane["project"] in self.project_filter): - logging.debug("Skipping %s, not in project filter" % lane["samplesheet_name"]) + logging.debug( + "Skipping %s, not in project filter" % lane["samplesheet_name"] + ) return False if self.library_filter and not (lane["library"] in self.library_filter): - logging.debug("Skipping %s, not in library filter" % lane["samplesheet_name"]) + logging.debug( + "Skipping %s, not in library filter" % lane["samplesheet_name"] + ) return False if self.sample_filter and not (lane["sample"] in self.sample_filter): - logging.debug("Skipping %s, not in sample filter" % lane["samplesheet_name"]) + logging.debug( + "Skipping %s, not in sample filter" % lane["samplesheet_name"] + ) return False - if self.alignment_filter and lane["alignments"] and not (lane["alignments"][0]["id"] in self.alignment_filter): - logging.debug("Skipping %s, not in alignment filter" % lane["samplesheet_name"]) + if ( + self.alignment_filter + and lane["alignments"] + and not (lane["alignments"][0]["id"] in self.alignment_filter) + ): + logging.debug( + "Skipping %s, not in alignment filter" % lane["samplesheet_name"] + ) return False return True def create(self): self.processing_scripts = dict() - self.p = json.loads(open(self.processing_configfile, 'r').read()) + self.p = json.loads(open(self.processing_configfile, "r").read()) - for lane in self.p['libraries']: + for lane in self.p["libraries"]: if self.include_lane(lane): self.create_script(lane) @@ -164,94 +255,113 @@ def create(self): self.run_scripts() def add_script(self, script_file, sample_name, priority): - if not priority in self.processing_scripts: 
self.processing_scripts[priority] = list() self.processing_scripts[priority].append((sample_name, script_file)) def run_scripts(self): - if self.dry_run: return - outfile = open(self.outfile, 'w') + outfile = open(self.outfile, "w") for priority in sorted(self.processing_scripts.keys(), reverse=True): outfile.write("# Priority %s\n" % str(priority)) for sample_name, script_file in self.processing_scripts[priority]: outfile.write("cd %s && " % os.path.dirname(script_file)) - outfile.write("qsub -N %s%s-%s -cwd -V -S /bin/bash %s\n\n" % (self.qsub_prefix, sample_name, self.p['flowcell']['label'], script_file)) + outfile.write( + "qsub -N %s%s-%s -cwd -V -S /bin/bash %s\n\n" + % ( + self.qsub_prefix, + sample_name, + self.p["flowcell"]["label"], + script_file, + ) + ) outfile.close() def get_script_template(self, lane): - if self.template_script: return self.template_script_content alignment = lane["alignments"][0] - if not alignment['aligner']: - logging.info("# FastQC only %s" % lane['sample']) + if not alignment["aligner"]: + logging.info("# FastQC only %s" % lane["sample"]) base_script = "fastqc" else: base_script = alignment["aligner"] - logging.info("# Aligning %s with %s" % ( lane['sample'], base_script )) + logging.info("# Aligning %s with %s" % (lane["sample"], base_script)) if not base_script in script_contents: - script_contents[base_script] = open(script_files[base_script], 'r').read() + script_contents[base_script] = open(script_files[base_script], "r").read() return script_contents[base_script] # Probably ripe for a refactoring def create_flowcell_script(self, inscript): - script_directory = os.path.join(self.p['alignment_group']['directory'], 'flowcell_scripts') + script_directory = os.path.join( + self.p["alignment_group"]["directory"], "flowcell_scripts" + ) if not os.path.exists(script_directory): logging.info("Creating directory %s" % script_directory) os.makedirs(script_directory) - script_file = os.path.join(script_directory, - os.path.basename(inscript)) + script_file = os.path.join(script_directory, os.path.basename(inscript)) # TODO: Figure out the appropriate priority here. 
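A minimal standalone sketch of the priority-then-qsub pattern used by add_script/run_scripts above; the sample name, paths, and flowcell label are invented for the example, and only the grouping and submission-line format follow the code shown here.

import os
from collections import defaultdict

# Invented example data: (sample_name, script_file) tuples grouped by numeric priority.
processing_scripts = defaultdict(list)
processing_scripts[10].append(("LN12345_AACCGGTT_L001", "/net/example/Sample_LN12345/align_1_hg38_bwa/run.bash"))
processing_scripts[0].append(("flowcell_script", "/net/example/flowcell_scripts/collate.bash"))

with open("run_all.bash", "w") as outfile:
    # Highest priority first, mirroring sorted(..., reverse=True) above.
    for priority in sorted(processing_scripts, reverse=True):
        outfile.write("# Priority %s\n" % priority)
        for sample_name, script_file in processing_scripts[priority]:
            outfile.write("cd %s && " % os.path.dirname(script_file))
            outfile.write(
                "qsub -N .proc%s-%s -cwd -V -S /bin/bash %s\n\n"
                % (sample_name, "HEXAMPLE", script_file)
            )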
- self.add_script(script_file, 'flowcell_script', 0) + self.add_script(script_file, "flowcell_script", 0) if self.dry_run: return - outfile = open(script_file, 'w') + outfile = open(script_file, "w") outfile.write("set -e -o pipefail\n") - outfile.write("export READLENGTH=%s\n" % self.p['flowcell']['read_length']) - if self.p['flowcell']['paired_end']: + outfile.write("export READLENGTH=%s\n" % self.p["flowcell"]["read_length"]) + if self.p["flowcell"]["paired_end"]: outfile.write("export PAIRED=True\n") - - outfile.write("export FLOWCELL=%s\n" % self.p['flowcell']['label']) - outfile.write("export FLOWCELL_DIR=%s\n" % self.p['alignment_group']['directory']) - outfile.write("export PROCESSING=%s\n" % os.path.abspath(self.processing_configfile)) + outfile.write("export FLOWCELL=%s\n" % self.p["flowcell"]["label"]) + outfile.write( + "export FLOWCELL_DIR=%s\n" % self.p["alignment_group"]["directory"] + ) + outfile.write( + "export PROCESSING=%s\n" % os.path.abspath(self.processing_configfile) + ) outfile.write("\n") outfile.close() - os.system("cat %s >> %s" % ( inscript, script_file ) ) - + os.system("cat %s >> %s" % (inscript, script_file)) def create_script(self, lane): - if not lane["alignments"]: return False alignment = lane["alignments"][0] - fastq_directory = os.path.join(self.p['alignment_group']['directory'], "Project_%s" % lane['project'], "Sample_%s" % lane['samplesheet_name']) + fastq_directory = os.path.join( + self.p["alignment_group"]["directory"], + "Project_%s" % lane["project"], + "Sample_%s" % lane["samplesheet_name"], + ) # Reset the alignment's sample name if we decied not to use the barcode index mask if self.no_mask: - alignment['sample_name'] = "%s_%s_L00%d" % (lane['samplesheet_name'], lane['barcode_index'], lane['lane']) - - align_dir = "align_%d_%s_%s" % (alignment['id'], alignment['genome_index'], alignment['aligner']) - if alignment['aligner_version']: - align_dir = "%s-%s" % (align_dir, alignment['aligner_version']) + alignment["sample_name"] = "%s_%s_L00%d" % ( + lane["samplesheet_name"], + lane["barcode_index"], + lane["lane"], + ) + + align_dir = "align_%d_%s_%s" % ( + alignment["id"], + alignment["genome_index"], + alignment["aligner"], + ) + if alignment["aligner_version"]: + align_dir = "%s-%s" % (align_dir, alignment["aligner_version"]) script_directory = "%s/%s" % (fastq_directory, align_dir) @@ -259,41 +369,43 @@ def create_script(self, lane): logging.info("Creating directory %s" % script_directory) os.makedirs(script_directory) - script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) + script_file = os.path.join( + script_directory, "%s-%s" % (alignment["sample_name"], self.qsub_scriptname) + ) logging.info(script_file) - self.add_script(script_file, alignment['sample_name'], alignment['priority']) + self.add_script(script_file, alignment["sample_name"], alignment["priority"]) if self.dry_run: return - outfile = open(script_file, 'w') + outfile = open(script_file, "w") outfile.write("set -e -o pipefail\n") - outfile.write("export SAMPLE_NAME=%s\n" % alignment['sample_name']) - outfile.write("export BWAINDEX=%s\n" % alignment['genome_index_location']) - outfile.write("export GENOME=%s\n" % alignment['genome_index']) - outfile.write("export ASSAY=%s\n" % lane['assay']) - outfile.write("export READLENGTH=%s\n" % self.p['flowcell']['read_length']) - if self.p['flowcell']['paired_end']: + outfile.write("export SAMPLE_NAME=%s\n" % alignment["sample_name"]) + outfile.write("export BWAINDEX=%s\n" % 
alignment["genome_index_location"]) + outfile.write("export GENOME=%s\n" % alignment["genome_index"]) + outfile.write("export ASSAY=%s\n" % lane["assay"]) + outfile.write("export READLENGTH=%s\n" % self.p["flowcell"]["read_length"]) + if self.p["flowcell"]["paired_end"]: outfile.write("export PAIRED=True\n") - outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane['id']) - outfile.write("export ALIGNMENT_ID=%s\n" % alignment['id']) + outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane["id"]) + outfile.write("export ALIGNMENT_ID=%s\n" % alignment["id"]) outfile.write("export ALIGN_DIR=%s/%s\n" % (fastq_directory, align_dir)) outfile.write("export FASTQ_DIR=%s\n" % fastq_directory) - outfile.write("export FLOWCELL=%s\n" % self.p['flowcell']['label']) + outfile.write("export FLOWCELL=%s\n" % self.p["flowcell"]["label"]) if "barcode1" in lane and lane["barcode1"]: - p7_adapter = lane['barcode1']['adapter7'] - p5_adapter = lane['barcode1']['adapter5'] - if "barcode2" in lane and lane['barcode2']: + p7_adapter = lane["barcode1"]["adapter7"] + p5_adapter = lane["barcode1"]["adapter5"] + if "barcode2" in lane and lane["barcode2"]: # Override the "default" end adapter from barcode1 # TODO: Make sure we want adapter7, double-check lims methods - p5_adapter = lane['barcode2']['adapter7'] + p5_adapter = lane["barcode2"]["adapter7"] outfile.write("export ADAPTER_P7=%s\n" % p7_adapter) outfile.write("export ADAPTER_P5=%s\n" % p5_adapter) # Process with UMI if the barcode has one and this is a dual index # flowcell - if lane['barcode1']['umi'] and self.p['flowcell']['dual_index']: + if lane["barcode1"]["umi"] and self.p["flowcell"]["dual_index"]: outfile.write("export UMI=True\n") outfile.write("\n") @@ -301,9 +413,9 @@ def create_script(self, lane): outfile.close() -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -319,6 +431,7 @@ def main(args = sys.argv): process = ProcessSetUp(poptions) process.create() + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/fastq/takara_umt.py b/scripts/fastq/takara_umt.py index 78fa1137..c7b06744 100755 --- a/scripts/fastq/takara_umt.py +++ b/scripts/fastq/takara_umt.py @@ -6,29 +6,33 @@ LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -UMI_LEN = 8 # Fixed length for takara RNA UMTs +UMI_LEN = 8 # Fixed length for takara RNA UMTs STEM_LEN = 6 TRIM_LEN = UMI_LEN + STEM_LEN def parse_args(): - """ Just parse the args """ + """Just parse the args""" parser = argparse.ArgumentParser( - description='Annotate read names with Takara v3 UMT') + description="Annotate read names with Takara v3 UMT" + ) parser.add_argument( - '--readlength', required=True, type=int, - help="The length of each fastq file - used for trimming R1") - parser.add_argument('r1_fastq') - parser.add_argument('r2_fastq') - parser.add_argument('out_r1') - parser.add_argument('out_r2') + "--readlength", + required=True, + type=int, + help="The length of each fastq file - used for trimming R1", + ) + parser.add_argument("r1_fastq") + parser.add_argument("r2_fastq") + parser.add_argument("out_r1") + parser.add_argument("out_r2") args = parser.parse_args() return args def attach_umt(r1, r2, maxlen): - """ Attach UMT to r1 & r2 and 
remove from sequences """ + """Attach UMT to r1 & r2 and remove from sequences""" # Put UMT in names umt = r2.seq[:UMI_LEN] umt_add = "#%s" % (umt) @@ -59,15 +63,13 @@ def main(): fragment_count = 0 trim_count = 0 removed_count = 0 - with open(args.r1_fastq) as r1_in, \ - open(args.r2_fastq) as r2_in, \ - open(args.out_r1, 'wt') as r1_out, \ - open(args.out_r2, 'wt') as r2_out: - + with open(args.r1_fastq) as r1_in, open(args.r2_fastq) as r2_in, open( + args.out_r1, "wt" + ) as r1_out, open(args.out_r2, "wt") as r2_out: r1_seq_io = SeqIO.parse(r1_in, "fastq") r2_seq_io = SeqIO.parse(r2_in, "fastq") - for (r1, r2) in zip(r1_seq_io, r2_seq_io): + for r1, r2 in zip(r1_seq_io, r2_seq_io): fragment_count += 1 (r1, r2, trimmed) = attach_umt(r1, r2, args.readlength) if trimmed: diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py index 339a885b..8600430f 100644 --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -4,17 +4,34 @@ MAX_BARCODE_LENGTH = 10 + def parseArgs(): - parser = argparse.ArgumentParser(description='Split up fastq files by barcode') - parser.add_argument('--processing', dest='processing_file', action='store', required=True, - help='processing.json to use (mandatory)') - parser.add_argument('--barcodes', dest='barcodes_file', action='store',required=True, - help='barcode output to compare') - parser.add_argument('--bcmask', dest='barcodes_mask', action='store',required=True, - help='barcode mask to assess') + parser = argparse.ArgumentParser(description="Split up fastq files by barcode") + parser.add_argument( + "--processing", + dest="processing_file", + action="store", + required=True, + help="processing.json to use (mandatory)", + ) + parser.add_argument( + "--barcodes", + dest="barcodes_file", + action="store", + required=True, + help="barcode output to compare", + ) + parser.add_argument( + "--bcmask", + dest="barcodes_mask", + action="store", + required=True, + help="barcode mask to assess", + ) args = parser.parse_args() return args + # Generates the barcode reporting mask from processing.json # Returns a list of all barcode lengths represented in the data def get_barcode_masks(json_data): @@ -43,28 +60,33 @@ def format_difference(x): return masks + # format length def format_length(x): - if (x): + if x: return len(x["sequence"]) else: - return '0' + return "0" + # Determines how many different sizes of barcodes there are and returns a set of them def get_barcode_lengths(json_data): - # set of each unique length in the data - lengths = set([ "{}-{}".format( - format_length(lib["barcode1"]), - format_length(lib["barcode2"])) - for lib in json_data["libraries"] ]) + lengths = set( + [ + "{}-{}".format( + format_length(lib["barcode1"]), format_length(lib["barcode2"]) + ) + for lib in json_data["libraries"] + ] + ) # Make sure only 1 report is run each for single/dual indexed barcodes until reporting is more flexible tempbc1, tempbc2 = [], [] finalList = [] for n in lengths: - if n[2] == '0': + if n[2] == "0": tempbc1.append(n) else: tempbc2.append(n) @@ -75,6 +97,7 @@ def get_barcode_lengths(json_data): return finalList + def main(argv): args = parseArgs() barcodes = json.load(open(args.barcodes_file)) @@ -85,29 +108,27 @@ def main(argv): barcode_lengths = get_barcode_lengths(process) masks = get_barcode_masks(process) - for lib in process['libraries']: - + for lib in process["libraries"]: # only look at libraries with the same mask length1 = format_length(lib["barcode1"]) length2 = format_length(lib["barcode2"]) 
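Each library here gets a "length1-length2" key built from its two barcode sequences, and that key is what selects the matching barcode mask report. A small self-contained sketch of the key computation; the library records are invented for the example, while the real script reads them from processing.json.

def barcode_length(barcode):
    # Mirrors the format_length() helper above: a missing barcode counts as length 0.
    return len(barcode["sequence"]) if barcode else 0

# Invented library records, shaped like entries in the processing.json "libraries" list.
libraries = [
    {"barcode1": {"sequence": "ACGTACGT"}, "barcode2": {"sequence": "TTGGCCAA"}},
    {"barcode1": {"sequence": "ACGTAC"}, "barcode2": None},
]

keys = [
    "{}-{}".format(barcode_length(lib["barcode1"]), barcode_length(lib["barcode2"]))
    for lib in libraries
]
print(sorted(set(keys)))  # ['6-0', '8-8'] -> one mask grouping per unique length combination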
lengths = str(length1) + "-" + str(length2) index = barcode_lengths.index(lengths) if mask == masks[index]: - # check to see if the barcode is represent in the report bcs = "" - if lib['barcode2'] is not None: - bc1 = lib['barcode1']['reverse_sequence'] - bc2 = lib['barcode2']['sequence'] - bcs = bc1 + bc2 + if lib["barcode2"] is not None: + bc1 = lib["barcode1"]["reverse_sequence"] + bc2 = lib["barcode2"]["sequence"] + bcs = bc1 + bc2 else: - bc1 = lib['barcode1']['reverse_sequence'] - bcs = bc1 + bc1 = lib["barcode1"]["reverse_sequence"] + bcs = bc1 - lane = lib['lane'] - for l in barcodes['Lanes']: - if lane == l['LaneIndex']: - if bcs in l['Counts']: + lane = lib["lane"] + for l in barcodes["Lanes"]: + if lane == l["LaneIndex"]: + if bcs in l["Counts"]: next else: print(lib) @@ -117,5 +138,6 @@ def main(argv): print(success) + if __name__ == "__main__": main(sys.argv) diff --git a/scripts/flowcells/barcode_count_from_stats_file.py b/scripts/flowcells/barcode_count_from_stats_file.py index b14ba3c3..f3cd372e 100644 --- a/scripts/flowcells/barcode_count_from_stats_file.py +++ b/scripts/flowcells/barcode_count_from_stats_file.py @@ -9,44 +9,63 @@ "min_count": 1000000, } + def parser_setup(): parser = argparse.ArgumentParser() # Optional - parser.add_argument("-c", "--min-count", type=int, dest="min_count", - help="The minimum number of reads to report") + parser.add_argument( + "-c", + "--min-count", + type=int, + dest="min_count", + help="The minimum number of reads to report", + ) # Mandatory - parser.add_argument("-s", "--stats", dest="stats_file", - required=True, - help="The JSON file to read stats from. Generally fastq/Stats/Stats.json") - parser.add_argument("-b", "--basedir", dest="base_dir", - required=True, - help="The base directory, like /net/seq/data/sequencers/DATE_A#####_####_FLOWCELL_LABEL") - parser.add_argument("-m", "--mask", dest="mask", - required=True, - help="The barcode mask, like y151,i8,i8,y151") + parser.add_argument( + "-s", + "--stats", + dest="stats_file", + required=True, + help="The JSON file to read stats from. 
Generally fastq/Stats/Stats.json", + ) + parser.add_argument( + "-b", + "--basedir", + dest="base_dir", + required=True, + help="The base directory, like /net/seq/data/sequencers/DATE_A#####_####_FLOWCELL_LABEL", + ) + parser.add_argument( + "-m", + "--mask", + dest="mask", + required=True, + help="The barcode mask, like y151,i8,i8,y151", + ) - parser.set_defaults( **default_options ) + parser.set_defaults(**default_options) return parser + def main(): parser = parser_setup() poptions = parser.parse_args() odata = { - "Lanes": [], + "Lanes": [], "Mask": poptions.mask, "Sequencer": "NovaSeq", "BaseDir": poptions.base_dir, } with open(poptions.stats_file) as f: idata = json.load(f) - + for lane in idata["UnknownBarcodes"]: olane = { "LaneIndex": lane["Lane"], "Total": None, "Pass": None, "Counts": { - bc.replace("+",""): { "Total": count, "Pass": count } + bc.replace("+", ""): {"Total": count, "Pass": count} for (bc, count) in lane["Barcodes"].items() if count > poptions.min_count }, @@ -57,26 +76,29 @@ def main(): odata["Lanes"].append(olane) for conversion_result in idata["ConversionResults"]: - lane_num = conversion_result["LaneNumber"] - lane_idx = None - for (i, olane) in enumerate(odata["Lanes"]): - if int(olane["LaneIndex"]) == int(lane_num): - lane_idx = i - break - if lane_idx is None: - logging.error("Lane %s not in odata", lane_num) - for sample_info in conversion_result["DemuxResults"]: - for metric_info in sample_info["IndexMetrics"]: - # Get matching count - barcode = metric_info["IndexSequence"].replace("+","") - count = metric_info["MismatchCounts"]["0"] - # Update out_data - odata["Lanes"][lane_idx]["Counts"][barcode] = {"Total": count, "Pass": count} - odata["Lanes"][lane_idx]["Total"] += count - odata["Lanes"][lane_idx]["Pass"] += count - + lane_num = conversion_result["LaneNumber"] + lane_idx = None + for i, olane in enumerate(odata["Lanes"]): + if int(olane["LaneIndex"]) == int(lane_num): + lane_idx = i + break + if lane_idx is None: + logging.error("Lane %s not in odata", lane_num) + for sample_info in conversion_result["DemuxResults"]: + for metric_info in sample_info["IndexMetrics"]: + # Get matching count + barcode = metric_info["IndexSequence"].replace("+", "") + count = metric_info["MismatchCounts"]["0"] + # Update out_data + odata["Lanes"][lane_idx]["Counts"][barcode] = { + "Total": count, + "Pass": count, + } + odata["Lanes"][lane_idx]["Total"] += count + odata["Lanes"][lane_idx]["Pass"] += count print(json.dumps(odata)) + if __name__ == "__main__": main() diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py index f49bc422..7ae3e4ad 100644 --- a/scripts/flowcells/barcode_masks.py +++ b/scripts/flowcells/barcode_masks.py @@ -12,8 +12,12 @@ def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-p", "--processing", dest="processing", - help="The JSON file to read barcodes from") + parser.add_argument( + "-p", + "--processing", + dest="processing", + help="The JSON file to read barcodes from", + ) parser.set_defaults(**script_options) return parser @@ -53,24 +57,29 @@ def format_difference(x): def get_barcode_lengths(json_data): # in case barcode sequence is null max_len = json_data["flowcell"]["index_length"] + def format_length(x): - if (x): + if x: return min(max_len, len(x["sequence"])) else: return 0 # set of each unique length in the data - lengths = set(["{}-{}".format( - format_length(lib["barcode1"]), - format_length(lib["barcode2"])) - for lib in json_data["libraries"]]) + lengths = set( + [ + 
"{}-{}".format( + format_length(lib["barcode1"]), format_length(lib["barcode2"]) + ) + for lib in json_data["libraries"] + ] + ) # Make sure only 1 report is run each for single/dual indexed barcodes until reporting is more flexible tempbc1, tempbc2 = [], [] finalList = [] for n in lengths: - if n[2] == '0': + if n[2] == "0": tempbc1.append(n) else: tempbc2.append(n) @@ -88,13 +97,20 @@ def detect_collisions(json_data): num_lanes = max([lib["lane"] for lib in json_data["libraries"]]) for i in range(num_lanes): - barcodes = sorted(lib["barcode_index"] for lib in json_data["libraries"] - if lib["lane"] == i+1 and not lib["failed"]) - if (len(barcodes) != len(set(barcodes))): - collision = [barcode for x, barcode in enumerate( - barcodes) if barcode == barcodes[x-1]] + barcodes = sorted( + lib["barcode_index"] + for lib in json_data["libraries"] + if lib["lane"] == i + 1 and not lib["failed"] + ) + if len(barcodes) != len(set(barcodes)): + collision = [ + barcode + for x, barcode in enumerate(barcodes) + if barcode == barcodes[x - 1] + ] logging.error( - "Collision on lane {}. Barcode(s): {}\n".format(i+1, collision)) + "Collision on lane {}. Barcode(s): {}\n".format(i + 1, collision) + ) sys.exit(1) return True return False @@ -115,7 +131,7 @@ def main(): json_data = json.load(process_json) process_json.close() - #detect_collisions(json_data) + # detect_collisions(json_data) print(" ".join(get_barcode_masks(json_data))) diff --git a/scripts/flowcells/barcode_report.py b/scripts/flowcells/barcode_report.py index 1224d6c6..5307d571 100755 --- a/scripts/flowcells/barcode_report.py +++ b/scripts/flowcells/barcode_report.py @@ -11,117 +11,165 @@ "threshold": 1000000, } + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-p", "--processing_file", dest="processing_file", - help="The processing file to use as a guide.") - - parser.add_argument("-b", "--basedir", dest="basedir", - help="The base flowcell directory") - parser.add_argument("-j", "--json", action="store_true", - help="Write output in JSON") - parser.add_argument("-t", "--threshold", dest="threshold", type=int, - help="The minimum cluster count") - parser.add_argument("-l", "--lane", dest="lane", type=int, - help="Report details for only the specified lane") - parser.set_defaults( **script_options ) + parser.add_argument( + "-p", + "--processing_file", + dest="processing_file", + help="The processing file to use as a guide.", + ) + + parser.add_argument( + "-b", "--basedir", dest="basedir", help="The base flowcell directory" + ) + parser.add_argument( + "-j", "--json", action="store_true", help="Write output in JSON" + ) + parser.add_argument( + "-t", + "--threshold", + dest="threshold", + type=int, + help="The minimum cluster count", + ) + parser.add_argument( + "-l", + "--lane", + dest="lane", + type=int, + help="Report details for only the specified lane", + ) + parser.set_defaults(**script_options) return parser + def sum_barcodes(input_files): totals = {} for infile in input_files: - f = open(infile, 'r') + f = open(infile, "r") for line in f: words = line.split() if len(words) != 2: continue count, barcode = words - barcode = barcode.replace('+', '-') + barcode = barcode.replace("+", "-") if not barcode in totals: totals[barcode] = 0 totals[barcode] += int(count) f.close() - return(totals) - + return totals + + def get_input_files_for_lane(data, lane, basedir): globs = [] files = [] - for l in data['libraries']: + for l in data["libraries"]: # TODO: Allow to work with NoIndex samples that span a lane - 
if l['lane'] == lane: + if l["lane"] == lane: name = "Project_%s/Sample_%s/%s_%s_L%03d_R1_???.barcodes.txt" % ( - l['project'], l['samplesheet_name'], l['samplesheet_name'], - l['realbarcode'], l['lane'] ) + l["project"], + l["samplesheet_name"], + l["samplesheet_name"], + l["realbarcode"], + l["lane"], + ) globs.append(os.path.join(basedir, name)) - globs.append("Undetermined_indices/Sample_lane%d/lane%d_Undetermined_L%03d_R1_???.barcodes.txt" % (lane, lane, lane)) + globs.append( + "Undetermined_indices/Sample_lane%d/lane%d_Undetermined_L%03d_R1_???.barcodes.txt" + % (lane, lane, lane) + ) for g in globs: files += glob.glob(g) return files + def apply_mask(mask, barcode_string): - orig_barcodes = barcode_string.split('-') + orig_barcodes = barcode_string.split("-") while len(orig_barcodes) < len(mask): - orig_barcodes.append(u'') - barcodes = [ orig_barcodes[i][:l] for (i, l) in enumerate(mask) ] + orig_barcodes.append("") + barcodes = [orig_barcodes[i][:l] for (i, l) in enumerate(mask)] return barcodes + def parse_bases_mask(mask_string): - mask = map(int, re.findall( r"""(?: i ( \d* ) )""", mask_string, re.X | re.I)) + mask = map(int, re.findall(r"""(?: i ( \d* ) )""", mask_string, re.X | re.I)) return mask + def get_expected_barcodes(data): - mask = parse_bases_mask(data['alignment_group']['bases_mask']) + mask = parse_bases_mask(data["alignment_group"]["bases_mask"]) libraries = {} - for l in data['libraries']: - if l['barcode_index'] == "NoIndex": + for l in data["libraries"]: + if l["barcode_index"] == "NoIndex": barcode = None else: - barcode = '-'.join( apply_mask(mask, l['barcode_index']) ) - lane = l['lane'] - l['realbarcode'] = barcode + barcode = "-".join(apply_mask(mask, l["barcode_index"])) + lane = l["lane"] + l["realbarcode"] = barcode if not lane in libraries: libraries[lane] = {} libraries[lane][barcode] = l return libraries + def merge_actual_and_expected(expected, actual): merged = {} - fields = ['samplesheet_name', 'id', 'purpose'] + fields = ["samplesheet_name", "id", "purpose"] if None in expected: - total_barcodes = sum( actual.values()) - merged['NoIndex'] = {'cluster_count': total_barcodes} + total_barcodes = sum(actual.values()) + merged["NoIndex"] = {"cluster_count": total_barcodes} for field in fields: - merged['NoIndex'][field] = expected[None][field] + merged["NoIndex"][field] = expected[None][field] else: for barcode in actual.keys(): - merged[barcode] = { 'cluster_count': actual[barcode] } + merged[barcode] = {"cluster_count": actual[barcode]} if barcode in expected: for field in fields: merged[barcode][field] = expected[barcode][field] return merged + def print_stats_txt(stats, threshold): for lane in stats.keys(): lane_stats = stats[lane] - print "====== Lane %d =====" % (lane) - for barcode in sorted(lane_stats.keys(), key = lambda x: lane_stats[x]['cluster_count'], reverse = True): + print("====== Lane %d =====" % (lane)) + for barcode in sorted( + lane_stats.keys(), + key=lambda x: lane_stats[x]["cluster_count"], + reverse=True, + ): m = lane_stats[barcode] - if (m['cluster_count'] >= threshold) or ('samplesheet_name' in m): - print "%s,%d,%s,%s" % (barcode, m['cluster_count'], m.get('samplesheet_name',''), m.get('purpose','')) - print "" + if (m["cluster_count"] >= threshold) or ("samplesheet_name" in m): + print( + "%s,%d,%s,%s" + % ( + barcode, + m["cluster_count"], + m.get("samplesheet_name", ""), + m.get("purpose", ""), + ) + ) + print("") + def print_stats_json(stats, threshold): for lane in stats.keys(): - stats[lane] = { k: v for k, v in 
stats[lane].iteritems() - if v['cluster_count'] >= threshold or 'samplesheet_name' in v } - print json.dumps(stats, sort_keys=True, indent=2) + stats[lane] = { + k: v + for k, v in stats[lane].iteritems() + if v["cluster_count"] >= threshold or "samplesheet_name" in v + } + print(json.dumps(stats, sort_keys=True, indent=2)) -def main(args = sys.argv): + +def main(args=sys.argv): parser = parser_setup() poptions = parser.parse_args() @@ -129,20 +177,24 @@ def main(args = sys.argv): process_json = open(poptions.processing_file) processing_data = json.load(process_json) process_json.close() - expected = get_expected_barcodes( processing_data ) + expected = get_expected_barcodes(processing_data) if poptions.lane: lanes = [poptions.lane] else: - lanes = sorted(list(set([l['lane'] for l in processing_data['libraries']]))) + lanes = sorted(list(set([l["lane"] for l in processing_data["libraries"]]))) # Get actual barcodes and merge with expected compiled_stats = {} for lane in lanes: - barcode_files = get_input_files_for_lane(processing_data, lane, poptions.basedir) + barcode_files = get_input_files_for_lane( + processing_data, lane, poptions.basedir + ) actual_barcodes = sum_barcodes(barcode_files) - - compiled_stats[lane] = merge_actual_and_expected(expected[lane], actual_barcodes) + + compiled_stats[lane] = merge_actual_and_expected( + expected[lane], actual_barcodes + ) # Print out if poptions.json: @@ -150,5 +202,6 @@ def main(args = sys.argv): else: print_stats_txt(compiled_stats, poptions.threshold) + if __name__ == "__main__": main() diff --git a/scripts/flowcells/demux_fastq.py b/scripts/flowcells/demux_fastq.py index eee36ece..16b4327a 100644 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -19,38 +19,58 @@ lengths = set([]) + def parseArgs(): - parser = argparse.ArgumentParser( - description='Split up fastq files by barcode') - parser.add_argument('--mismatches', type=int, default=0, - help='number of mismatches') - parser.add_argument('--processing', dest='processing_file', required=True, - help='processing.json to use (mandatory)') - parser.add_argument('--suffix', dest='suffix', default='', - help='suffix to add to sample names') - parser.add_argument('--lane', dest='lane', type=int, - default=1, help='Lane to process (default 1)') - parser.add_argument('--autosuffix', action="store_true", - default=False, help='Automatically guess a suffix name') - parser.add_argument('--outdir', dest='outdir', - default='.', help='Output directory') - parser.add_argument('--dry-run', dest='dry_run', default=False, - action='store_true', help='Do not actually demultiplex') - parser.add_argument('--ignore_failed_lanes', default=False, action="store_true", - help="Ignore any lanes marked as failed in processing") - parser.add_argument('--debug', dest="debug", action="store_true", - help="Output debugging info") - parser.add_argument('infile', nargs='+') + parser = argparse.ArgumentParser(description="Split up fastq files by barcode") + parser.add_argument( + "--mismatches", type=int, default=0, help="number of mismatches" + ) + parser.add_argument( + "--processing", + dest="processing_file", + required=True, + help="processing.json to use (mandatory)", + ) + parser.add_argument( + "--suffix", dest="suffix", default="", help="suffix to add to sample names" + ) + parser.add_argument( + "--lane", dest="lane", type=int, default=1, help="Lane to process (default 1)" + ) + parser.add_argument( + "--autosuffix", + action="store_true", + default=False, + help="Automatically 
guess a suffix name", + ) + parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") + parser.add_argument( + "--dry-run", + dest="dry_run", + default=False, + action="store_true", + help="Do not actually demultiplex", + ) + parser.add_argument( + "--ignore_failed_lanes", + default=False, + action="store_true", + help="Ignore any lanes marked as failed in processing", + ) + parser.add_argument( + "--debug", dest="debug", action="store_true", help="Output debugging info" + ) + parser.add_argument("infile", nargs="+") args = parser.parse_args() return args def mismatch(word, mismatches): - """ Generator for mismatches + """Generator for mismatches returns original string + whatever variations have at most - [mismatches] hamming distance """ - for d in range(mismatches+1): + [mismatches] hamming distance""" + for d in range(mismatches + 1): for locs in itertools.combinations(range(len(word)), d): this_word = [[char] for char in word] for loc in locs: @@ -63,7 +83,8 @@ def mismatch(word, mismatches): def guess_suffix(filename): file = os.path.basename(filename) - nextseq_format = re.compile(r""" + nextseq_format = re.compile( + r""" ^ # Start Undetermined # _ S0 # index @@ -74,14 +95,17 @@ def guess_suffix(filename): # End suffix .fastq.gz $ - """, re.X) + """, + re.X, + ) match = nextseq_format.search(file) if match: - suffix = "_R%s_%s" % (match.group('read'), match.group('lane')) + suffix = "_R%s_%s" % (match.group("read"), match.group("lane")) return suffix - hiseq_format = re.compile(r""" + hiseq_format = re.compile( + r""" ^ # Start lane \d+ _ # lane number Undetermined # @@ -92,56 +116,60 @@ def guess_suffix(filename): ) # End suffix .fastq.gz $ - """, re.X) + """, + re.X, + ) match = hiseq_format.search(file) if match: - return match.group('suffix') + return match.group("suffix") return "" -def parse_processing_file(file, mismatches, suffix, lane, outdir, ignore_failed_lanes=False): - """ Decode processing file into barcode->file assignments """ +def parse_processing_file( + file, mismatches, suffix, lane, outdir, ignore_failed_lanes=False +): + """Decode processing file into barcode->file assignments""" barcodes = {} labels = {} with open(file) as data_file: data = json.load(data_file) - run_type = data['flowcell']['run_type'] - index_len = data['flowcell']['index_length'] + run_type = data["flowcell"]["run_type"] + index_len = data["flowcell"]["index_length"] # Only some flowcell types need to treat different lanes differently if run_type == "NextSeq 500": - lane_libraries = data['libraries'] + lane_libraries = data["libraries"] elif run_type == "HISEQ V4": - lane_libraries = [lib for lib in data['libraries'] if lib['lane'] == lane] + lane_libraries = [lib for lib in data["libraries"] if lib["lane"] == lane] elif run_type == "HiSeq 4000": - lane_libraries = [lib for lib in data['libraries'] if lib['lane'] == lane] + lane_libraries = [lib for lib in data["libraries"] if lib["lane"] == lane] # TODO: Is this always correct? 
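As a rough check on the mismatch() generator above: with one allowed mismatch it yields the original barcode plus every single-position substitution, and those variants are what get registered in the barcode lookup table. A standalone sketch follows, assuming the substitution alphabet is ACGTN (the exact alphabet line sits outside the hunk shown here).

import itertools

def mismatch(word, mismatches):
    # Same idea as above: every variant within the given Hamming distance.
    for d in range(mismatches + 1):
        for locs in itertools.combinations(range(len(word)), d):
            this_word = [[char] for char in word]
            for loc in locs:
                this_word[loc] = list("ACGTN")  # assumed alphabet, see note above
            for poss in itertools.product(*this_word):
                yield "".join(poss)

variants = set(mismatch("ACGT", 1))
print(len(variants))  # 17: the original plus 4 positions x 4 alternative letters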
elif run_type.startswith("Novaseq 6000"): - lane_libraries = [lib for lib in data['libraries'] if lib['lane'] == lane] + lane_libraries = [lib for lib in data["libraries"] if lib["lane"] == lane] else: - logging.warn( - "Run type %s not supported; using all libraries" % run_type) - lane_libraries = data['libraries'] + logging.warn("Run type %s not supported; using all libraries" % run_type) + lane_libraries = data["libraries"] for library in lane_libraries: - - if library.get('alignments', []): - label = library['alignments'][0]['sample_name'] + if library.get("alignments", []): + label = library["alignments"][0]["sample_name"] else: label = "%s_%s_L%03d" % ( - library['samplesheet_name'], library['barcode_index'], library['lane']) + library["samplesheet_name"], + library["barcode_index"], + library["lane"], + ) if ignore_failed_lanes and library["failed"]: logging.info("Ignoring failed library %s" % label) continue - project_dir = "Project_%s" % library['project'] - sample_dir = "Sample_%s" % library['samplesheet_name'] + project_dir = "Project_%s" % library["project"] + sample_dir = "Sample_%s" % library["samplesheet_name"] library_dir = os.path.join(outdir, project_dir, sample_dir) - outfile_name = os.path.join( - library_dir, "%s%s.fastq.gz" % (label, suffix)) + outfile_name = os.path.join(library_dir, "%s%s.fastq.gz" % (label, suffix)) try: os.makedirs(library_dir) @@ -149,7 +177,7 @@ def parse_processing_file(file, mismatches, suffix, lane, outdir, ignore_failed_ if exception.errno != errno.EEXIST: raise - barcode_indices = library['barcode_index'].split("-") + barcode_indices = library["barcode_index"].split("-") barcode1 = barcode_indices[0] barcode2 = barcode_indices[1] if len(barcode_indices) > 1 else "" @@ -168,27 +196,32 @@ def parse_processing_file(file, mismatches, suffix, lane, outdir, ignore_failed_ # TODO: This can be smarter if barcode in barcodes: logging.error( - "Barcode %s already taken, lower --mismatches! (taken by %s+%s)" % (barcode, barcode1, barcode2)) + "Barcode %s already taken, lower --mismatches! (taken by %s+%s)" + % (barcode, barcode1, barcode2) + ) sys.exit(1) barcodes[barcode] = label labels[label] = {"filtered": 0, "unfiltered": 0, "total": 0} # TODO: Warning! this will overwrite files! - outfile = open(outfile_name, 'wb') + outfile = open(outfile_name, "wb") labels[label]["fh"] = outfile labels[label]["out"] = subprocess.Popen( - ['gzip', '-7'], stdout=outfile, stdin=subprocess.PIPE) + ["gzip", "-7"], stdout=outfile, stdin=subprocess.PIPE + ) - logging.info("Mapping %d barcodes to %s libraries" % - (len(barcodes), len(lane_libraries))) + logging.info( + "Mapping %d barcodes to %s libraries" % (len(barcodes), len(lane_libraries)) + ) logging.debug(barcodes) return barcodes, labels def split_file(filename, barcodes, labels): - """ Demultiplex a file """ - barcode_re = re.compile(r""" + """Demultiplex a file""" + barcode_re = re.compile( + r""" [012]: # ([YN]): # Fail/pass chastity filtering [01]: # @@ -196,17 +229,21 @@ def split_file(filename, barcodes, labels): \+? # Optional separator (+) ( [AGCTN] {6,20} )? 
# Optionally, second barcode $ - """, re.X) + """, + re.X, + ) tally = 0 logging.info("Demultiplexing file: %s" % filename) - if filename.endswith('.gz'): + if filename.endswith(".gz"): parsein = subprocess.Popen( - ['zcat', filename], stdout=subprocess.PIPE, universal_newlines=True) + ["zcat", filename], stdout=subprocess.PIPE, universal_newlines=True + ) else: parsein = subprocess.Popen( - ['cat', filename], stdout=subprocess.PIPE, universal_newlines=True) + ["cat", filename], stdout=subprocess.PIPE, universal_newlines=True + ) for record, seq, qual in FastqGeneralIterator(parsein.stdout): tally += 1 @@ -229,27 +266,26 @@ def split_file(filename, barcodes, labels): barcode2 = "" matched = False for fmt in lengths: - barcode = (barcode1[:fmt[0]], barcode2[:fmt[1]]) + barcode = (barcode1[: fmt[0]], barcode2[: fmt[1]]) if barcode in barcodes: label = barcodes[barcode] matched = True break if matched: - labels[label]['total'] += 1 + labels[label]["total"] += 1 # Replace recorded barcode - sep_index = record.rfind(':') - record = record[:sep_index + 1] + barcode1 + "+" + barcode2 + sep_index = record.rfind(":") + record = record[: sep_index + 1] + barcode1 + "+" + barcode2 # write to FASTQ - text = bytes('@' + record + '\n' + seq + - '\n+\n' + qual + '\n', 'UTF-8') - labels[label]['out'].stdin.write(text) + text = bytes("@" + record + "\n" + seq + "\n+\n" + qual + "\n", "UTF-8") + labels[label]["out"].stdin.write(text) if filter_str == "Y": labels[label]["filtered"] += 1 else: - labels[label]['unfiltered'] += 1 + labels[label]["unfiltered"] += 1 # Wait for all subprocesses to finish parsein.communicate() @@ -275,7 +311,13 @@ def main(argv): logging.info("--autosuffix, guessing suffix as %s" % args.suffix) barcodes, labels = parse_processing_file( - args.processing_file, args.mismatches, args.suffix, args.lane, args.outdir, ignore_failed_lanes=args.ignore_failed_lanes) + args.processing_file, + args.mismatches, + args.suffix, + args.lane, + args.outdir, + ignore_failed_lanes=args.ignore_failed_lanes, + ) if args.dry_run: logging.info("Dry run, exiting") @@ -290,8 +332,8 @@ def main(argv): print("%s\t%s" % (label, str(info))) for label, info in labels.items(): - info['out'].communicate() - info['fh'].close() + info["out"].communicate() + info["fh"].close() if __name__ == "__main__": diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index f9aedfe9..78bccf74 100644 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -21,7 +21,7 @@ def parser_setup(): - """ Sets up parser """ + """Sets up parser""" parser = argparse.ArgumentParser() @@ -72,8 +72,15 @@ def parser_setup(): def create_links( - lane, read, input_basedir, output_basedir, dry_run=False, undetermined=False, is_pool=False, merge_across_lanes=False, - ): + lane, + read, + input_basedir, + output_basedir, + dry_run=False, + undetermined=False, + is_pool=False, + merge_across_lanes=False, +): """ Create the links between the input directories and output dir If dry_run is passed, will print them instead of creating them @@ -95,7 +102,6 @@ def create_links( else: sample_name = "%s_L%03d" % (short_name, lane_num) - if lane.get("library_pool"): is_pool = True @@ -112,7 +118,11 @@ def create_links( ) short_name = re.sub(r"_", "-", short_name) - lane_lane = "*" if (merge_across_lanes or "lane" not in lane) else "_L%03d" % int(lane["lane"]) + lane_lane = ( + "*" + if (merge_across_lanes or "lane" not in lane) + else "_L%03d" % int(lane["lane"]) + ) input_wildcard = os.path.join( 
input_basedir, "%s_S*%s_%s_???.fastq.gz" % (short_name, lane_lane, read) ) @@ -159,8 +169,22 @@ def main(): data = json.loads(open(poptions.processing_file, "r").read()) for lane in data["libraries"]: - create_links(lane, "R1", input_dir, poptions.output_dir, poptions.dry_run, merge_across_lanes=poptions.merge_across_lanes) - create_links(lane, "R2", input_dir, poptions.output_dir, poptions.dry_run, merge_across_lanes=poptions.merge_across_lanes) + create_links( + lane, + "R1", + input_dir, + poptions.output_dir, + poptions.dry_run, + merge_across_lanes=poptions.merge_across_lanes, + ) + create_links( + lane, + "R2", + input_dir, + poptions.output_dir, + poptions.dry_run, + merge_across_lanes=poptions.merge_across_lanes, + ) undet_lane = { "alignments": [{"sample_name": "lane1_Undetermined_L001"}], @@ -168,22 +192,27 @@ def main(): } for read in ["R1", "R2"]: create_links( - undet_lane, read, input_dir, poptions.output_dir, poptions.dry_run, undetermined=True, + undet_lane, + read, + input_dir, + poptions.output_dir, + poptions.dry_run, + undetermined=True, merge_across_lanes=poptions.merge_across_lanes, ) # Set up conversion table libs_to_lanes = defaultdict(set) for lane in data["libraries"]: - libs_to_lanes[lane['library']].add(lane['lane']) + libs_to_lanes[lane["library"]].add(lane["lane"]) - for (pool, info) in data["library_pools"].items(): + for pool, info in data["library_pools"].items(): barcode = info["barcode1"] if info.get("barcode2"): barcode = "%s_%s" % (barcode, info["barcode2"]) lane_nums = set() for lib in info["libraries"]: - lib_num = int(re.sub(r'[^\d]+', '', lib)) + lib_num = int(re.sub(r"[^\d]+", "", lib)) lane_nums.update(libs_to_lanes[lib_num]) for lane_num in sorted(lane_nums): @@ -195,7 +224,13 @@ def main(): } for read in ["R1", "R2"]: create_links( - lane, read, input_dir, poptions.output_dir, poptions.dry_run, is_pool=True, merge_across_lanes=poptions.merge_across_lanes, + lane, + read, + input_dir, + poptions.output_dir, + poptions.dry_run, + is_pool=True, + merge_across_lanes=poptions.merge_across_lanes, ) diff --git a/scripts/flowcells/link_rapidrun.py b/scripts/flowcells/link_rapidrun.py index ee0a50a0..fff50909 100644 --- a/scripts/flowcells/link_rapidrun.py +++ b/scripts/flowcells/link_rapidrun.py @@ -17,63 +17,84 @@ "dry_run": False, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-b", "--base_dir", dest="base_dir", - help="The base directory of the flowcell.") - parser.add_argument("-p", "--processing_file", dest="processing_file", - help="The processing_file to use as a guide.") - - parser.add_argument("--dry-run", dest="dry_run", action="store_true", - help="Only print out planned symlinks.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-b", "--base_dir", dest="base_dir", help="The base directory of the flowcell." 
+ ) + parser.add_argument( + "-p", + "--processing_file", + dest="processing_file", + help="The processing_file to use as a guide.", + ) + + parser.add_argument( + "--dry-run", + dest="dry_run", + action="store_true", + help="Only print out planned symlinks.", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -def create_links(lane, read, base_dir, dry_run = False): +def create_links(lane, read, base_dir, dry_run=False): alignment = lane["alignments"][0] - - sample_dir = os.path.join( base_dir, "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"] ) + + sample_dir = os.path.join( + base_dir, "Project_%s" % lane["project"], "Sample_%s" % lane["samplesheet_name"] + ) sample_name = alignment["sample_name"] - - print "\nCreating links for %s %s\n" % (sample_name, read) - + + print("\nCreating links for %s %s\n" % (sample_name, read)) + os.chdir(sample_dir) - + lane1_fastq = glob.glob("%s_%s_*.fastq.gz" % (sample_name, read)) - + replace = re.compile(r"_L001$") - L2_sample_name = replace.sub('_L002', sample_name ) - + L2_sample_name = replace.sub("_L002", sample_name) + lane2_fastq = glob.glob("%s_%s_*.fastq.gz" % (L2_sample_name, read)) - + lane1_filecount = len(lane1_fastq) lane2_filecount = len(lane2_fastq) - - for lane2_filenum in range(1, lane2_filecount+1): - effective_filenum = lane1_filecount + lane2_filenum + + for lane2_filenum in range(1, lane2_filecount + 1): + effective_filenum = lane1_filecount + lane2_filenum orig_filename = "%s_%s_%03d.fastq.gz" % (L2_sample_name, read, lane2_filenum) new_filename = "%s_%s_%03d.fastq.gz" % (sample_name, read, effective_filenum) - - print "Linking %s => %s" % (orig_filename, new_filename) - + + print("Linking %s => %s" % (orig_filename, new_filename)) + if not dry_run: os.symlink(orig_filename, new_filename) - -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -88,16 +109,15 @@ def main(args = sys.argv): base_dir = poptions.base_dir - p = json.loads(open(poptions.processing_file, 'r').read()) + p = json.loads(open(poptions.processing_file, "r").read()) - for lane in p['libraries']: + for lane in p["libraries"]: create_links(lane, "R1", base_dir, poptions.dry_run) create_links(lane, "R2", base_dir, poptions.dry_run) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it if __name__ == "__main__": main() - - diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index eea44fa9..7ce0a8db 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -23,21 +23,38 @@ "filename": "SampleSheet.withmask.{mask}.csv", } + def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-p", "--processing", dest="processing", - help="The JSON file to read barcodes from (default: processing.json)") - parser.add_argument("--reverse_barcode1", dest="reverse_barcode1", action="store_true", - help="Use reverse sequence for barcode1") - parser.add_argument("--reverse_barcode2", dest="reverse_barcode2", action="store_true", - help="Use reverse sequence for barcode2") - parser.add_argument("--filename", - help="The template to use for filename, with the {mask} 
formatting") + parser.add_argument( + "-p", + "--processing", + dest="processing", + help="The JSON file to read barcodes from (default: processing.json)", + ) + parser.add_argument( + "--reverse_barcode1", + dest="reverse_barcode1", + action="store_true", + help="Use reverse sequence for barcode1", + ) + parser.add_argument( + "--reverse_barcode2", + dest="reverse_barcode2", + action="store_true", + help="Use reverse sequence for barcode2", + ) + parser.add_argument( + "--filename", + help="The template to use for filename, with the {mask} formatting", + ) parser.set_defaults(**SCRIPT_OPTIONS) return parser -def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2: bool) -> "[dict]": +def get_barcode_assignments( + data: dict, reverse_barcode1: bool, reverse_barcode2: bool +) -> "[dict]": assignments = [] # This will store our pool samplesheet lines @@ -53,10 +70,18 @@ def get_barcode_assignments(data: dict, reverse_barcode1: bool, reverse_barcode2 if assignment["sample"] == "None": assignment["sample"] = "LANE%d" % libdata["id"] if libdata.get("barcode1") is not None: - assignment["barcode1"] = libdata["barcode1"]["reverse_sequence"] if reverse_barcode1 else libdata["barcode1"]["sequence"] + assignment["barcode1"] = ( + libdata["barcode1"]["reverse_sequence"] + if reverse_barcode1 + else libdata["barcode1"]["sequence"] + ) if libdata.get("barcode2") is not None: - assignment["barcode2"] = libdata["barcode2"]["reverse_sequence"] if reverse_barcode2 else libdata["barcode2"]["sequence"] - + assignment["barcode2"] = ( + libdata["barcode2"]["reverse_sequence"] + if reverse_barcode2 + else libdata["barcode2"]["sequence"] + ) + assignments.append(assignment) return assignments @@ -80,8 +105,9 @@ def make_samplesheet_header(name: str, date: str) -> str: def group_assignments(assignments: "[dict]") -> "[[dict]]": - """ Groups the barcode assignments by length """ + """Groups the barcode assignments by length""" barcode_length_combinations = defaultdict(list) + def get_len(d): return 0 if d is None else len(d) @@ -93,10 +119,11 @@ def get_len(d): barcode_length_combinations[key].append(assignment) return barcode_length_combinations + def parse_mask(mask: str) -> "[[(str, int)]]": parts = [] str_parts = mask.split(",") - regex = r'(?P[yni])(?P[0-9]*)' + regex = r"(?P[yni])(?P[0-9]*)" for part in str_parts: pieces = [] for match in re.finditer(regex, part, flags=re.I): @@ -111,8 +138,10 @@ def parse_mask(mask: str) -> "[[(str, int)]]": parts.append(pieces) return parts + def mask_to_str(mask: "[[(str, int)]]") -> str: - """ Convert a mask in parts back into a string """ + """Convert a mask in parts back into a string""" + def format_piece(letter, num): if num == 0: return "" @@ -121,13 +150,9 @@ def format_piece(letter, num): else: return letter + str(num) - return ",".join([ - "".join([ - format_piece(*piece) - for piece in part - ]) - for part in mask - ]) + return ",".join( + ["".join([format_piece(*piece) for piece in part]) for part in mask] + ) def adjust_mask_for_lengths(mask_parts, len1, len2): @@ -142,7 +167,11 @@ def adjust_mask_for_lengths(mask_parts, len1, len2): is_index_read = any(piece[0] == "i" for piece in read) if is_index_read: if any(piece[0] == "y" for piece in read): - raise Exception("Mixed read/index in barcode mask '{}', don't know how to deal with this".format(mask_to_str(mask_parts))) + raise Exception( + "Mixed read/index in barcode mask '{}', don't know how to deal with this".format( + mask_to_str(mask_parts) + ) + ) index_reads_seen += 1 if 
index_reads_seen == 1: # first barcode @@ -166,7 +195,7 @@ def adjust_mask_for_lengths(mask_parts, len1, len2): def write_samplesheets(name, filename_template, date, root_mask, assignments): - """ Write out the sample sheets """ + """Write out the sample sheets""" mask_parts = parse_mask(root_mask) max_bclen1 = 0 max_bclen2 = 0 @@ -182,37 +211,45 @@ def write_samplesheets(name, filename_template, date, root_mask, assignments): max_bclen2 = read_len for assign in assignments: - assign['barcode1'] = assign['barcode1'][:max_bclen1] - assign['barcode2'] = assign['barcode2'][:max_bclen2] + assign["barcode1"] = assign["barcode1"][:max_bclen1] + assign["barcode2"] = assign["barcode2"][:max_bclen2] # Trim barcodes to make sure they fit in the read groups = group_assignments(assignments) - for (barcode_lengths, assigns) in groups.items(): + for barcode_lengths, assigns in groups.items(): new_mask = adjust_mask_for_lengths(mask_parts, *barcode_lengths) header = make_samplesheet_header(name, date) body = make_samplesheet_body(assigns) samplesheet_contents = header + body filename = filename_template.format(mask=mask_to_str(new_mask)) - print("Writing {filename} with {new_mask}".format(filename=filename, new_mask=mask_to_str(new_mask))) + print( + "Writing {filename} with {new_mask}".format( + filename=filename, new_mask=mask_to_str(new_mask) + ) + ) with open(filename, "w") as f: f.write(samplesheet_contents) + def make_samplesheet_body(barcode_assignments: "[dict]") -> str: - """ Create samplesheet text from assignments """ + """Create samplesheet text from assignments""" lines = [] for ba in barcode_assignments: - line = ",".join([ - str(ba["lane"]), - ba["sample"], - ba["sample"], - str(ba["barcode1"]), - str(ba["barcode2"]), - "", - ]) + line = ",".join( + [ + str(ba["lane"]), + ba["sample"], + ba["sample"], + str(ba["barcode1"]), + str(ba["barcode2"]), + "", + ] + ) lines.append(line) return "\n".join(sorted(lines)) + def main(args=sys.argv): parser = parser_setup() poptions = parser.parse_args() @@ -221,16 +258,19 @@ def main(args=sys.argv): data = json.load(process_json) process_json.close() - assignments = get_barcode_assignments(data, - poptions.reverse_barcode1, - poptions.reverse_barcode2, - ) + assignments = get_barcode_assignments( + data, + poptions.reverse_barcode1, + poptions.reverse_barcode2, + ) mask = data["alignment_group"]["bases_mask"] - write_samplesheets(name="Altius", - filename_template=poptions.filename, - date=str(datetime.date.today()), - root_mask=mask, - assignments=assignments) + write_samplesheets( + name="Altius", + filename_template=poptions.filename, + date=str(datetime.date.today()), + root_mask=mask, + assignments=assignments, + ) if __name__ == "__main__": diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index 3bda907e..719be522 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -14,26 +14,40 @@ import json MAX_MISMATCH_LEVEL = 1 # Nextseq can allow 2, Hiseq 2500 allows only 1 -POSSIBLE_MISMATCH_LEVELS = range( MAX_MISMATCH_LEVEL, -1, -1 ) +POSSIBLE_MISMATCH_LEVELS = range(MAX_MISMATCH_LEVEL, -1, -1) + +script_options = {"processing": "processing.json"} -script_options = { - "processing": "processing.json" -} def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-p", "--processing", dest="processing", - help="The JSON file to read barcodes from") - parser.add_argument("--ignore_failed_lanes", dest="ignore_failed_lanes", action="store_true", default=False, - 
help="Ignore failed lanes when calculating max mismatch.") - parser.add_argument("--allow_collisions", dest="allow_collisions", action="store_true", default=False, - help="Don't exit with error even if collisions are found (workaround)") - - parser.set_defaults( **script_options ) + parser.add_argument( + "-p", + "--processing", + dest="processing", + help="The JSON file to read barcodes from", + ) + parser.add_argument( + "--ignore_failed_lanes", + dest="ignore_failed_lanes", + action="store_true", + default=False, + help="Ignore failed lanes when calculating max mismatch.", + ) + parser.add_argument( + "--allow_collisions", + dest="allow_collisions", + action="store_true", + default=False, + help="Don't exit with error even if collisions are found (workaround)", + ) + + parser.set_defaults(**script_options) return parser + def gen_snps(word, mismatches): - for d in range(mismatches+1): + for d in range(mismatches + 1): for locs in itertools.combinations(range(len(word)), d): thisWord = [[char] for char in word] for loc in locs: @@ -42,10 +56,17 @@ def gen_snps(word, mismatches): for poss in itertools.product(*thisWord): yield "".join(poss) + def generate_barcodes(barcode_tuple, mismatch_tuple): - return set(itertools.product(* [ - gen_snps( barcode_tuple[i], mismatch_tuple[i] ) - for i in range(len(barcode_tuple)) ])) + return set( + itertools.product( + *[ + gen_snps(barcode_tuple[i], mismatch_tuple[i]) + for i in range(len(barcode_tuple)) + ] + ) + ) + def is_mismatch_level_okay(barcodes, mismatch_tuple): # If there's one or fewer barcodes in a lane, any number of mismatches is okay @@ -57,61 +78,69 @@ def is_mismatch_level_okay(barcodes, mismatch_tuple): if len(barcodes) == 1: return True for barcode in barcodes: - new_barcodes = generate_barcodes( barcode, mismatch_tuple ) - if barcode_collection.isdisjoint( new_barcodes ): + new_barcodes = generate_barcodes(barcode, mismatch_tuple) + if barcode_collection.isdisjoint(new_barcodes): barcode_collection.update(new_barcodes) else: return False return True + def get_max_mismatch_level(lane_set, index_count): - mismatch_choices = list( itertools.product( POSSIBLE_MISMATCH_LEVELS, repeat=index_count)) - lanes = lane_set.values() # We don't actually care about lane labels (the key) + mismatch_choices = list( + itertools.product(POSSIBLE_MISMATCH_LEVELS, repeat=index_count) + ) + lanes = lane_set.values() # We don't actually care about lane labels (the key) for choice in mismatch_choices: - no_collisions = all( [ - is_mismatch_level_okay(barcodes, choice) - for barcodes in lanes ] ) + no_collisions = all( + [is_mismatch_level_okay(barcodes, choice) for barcodes in lanes] + ) if no_collisions: return choice return None + # Takes in mask & barcode, returns list of trimmed barcodes def apply_mask(mask, barcode_string): if barcode_string is None: - barcode_string = u'' - orig_barcodes = barcode_string.split('-') + barcode_string = "" + orig_barcodes = barcode_string.split("-") while len(orig_barcodes) < len(mask): - orig_barcodes.append(u'') - barcodes = [ orig_barcodes[i][:l] for (i, l) in enumerate(mask) ] + orig_barcodes.append("") + barcodes = [orig_barcodes[i][:l] for (i, l) in enumerate(mask)] return barcodes + def create_lane_set(libraries, mask, ignore_failed_lanes, allow_collision=False): lanes = {} for library in libraries: - lane = library['lane'] + lane = library["lane"] # don't count failed lanes in barcode checking if ignore_failed_lanes and library["failed"]: continue - barcodes = tuple(apply_mask(mask, library['barcode_index'])) + 
barcodes = tuple(apply_mask(mask, library["barcode_index"])) if lane not in lanes: lanes[lane] = set() if barcodes in lanes[lane]: if not allow_collision: - sys.stderr.write("Collision on lane %d, barcode %s\n" % ( lane, ','.join(barcodes))) + sys.stderr.write( + "Collision on lane %d, barcode %s\n" % (lane, ",".join(barcodes)) + ) sys.exit(1) lanes[lane].add(barcodes) return lanes + # NB: This assumes that all index reads will start with an i, and be followed by one or more digits # e.g: i6n will work, but iiiiiin, i2n3i2, or ni6 will not. def parse_bases_mask(mask_string): - mask = map(int, re.findall( r"""(?: i ( \d* ) )""", mask_string, re.X | re.I)) + mask = map(int, re.findall(r"""(?: i ( \d* ) )""", mask_string, re.X | re.I)) return mask -def main(args = sys.argv): +def main(args=sys.argv): parser = parser_setup() poptions = parser.parse_args() @@ -119,7 +148,7 @@ def main(args = sys.argv): data = json.load(process_json) process_json.close() - mask_string = data['alignment_group']['bases_mask'] + mask_string = data["alignment_group"]["bases_mask"] mask = list(parse_bases_mask(mask_string)) # If the flowcell has no index, exit. @@ -128,9 +157,11 @@ def main(args = sys.argv): print("1") sys.exit(0) - lanes = create_lane_set(data['libraries'], mask, poptions.ignore_failed_lanes, poptions.allow_collisions) + lanes = create_lane_set( + data["libraries"], mask, poptions.ignore_failed_lanes, poptions.allow_collisions + ) - mismatch_level = get_max_mismatch_level( lanes, len(mask) ) + mismatch_level = get_max_mismatch_level(lanes, len(mask)) if not mismatch_level and not poptions.allow_collisions: sys.stderr.write("No allowable mismatch levels found, barcode collision?\n") @@ -138,5 +169,6 @@ def main(args = sys.argv): print(",".join(map(str, mismatch_level))) + if __name__ == "__main__": main() diff --git a/scripts/flowcells/test_barcode_masks.py b/scripts/flowcells/test_barcode_masks.py index e79d9fb7..65da9e52 100644 --- a/scripts/flowcells/test_barcode_masks.py +++ b/scripts/flowcells/test_barcode_masks.py @@ -6,37 +6,43 @@ from barcode_masks import get_barcode_masks -@pytest.mark.parametrize("_name,read_len,index_len,lib_lengths,expected", [ - ("basic-paired", 75, 8, [(8, 8)], ["y75,i8,i8,y75"]), - ("toolong-paired", 75, 10, [(8, 8)], ["y75,i8n2,i8n2,y75"]), - ("tooshort-paired", 75, 8, [(10, 10)], ["y75,i8,i8,y75"]), - ("mixed-paired", 75, 8, [(8, 8), (8, 0)], ["y75,i8,i8,y75", "y75,i8,n8,y75"]), -]) -def test_expected_index_masks( - _name, read_len, index_len, lib_lengths, expected -): - """ Run some table-driven tests to make sure we get the right output """ +@pytest.mark.parametrize( + "_name,read_len,index_len,lib_lengths,expected", + [ + ("basic-paired", 75, 8, [(8, 8)], ["y75,i8,i8,y75"]), + ("toolong-paired", 75, 10, [(8, 8)], ["y75,i8n2,i8n2,y75"]), + ("tooshort-paired", 75, 8, [(10, 10)], ["y75,i8,i8,y75"]), + ("mixed-paired", 75, 8, [(8, 8), (8, 0)], ["y75,i8,i8,y75", "y75,i8,n8,y75"]), + ], +) +def test_expected_index_masks(_name, read_len, index_len, lib_lengths, expected): + """Run some table-driven tests to make sure we get the right output""" data = make_processing_json(read_len, index_len, lib_lengths) actual = get_barcode_masks(data) assert set(actual) == set(expected) def gen_barcode(length: int) -> str: - """ Generates a random string of letters of length 'length' """ - return "".join( - [random.choice(['A', 'C', 'T', 'G']) for _ in range(length)] - ) + """Generates a random string of letters of length 'length'""" + return "".join([random.choice(["A", "C", "T", 
"G"]) for _ in range(length)]) -def make_processing_json(read_len: int, - index_len: int, - lib_index_lengths: List[Tuple[int, int]], - ) -> dict: - """ Creates a minimal "processing" data structure """ +def make_processing_json( + read_len: int, + index_len: int, + lib_index_lengths: List[Tuple[int, int]], +) -> dict: + """Creates a minimal "processing" data structure""" return { - "flowcell": {"read_length": read_len, "index_length": index_len, }, - "libraries": [{ - "barcode1": {"sequence": gen_barcode(bc1)}, - "barcode2": {"sequence": gen_barcode(bc2)}, - } for (bc1, bc2) in lib_index_lengths] + "flowcell": { + "read_length": read_len, + "index_length": index_len, + }, + "libraries": [ + { + "barcode1": {"sequence": gen_barcode(bc1)}, + "barcode2": {"sequence": gen_barcode(bc2)}, + } + for (bc1, bc2) in lib_index_lengths + ], } diff --git a/scripts/helpers/expand_multiple_alignments.py b/scripts/helpers/expand_multiple_alignments.py index 7715eb27..53170e27 100755 --- a/scripts/helpers/expand_multiple_alignments.py +++ b/scripts/helpers/expand_multiple_alignments.py @@ -3,6 +3,7 @@ import sys import copy + def process_line(line): cols = line.strip().split("\t") contains_duplicates = False @@ -13,24 +14,24 @@ def process_line(line): break sys.stdout.write(line) if contains_duplicates: - aligns = col.split(":")[2].split(';') + aligns = col.split(":")[2].split(";") i = 1 for align in aligns: if align: i += 1 new_line = copy.copy(cols) fields = align.split(",") - #for field in fields: - #print("field", field) - new_line[0] = cols[0] + "_" + str(i) # read name - new_line[1] = int(new_line[1]) | 2 & (~512) # flag - new_line[2] = fields[0] # chrom - new_line[3] = abs(int(fields[1])) # pos - new_line[4] = "30" # quality - new_line[5] = fields[2] # cigar - new_line[6] = cols[6] # pair chrom - new_line[7] = '???' # pair pos - new_line[8] = cols[8] # template length + # for field in fields: + # print("field", field) + new_line[0] = cols[0] + "_" + str(i) # read name + new_line[1] = int(new_line[1]) | 2 & (~512) # flag + new_line[2] = fields[0] # chrom + new_line[3] = abs(int(fields[1])) # pos + new_line[4] = "30" # quality + new_line[5] = fields[2] # cigar + new_line[6] = cols[6] # pair chrom + new_line[7] = "???" 
# pair pos + new_line[8] = cols[8] # template length # Reverse strand if "-" in fields[1]: @@ -41,6 +42,7 @@ def process_line(line): new_line = [str(n) for n in new_line] sys.stdout.write("\t".join(new_line) + "\n") + def get_secondary_aligns(read): contains_duplicates = False cols = read.split("\t") @@ -50,20 +52,25 @@ def get_secondary_aligns(read): break if not contains_duplicates: return [] - aligns = col.split(":")[2].split(';')[:-1] - #print(aligns) + aligns = col.split(":")[2].split(";")[:-1] + # print(aligns) return sorted( - [Alignment(*a.split(',')[:-1]) for a in aligns], + [Alignment(*a.split(",")[:-1]) for a in aligns], ) + from collections import namedtuple -Alignment = namedtuple('Alignment', 'chr pos cigar') + +Alignment = namedtuple("Alignment", "chr pos cigar") + + # Returns an array of tuples -# Each tuple +# Each tuple def get_aligns(read): cols = read.split("\t") return [Alignment(cols[2], cols[3], cols[5])] + get_secondary_aligns(read) + def process_pair(r1, r2): s1 = get_secondary_aligns(r1) s2 = get_secondary_aligns(r2) @@ -93,12 +100,7 @@ def valid_pair(i, j): return False return True - pairs = [ - (i, j) - for i in a1 - for j in a2 - if valid_pair(i, j) - ] + pairs = [(i, j) for i in a1 for j in a2 if valid_pair(i, j)] i = 0 for p in pairs: @@ -117,7 +119,7 @@ def valid_pair(i, j): c1[6] = mate.chr c1[7] = abs(int(mate.pos)) - c1[8] = 76 #hack + c1[8] = 76 # hack c2[0] += "_" + str(i) c2[1] = int(c2[1]) | 2 & (~512) @@ -128,7 +130,7 @@ def valid_pair(i, j): c2[6] = read.chr c2[7] = abs(int(read.pos)) - c2[8] = 76 #hack + c2[8] = 76 # hack # Correct strandedness if int(read.pos) > 0: @@ -146,42 +148,37 @@ def valid_pair(i, j): sys.stdout.write("\t".join(c2)) return -#c2[0] += "_" + str(i) - -# new_line[0] = cols[0] + "_" + str(i) # read name -# new_line[1] = int(new_line[1]) | 2 & (~512) # flag -# new_line[2] = fields[0] # chrom -# new_line[3] = abs(int(fields[1])) # pos -# new_line[4] = "30" # quality -# new_line[5] = fields[2] # cigar -# new_line[6] = cols[6] # pair chrom -# new_line[7] = '???' # pair pos -# new_line[8] = cols[8] # template length - - + # c2[0] += "_" + str(i) + + # new_line[0] = cols[0] + "_" + str(i) # read name + # new_line[1] = int(new_line[1]) | 2 & (~512) # flag + # new_line[2] = fields[0] # chrom + # new_line[3] = abs(int(fields[1])) # pos + # new_line[4] = "30" # quality + # new_line[5] = fields[2] # cigar + # new_line[6] = cols[6] # pair chrom + # new_line[7] = '???' # pair pos + # new_line[8] = cols[8] # template length if not s1 and not s2: sys.stdout.write(r1) sys.stdout.write(r2) return - if s1 and s2: - #print("r1", r1) - #print("r2", r2) - #print("s1", s1) - #print("s2", s2) + # print("r1", r1) + # print("r2", r2) + # print("s1", s1) + # print("s2", s2) combinations = [] for i in s1: for j in s2: pass - return - #print("r1", r1) - #print("r2", r2) - + # print("r1", r1) + # print("r2", r2) def main(args=[]): @@ -196,5 +193,6 @@ def main(args=[]): except StopIteration: break + if __name__ == "__main__": main(sys.argv) diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py index 489fc2e9..24784f79 100644 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -1,4 +1,4 @@ -""" This script is deprecated! 
""" +"""This script is deprecated!""" import json import os @@ -6,6 +6,7 @@ import argparse import logging import requests + try: from concurrent.futures import ThreadPoolExecutor except ImportError: @@ -15,7 +16,7 @@ logging.warn("This script is deprecated - consider using apilaneprocess.py instead!") -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") script_options = { "quiet": False, @@ -34,55 +35,90 @@ "tag_slug": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") - parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") - parser.add_argument("--lane", dest="lane_ids", type=int, action="append", - help="Lane ID") - - parser.add_argument("--flowcell_label", dest="flowcell_label", help="Flowcell Label") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this alignment to this file.", + ) + parser.add_argument( + "-b", + "--sample-script-basename", + dest="sample_script_basename", + help="Name of the script that goes after the sample name.", + ) + parser.add_argument( + "--lane", dest="lane_ids", type=int, action="append", help="Lane ID" + ) + + parser.add_argument( + "--flowcell_label", dest="flowcell_label", help="Flowcell Label" + ) parser.add_argument("--tag", dest="tag", help="Lanes tagged by") - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.") - parser.add_argument("--queue", dest="queue", - help="SLURM partition for jobs.") - - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="Don't use any barcode mask.") - parser.add_argument("--bases_mask", dest="bases_mask", - help="Set a bases mask.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.", + ) + parser.add_argument("--queue", dest="queue", help="SLURM partition for jobs.") + + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + parser.add_argument( + "--no-mask", + dest="no_mask", + action="store_true", + help="Don't use any barcode mask.", + ) + parser.add_argument("--bases_mask", dest="bases_mask", help="Set a bases mask.") + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser class ProcessSetUp(object): - - def __init__(self, args, api_url, token): - + def __init__(self, args, api_url, token): self.token = token self.api_url = api_url self.qsub_scriptname = args.sample_script_basename @@ -93,14 +129,13 @@ def __init__(self, args, api_url, token): self.dry_run = args.dry_run self.no_mask = args.no_mask self.session = requests.Session() - self.session.headers.update({'Authorization': "Token %s" % self.token}) + self.session.headers.update({"Authorization": "Token %s" % self.token}) self.pool = ThreadPoolExecutor(max_workers=10) def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = self.session.get(url) @@ -113,7 +148,6 @@ def api_single_result(self, url_addition=None, url=None): return None def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -121,7 +155,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = self.session.get(url) @@ -139,8 +172,9 @@ def api_list_result(self, url_addition=None, url=None): return results def get_lane_process_info(self, lane_id): - - info = self.session.get("%s/flowcell_lane/%d/processing_information" % (self.api_url, lane_id)) + info = self.session.get( + "%s/flowcell_lane/%d/processing_information" % (self.api_url, lane_id) + ) if info.ok: logging.debug(info.json()) @@ -151,29 +185,36 @@ def get_lane_process_info(self, lane_id): sys.exit(1) def get_process_template(self, process_template_id): - if not process_template_id: - logging.critical("No process template for alignment %d\n" % self.alignment_id) + logging.critical( + "No process template for alignment %d\n" % self.alignment_id + ) sys.exit(1) - info = self.session.get("%s/process_template/%d" % (self.api_url, process_template_id)) + info = self.session.get( + "%s/process_template/%d" % (self.api_url, process_template_id) + ) if info.ok: logging.debug(info.json()) return info.json() else: - logging.error("Could not find processing template for ID %d\n" % process_template_id) + logging.error( + "Could not find processing template for ID %d\n" % process_template_id + ) sys.exit(1) def setup_flowcell(self, flowcell_label): - - lanes = self.api_list_result("flowcell_lane?flowcell__label=%s" % flowcell_label) + lanes = self.api_list_result( + "flowcell_lane?flowcell__label=%s" % flowcell_label + ) self.setup_lanes([lane["id"] for lane in lanes]) def setup_tag(self, tag_slug): - - lane_tags = self.api_list_result("tagged_object?content_type=40&tag__slug=%s" % tag_slug) + lane_tags = self.api_list_result( + "tagged_object?content_type=40&tag__slug=%s" % tag_slug + ) self.setup_lanes([lane_tag["object_id"] for lane_tag in lane_tags]) @@ -181,31 +222,35 @@ def setup_lanes(self, lane_ids): self.pool.map(self.setup_lane, lane_ids) def 
setup_lane(self, lane_id): - processing_info = self.get_lane_process_info(lane_id) self.create_script(processing_info) def add_script(self, script_file, lane_id, flowcell_label, sample_name): - if not self.outfile: logging.debug("Writing script to stdout") outfile = sys.stdout else: logging.debug("Logging script to %s" % self.outfile) - outfile = open(self.outfile, 'a') + outfile = open(self.outfile, "a") outfile.write("cd %s && " % os.path.dirname(script_file)) - fullname = "%s%s-%s-Lane#%d" % (self.qsub_prefix,sample_name,flowcell_label,lane_id) - outfile.write("sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=8000 --parsable --oversubscribe <<__LANEPROC__\n#!/bin/bash\nbash %s\n__LANEPROC__\n\n" % (fullname, fullname, fullname, self.queue, script_file)) + fullname = "%s%s-%s-Lane#%d" % ( + self.qsub_prefix, + sample_name, + flowcell_label, + lane_id, + ) + outfile.write( + "sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=8000 --parsable --oversubscribe <<__LANEPROC__\n#!/bin/bash\nbash %s\n__LANEPROC__\n\n" + % (fullname, fullname, fullname, self.queue, script_file) + ) outfile.close() def get_script_template(self): - - return open(self.script_template, 'r').read() + return open(self.script_template, "r").read() def create_script(self, processing_info): - lane = processing_info["libraries"][0] if not "directory" in lane: @@ -214,62 +259,83 @@ def create_script(self, processing_info): fastq_directory = lane["directory"] - barcode = "NoIndex" if lane['barcode_index'] is None else lane['barcode_index'] + barcode = "NoIndex" if lane["barcode_index"] is None else lane["barcode_index"] try: # Preferred name - spreadsheet_name = lane['alignments'][0]['sample_name'] + spreadsheet_name = lane["alignments"][0]["sample_name"] logging.warning("Spreadsheet name: %s", spreadsheet_name) except (KeyError, IndexError): # Fallback method, doesn't always have the same barcode string - spreadsheet_name = "%s_%s_L00%d" % (lane['samplesheet_name'], barcode, lane['lane']) - logging.warning("No alignment sample_name for lane, using %s instead" % spreadsheet_name) + spreadsheet_name = "%s_%s_L00%d" % ( + lane["samplesheet_name"], + barcode, + lane["lane"], + ) + logging.warning( + "No alignment sample_name for lane, using %s instead" % spreadsheet_name + ) if not os.path.exists(fastq_directory): - logging.critical("fastq directory %s does not exist, cannot continue" % fastq_directory) + logging.critical( + "fastq directory %s does not exist, cannot continue" % fastq_directory + ) return False - script_file = os.path.join( fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname) ) + script_file = os.path.join( + fastq_directory, "%s-%s" % (spreadsheet_name, self.qsub_scriptname) + ) if self.dry_run: logging.info("Dry run, would have created: %s" % script_file) return True try: - outfile = open(script_file, 'w') + outfile = open(script_file, "w") except FileNotFoundError: logging.critical("Could not create script file %s" % script_file) return False - self.add_script(script_file, lane["id"], processing_info["flowcell"]["label"], spreadsheet_name) + self.add_script( + script_file, + lane["id"], + processing_info["flowcell"]["label"], + spreadsheet_name, + ) outfile.write("set -e -o pipefail\n") outfile.write("export SAMPLE_NAME=%s\n" % spreadsheet_name) - outfile.write("export ASSAY=%s\n" % lane['assay']) - outfile.write("export READLENGTH=%s\n" % processing_info['flowcell']['read_length']) 
- if processing_info['flowcell']['paired_end']: + outfile.write("export ASSAY=%s\n" % lane["assay"]) + outfile.write( + "export READLENGTH=%s\n" % processing_info["flowcell"]["read_length"] + ) + if processing_info["flowcell"]["paired_end"]: outfile.write("export PAIRED=True\n") else: outfile.write("unset PAIRED\n") # Process with UMI if the barcode has one and this is a dual index # flowcell - if lane['barcode1'] and lane['barcode1']['umi'] and processing_info['flowcell']['dual_index']: + if ( + lane["barcode1"] + and lane["barcode1"]["umi"] + and processing_info["flowcell"]["dual_index"] + ): outfile.write("export UMI=True\n") else: outfile.write("unset UMI\n") - outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane['id']) + outfile.write("export FLOWCELL_LANE_ID=%s\n" % lane["id"]) outfile.write("export FASTQ_DIR=%s\n" % fastq_directory) - outfile.write("export FLOWCELL=%s\n" % processing_info['flowcell']['label']) + outfile.write("export FLOWCELL=%s\n" % processing_info["flowcell"]["label"]) outfile.write("\n") outfile.write(self.get_script_template()) outfile.close() -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -310,6 +376,7 @@ def main(args = sys.argv): if poptions.tag: process.setup_tag(poptions.tag) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/lims/aggregation/get_files.py b/scripts/lims/aggregation/get_files.py index 526433e0..0b08ea88 100644 --- a/scripts/lims/aggregation/get_files.py +++ b/scripts/lims/aggregation/get_files.py @@ -16,58 +16,70 @@ base_api_url = None log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -log = logging.getLogger('upload_data.py') +log = logging.getLogger("upload_data.py") script_options = { "base_api_url": None, "basedir": os.getcwd(), "quiet": False, "debug": False, - "aggregation_id": None, "library_number": None, "sample_number": None, "sublibrary": None, - "file_purpose": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." 
+ ) parser.add_argument("--aggregation_id", dest="aggregation_id", type=int) parser.add_argument("-l", "--library_number", dest="library_number") parser.add_argument("-p", "--file_purpose", dest="file_purpose") - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -class FileFetch(object): +class FileFetch(object): def __init__(self, api_url, token): - self.api_url = api_url - self.token = token - self.headers = {'Authorization': "Token %s" % token} + self.api_url = api_url + self.token = token + self.headers = {"Authorization": "Token %s" % token} def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = requests.get(url, headers=self.headers) @@ -80,7 +92,6 @@ def api_single_result(self, url_addition=None, url=None): return None def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -88,7 +99,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = requests.get(url, headers=self.headers) @@ -113,16 +123,16 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): if url_addition: url = "%s/%s" % (self.api_url, url_addition) - fetch_results = requests.get(url, headers = self.headers) + fetch_results = requests.get(url, headers=self.headers) if fetch_results.ok: results = fetch_results.json() - if results['count'] > 1: + if results["count"] > 1: log.error("More than one matching item for fetch query: %s" % url) - elif results['count'] == 0: + elif results["count"] == 0: log.debug("No matching items for fetch query: %s" % url) else: - result = results['results'][0] + result = results["results"][0] log.debug("Single result fetched from %s: %s" % (url, str(result))) if field: return result[field] @@ -133,14 +143,12 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): return None def get_file_purpose(self, slug): - - filepurpose_url = 'file_purpose/?slug=%s' % (slug) + filepurpose_url = "file_purpose/?slug=%s" % (slug) return self.api_single_list_result(filepurpose_url) def get_file_type(self, slug): - - filetype_url = 'file_type/?slug=%s' % (slug) + filetype_url = "file_type/?slug=%s" % (slug) return self.api_single_list_result(filetype_url) @@ -153,22 +161,35 @@ def retrieve_file(self, aggregation_id, file_purpose): logging.debug(aggregation) - files = self.api_list_result("file/?object_id=%d&object_content_type=%d&purpose__slug=%s" % (aggregation["id"], aggregation["object_content_type"], file_purpose["slug"])) + files = self.api_list_result( + "file/?object_id=%d&object_content_type=%d&purpose__slug=%s" + % ( + aggregation["id"], + aggregation["object_content_type"], + file_purpose["slug"], + ) + ) if len(files) > 1: - logging.critical("%d %s files found for aggregation %d" % (len(files), file_purpose["slug"], aggregation_id)) + logging.critical( + "%d %s files found for aggregation %d" + % (len(files), file_purpose["slug"], aggregation_id) + ) sys.exit(1) if not files: - logging.critical("%d %s files found for aggregation %d" % (len(files), file_purpose["slug"], aggregation_id)) + logging.critical( + "%d %s files found for aggregation %d" + % (len(files), file_purpose["slug"], aggregation_id) + ) sys.exit(1) 
print(files[0]["path"]) def find_single_aggregation(self, aggregations): - for aggregation in aggregations: - if aggregation['default_aggregation']: return aggregation + if aggregation["default_aggregation"]: + return aggregation return None @@ -176,8 +197,8 @@ def retrieve_library_file(self, library_number, file_purpose): library = self.api_single_list_result("library/?number=%d" % library_number) if not library: - logging.critical("Could not find library %d" % library_number) - sys.exit(1) + logging.critical("Could not find library %d" % library_number) + sys.exit(1) logging.debug(library) @@ -186,13 +207,24 @@ def retrieve_library_file(self, library_number, file_purpose): if len(aggregations) > 1: use_aggregation = self.find_single_aggregation(aggregations) if not use_aggregation: - logging.critical("More than one aggregation for library %d and no default found, must specify aggregation id" % (library_number)) - logging.critical("Options: " + ", ".join([aggregation["id"] for aggregation in aggregations])) + logging.critical( + "More than one aggregation for library %d and no default found, must specify aggregation id" + % (library_number) + ) + logging.critical( + "Options: " + + ", ".join([aggregation["id"] for aggregation in aggregations]) + ) return else: - logging.warn("More than one aggregation for library %d, using default" % (library_number)) + logging.warn( + "More than one aggregation for library %d, using default" + % (library_number) + ) elif len(aggregations) == 0: - logging.critical("Cannot find aggregations for library %d" % (library_number)) + logging.critical( + "Cannot find aggregations for library %d" % (library_number) + ) return elif len(aggregations) == 1: use_aggregation = aggregations[0] @@ -200,21 +232,21 @@ def retrieve_library_file(self, library_number, file_purpose): self.retrieve_file(use_aggregation["id"], file_purpose) def retrieve(self, aggregation_id, library_number, file_purpose_slug): + file_purpose = self.get_file_purpose(file_purpose_slug) - file_purpose = self.get_file_purpose(file_purpose_slug) + if not file_purpose: + logging.critical("Cannot find file purpose %s" % file_purpose_slug) + sys.exit(1) - if not file_purpose: - logging.critical("Cannot find file purpose %s" % file_purpose_slug) - sys.exit(1) + if aggregation_id: + self.retrieve_file(aggregation_id, file_purpose) + if library_number: + self.retrieve_library_file(library_number, file_purpose) - if aggregation_id: - self.retrieve_file(aggregation_id, file_purpose) - if library_number: - self.retrieve_library_file(library_number, file_purpose) -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -252,7 +284,9 @@ def main(args = sys.argv): try: library_number = int(poptions.library_number.strip(string.ascii_letters)) except ValueError: - logging.critical("Could not get library number from %s" % poptions.library_number) + logging.critical( + "Could not get library number from %s" % poptions.library_number + ) sys.exit() else: library_number = None @@ -261,6 +295,7 @@ def main(args = sys.argv): fetch.retrieve(poptions.aggregation_id, library_number, poptions.file_purpose) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git 
a/scripts/lims/alignment/get_files.py b/scripts/lims/alignment/get_files.py index 01e4b6c8..46699089 100644 --- a/scripts/lims/alignment/get_files.py +++ b/scripts/lims/alignment/get_files.py @@ -16,14 +16,13 @@ base_api_url = None log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -log = logging.getLogger('upload_data.py') +log = logging.getLogger("upload_data.py") script_options = { "base_api_url": None, "basedir": os.getcwd(), "quiet": False, "debug": False, - "alignment_id": None, "lane_id": None, "library": None, @@ -32,24 +31,38 @@ "barcode": None, "flowcell": None, "lane": None, - "file_purpose": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) parser.add_argument("--aggregation_id", dest="aggregation_id", type=int) parser.add_argument("--lane_id", dest="lane_id", type=int) @@ -61,22 +74,21 @@ def parser_setup(): parser.add_argument("-p", "--file_purpose", dest="file_purpose") - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -class FileFetch(object): +class FileFetch(object): def __init__(self, api_url, token): - self.api_url = api_url - self.token = token - self.headers = {'Authorization': "Token %s" % token} + self.api_url = api_url + self.token = token + self.headers = {"Authorization": "Token %s" % token} def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = requests.get(url, headers=self.headers) @@ -89,7 +101,6 @@ def api_single_result(self, url_addition=None, url=None): return None def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -97,7 +108,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = requests.get(url, headers=self.headers) @@ -122,16 +132,16 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): if url_addition: url = "%s/%s" % (self.api_url, url_addition) - fetch_results = requests.get(url, headers = self.headers) + fetch_results = requests.get(url, headers=self.headers) if fetch_results.ok: results = fetch_results.json() - if results['count'] > 1: + if results["count"] > 1: log.error("More than one matching item for fetch query: %s" % url) - elif results['count'] == 0: + elif 
results["count"] == 0: log.debug("No matching items for fetch query: %s" % url) else: - result = results['results'][0] + result = results["results"][0] log.debug("Single result fetched from %s: %s" % (url, str(result))) if field: return result[field] @@ -142,14 +152,12 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): return None def get_file_purpose(self, slug): - - filepurpose_url = 'file_purpose/?slug=%s' % (slug) + filepurpose_url = "file_purpose/?slug=%s" % (slug) return self.api_single_list_result(filepurpose_url) def get_file_type(self, slug): - - filetype_url = 'file_type/?slug=%s' % (slug) + filetype_url = "file_type/?slug=%s" % (slug) return self.api_single_list_result(filetype_url) @@ -161,7 +169,10 @@ def retrieve_file(self, alignment_id, file_purpose): sys.exit(1) logging.debug(alignment) - generic_relation_query = "object_id=%d&object_content_type=%d&purpose__slug=%s" % (alignment["id"], alignment["object_content_type"], file_purpose["slug"]) + generic_relation_query = ( + "object_id=%d&object_content_type=%d&purpose__slug=%s" + % (alignment["id"], alignment["object_content_type"], file_purpose["slug"]) + ) files = self.api_list_result("file/?%s" % generic_relation_query) if len(files) == 1: @@ -173,24 +184,34 @@ def retrieve_file(self, alignment_id, file_purpose): sys.stdout.write(directories[0]["path"] + "\n") if len(files) > 1: - logging.critical("%d %s files found for alignment %d" % (len(files), file_purpose["slug"], alignment_id)) + logging.critical( + "%d %s files found for alignment %d" + % (len(files), file_purpose["slug"], alignment_id) + ) sys.exit(1) if len(directories) > 1: - logging.critical("%d %s directories found for alignment %d" % (len(directories), file_purpose["slug"], alignment_id)) + logging.critical( + "%d %s directories found for alignment %d" + % (len(directories), file_purpose["slug"], alignment_id) + ) if not files and not directories: - logging.critical("No files or directories found for alignment %d" % alignment_id) + logging.critical( + "No files or directories found for alignment %d" % alignment_id + ) sys.exit(1) def find_single_alignment(self, lane): + alignments = self.api_list_result( + "flowcell_lane_alignment/?lane=%d" % lane["id"] + ) - alignments = self.api_list_result("flowcell_lane_alignment/?lane=%d" % lane["id"]) - - if len (alignments) > 1: + if len(alignments) > 1: logging.warn("More than one alignment found, finding default") for alignment in alignments: - if alignment['default_lane_alignment']: return alignment + if alignment["default_lane_alignment"]: + return alignment return None @@ -201,7 +222,9 @@ def find_lanes(self, args): if args.flowcell.startswith("FC"): args.flowcell = args.flowcell[2:] if len(args.flowcell) != 5: - logging.warn("Flowcell label %s is not five characters long" % args.flowcell) + logging.warn( + "Flowcell label %s is not five characters long" % args.flowcell + ) query["flowcell__label"] = args.flowcell if args.lane_id: @@ -235,10 +258,12 @@ def find_lanes(self, args): logging.critical("Could not turn %s into sample number" % args.sample) query["sample__number"] = sample_number - return self.api_list_result("flowcell_lane/?%s" % "&".join(["%s=%s" % (item, value) for item, value in query.items()])) + return self.api_list_result( + "flowcell_lane/?%s" + % "&".join(["%s=%s" % (item, value) for item, value in query.items()]) + ) def retrieve(self, args): - file_purpose = self.get_file_purpose(args.file_purpose) if not file_purpose: @@ -268,9 +293,10 @@ def retrieve(self, args): 
self.retrieve_file(alignment["id"], file_purpose) -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -312,6 +338,7 @@ def main(args = sys.argv): fetch.retrieve(poptions) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index 0d7fd24b..21e8b413 100644 --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -10,6 +10,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" log = logging.getLogger(__name__) + def parse_json(filename): with open(filename) as f: return json.loads(f.read()) @@ -28,12 +29,12 @@ def group_data(processing_info) -> dict: Returns dict of tuple keys, values are a list of library numbers """ output = defaultdict(list) - for lib in processing_info['libraries']: - lib_number = lib['library'] + for lib in processing_info["libraries"]: + lib_number = lib["library"] key = ( - lib['barcode1']['reverse_sequence'], - lib['lane'], - ) + lib["barcode1"]["reverse_sequence"], + lib["lane"], + ) output[key].append(lib_number) return output @@ -41,38 +42,49 @@ def group_data(processing_info) -> dict: def to_tsv(label, data): lines = ["pool_name\tsample_name\tlane\tbarcode_index"] - for datum in sorted(data, key=lambda d: (d['lane'], d['pool_name'], d['sample_name'])): - lines.append("\t".join([ - label + "_" + datum["pool_name"], - datum["sample_name"], - str(datum["lane"]), - datum["barcode_index"], - ] )) + for datum in sorted( + data, key=lambda d: (d["lane"], d["pool_name"], d["sample_name"]) + ): + lines.append( + "\t".join( + [ + label + "_" + datum["pool_name"], + datum["sample_name"], + str(datum["lane"]), + datum["barcode_index"], + ] + ) + ) return "\n".join(lines) + "\n" + def get_config_info(processing_data, ds_number: int): pass + def construct_config_entries(data: dict) -> [dict]: # Maps library number -> (pool_name, barcode1) pool_lookup_table = {} - for (pool, values) in data["library_pools"].items(): + for pool, values in data["library_pools"].items(): value = (pool, values["barcode1"]) - for lib_str in values['libraries']: + for lib_str in values["libraries"]: lib_num = int(lib_str.replace("LN", "")) # Discard the 'LN' prefix if lib_num in pool_lookup_table: - raise ValueError("Libnum in more than one pool, %s and %s" % (pool_lookup_table[lib_num], value)) + raise ValueError( + "Libnum in more than one pool, %s and %s" + % (pool_lookup_table[lib_num], value) + ) pool_lookup_table[lib_num] = pool results = [] - for (library) in data['libraries']: + for library in data["libraries"]: datum = { - "barcode_index" : library["barcode_index"], - "sample_name" : library["samplesheet_name"], - "pool_name" : pool_lookup_table[library["library"]], - "lane" : library["lane"], + "barcode_index": library["barcode_index"], + "sample_name": library["samplesheet_name"], + "pool_name": pool_lookup_table[library["library"]], + "lane": library["lane"], } - results.append(datum) + results.append(datum) return results @@ -83,8 +95,9 @@ def main(): entries = construct_config_entries(poptions.data) tsv = to_tsv(label, entries) - with open(poptions.output, 'w') as f: + with 
open(poptions.output, "w") as f: f.write(tsv) + if __name__ == "__main__": main() diff --git a/scripts/lims/get_processing.py b/scripts/lims/get_processing.py index a0d91797..da616987 100644 --- a/scripts/lims/get_processing.py +++ b/scripts/lims/get_processing.py @@ -24,51 +24,88 @@ "outfile": "processing.json", } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't logging.info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - parser.add_argument("-f", "--flowcell", dest="flowcell", - help="The flowcell we want to get processing info for.") - parser.add_argument("-p", "--project", dest="project", - help="The project to get processing info for.") - parser.add_argument("-g", "--alignment-group", dest="alignment_group", type=int, - help="A specific aligment group to get processing info for.") - parser.add_argument("-e", "--experiment", dest="experiment", - help="The experiment to get processing info for.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="The outfile to save to.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't logging.info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + parser.add_argument( + "-f", + "--flowcell", + dest="flowcell", + help="The flowcell we want to get processing info for.", + ) + parser.add_argument( + "-p", + "--project", + dest="project", + help="The project to get processing info for.", + ) + parser.add_argument( + "-g", + "--alignment-group", + dest="alignment_group", + type=int, + help="A specific aligment group to get processing info for.", + ) + parser.add_argument( + "-e", + "--experiment", + dest="experiment", + help="The experiment to get processing info for.", + ) + + parser.add_argument( + "-o", "--outfile", dest="outfile", help="The outfile to save to." 
+ ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser + def get_processing_info_project(api_url, token, id, outfile): # get all LNs # then get all AGGs # then get all AGG info logging.info("Setting up project #%s" % id) - - info = requests.get("%s/aggregation/file_detail/?library__sample__tissue_culture__project=%s&page_size=1000" % (api_url,id), - headers={'Authorization': "Token %s" % token}) + + info = requests.get( + "%s/aggregation/file_detail/?library__sample__tissue_culture__project=%s&page_size=1000" + % (api_url, id), + headers={"Authorization": "Token %s" % token}, + ) if info.ok: result = info.json() - with open(outfile, 'w') as output: - json.dump(result, output, sort_keys=True, indent=4, separators=(',', ': ')) + with open(outfile, "w") as output: + json.dump(result, output, sort_keys=True, indent=4, separators=(",", ": ")) else: logging.error("info was not found within API") @@ -76,15 +113,17 @@ def get_processing_info_project(api_url, token, id, outfile): def get_processing_info_experiment(api_url, token, id, outfile): - logging.info("Setting up experiment #%s" % id) - info = requests.get("%s/experiment/%s/schema" % (api_url, id), headers={'Authorization': "Token %s" % token}) + info = requests.get( + "%s/experiment/%s/schema" % (api_url, id), + headers={"Authorization": "Token %s" % token}, + ) if info.ok: result = info.json() - with open(outfile, 'w') as output: - json.dump(result, output, indent=4, separators=(',', ': ')) + with open(outfile, "w") as output: + json.dump(result, output, indent=4, separators=(",", ": ")) else: logging.error("info was not found within API") @@ -92,24 +131,27 @@ def get_processing_info_experiment(api_url, token, id, outfile): def get_processing_info_alignment_group(api_url, token, id, outfile): - - info = requests.get("%s/flowcell_lane_alignment_group/%d/processing_information/" % (api_url, id), - headers={'Authorization': "Token %s" % token}) + info = requests.get( + "%s/flowcell_lane_alignment_group/%d/processing_information/" % (api_url, id), + headers={"Authorization": "Token %s" % token}, + ) if info.ok: result = info.json() logging.info("Writing results to %s" % outfile) - with open(outfile, 'w') as output: - json.dump(result, output, sort_keys=True, indent=4, separators=(',', ': ')) + with open(outfile, "w") as output: + json.dump(result, output, sort_keys=True, indent=4, separators=(",", ": ")) else: - logging.error("Could not find processing info for alignment group %s\n" % str(id)) + logging.error( + "Could not find processing info for alignment group %s\n" % str(id) + ) return -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -140,15 +182,19 @@ def main(args = sys.argv): sys.exit(1) if poptions.project: - logging.info("Getting aggregation information for project #%s" % poptions.project) + logging.info( + "Getting aggregation information for project #%s" % poptions.project + ) get_processing_info_project(api_url, token, poptions.project, poptions.outfile) if poptions.flowcell: - logging.info("Getting alignment groups for %s" % poptions.flowcell) - alignment_groups = requests.get("%s/flowcell_lane_alignment_group/?flowcell__label=%s" % (api_url, poptions.flowcell), - headers={'Authorization': "Token %s" % token}) + alignment_groups = requests.get( + 
"%s/flowcell_lane_alignment_group/?flowcell__label=%s" + % (api_url, poptions.flowcell), + headers={"Authorization": "Token %s" % token}, + ) if not alignment_groups.ok: logging.error("Could not get alignment groups for flowcell") @@ -157,21 +203,34 @@ def main(args = sys.argv): results = alignment_groups.json() if results["count"] == 0: - logging.error("Could not find an alignment group for flowcell %s\n" % poptions.flowcell) - sys.exit(1) + logging.error( + "Could not find an alignment group for flowcell %s\n" + % poptions.flowcell + ) + sys.exit(1) if results["count"] > 1: - logging.error("More than one alignment group found: %s" % ", ".join(["%d" % ag["id"] for ag in results['results']])) - sys.exit(1) + logging.error( + "More than one alignment group found: %s" + % ", ".join(["%d" % ag["id"] for ag in results["results"]]) + ) + sys.exit(1) - get_processing_info_alignment_group(api_url, token, results['results'][0]["id"], poptions.outfile) + get_processing_info_alignment_group( + api_url, token, results["results"][0]["id"], poptions.outfile + ) if poptions.alignment_group: - get_processing_info_alignment_group(api_url, token, poptions.alignment_group, poptions.outfile) + get_processing_info_alignment_group( + api_url, token, poptions.alignment_group, poptions.outfile + ) if poptions.experiment: - logging.info("Getting aggregation information for experiment #%s" % poptions.experiment) - get_processing_info_experiment(api_url, token, poptions.experiment, poptions.outfile) - + logging.info( + "Getting aggregation information for experiment #%s" % poptions.experiment + ) + get_processing_info_experiment( + api_url, token, poptions.experiment, poptions.outfile + ) ############ diff --git a/scripts/lims/movetag.py b/scripts/lims/movetag.py index bc25a2a9..035e5045 100644 --- a/scripts/lims/movetag.py +++ b/scripts/lims/movetag.py @@ -23,92 +23,112 @@ "new_tag": None, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - - parser.add_argument("--content_type", dest="content_type", - help="Name of the contenttype.") - parser.add_argument("--object", dest="object_id", type=int, - help="Object ID.") - - parser.add_argument("-a", "--add_tag", dest="new_tag", - help="The new tag slug.") - parser.add_argument("-r", "--remove_tag", dest="old_tag", - help="The old tag slug.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + + parser.add_argument( + "--content_type", dest="content_type", help="Name of the contenttype." 
+ ) + parser.add_argument("--object", dest="object_id", type=int, help="Object ID.") + + parser.add_argument("-a", "--add_tag", dest="new_tag", help="The new tag slug.") + parser.add_argument("-r", "--remove_tag", dest="old_tag", help="The old tag slug.") + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -class TagChange(object): +class TagChange(object): def __init__(self, api_url, token): - self.api_url = api_url - self.token = token - self.headers = {'Authorization': "Token %s" % token} - self.contenttypes = {} + self.api_url = api_url + self.token = token + self.headers = {"Authorization": "Token %s" % token} + self.contenttypes = {} def get_contenttype(self, contenttype): if not contenttype in self.contenttypes: - contenttype_url = '%s/content_type/?model=%s' % (self.api_url, contenttype) - contenttype_results = requests.get(contenttype_url, headers = self.headers).json() - self.contenttypes[contenttype] = contenttype_results['results'][0] + contenttype_url = "%s/content_type/?model=%s" % (self.api_url, contenttype) + contenttype_results = requests.get( + contenttype_url, headers=self.headers + ).json() + self.contenttypes[contenttype] = contenttype_results["results"][0] return self.contenttypes[contenttype] def get_tag(self, slug): - if not slug: return None - exists = requests.get("%s/tag/?slug=%s" % (self.api_url, slug), headers = self.headers) + exists = requests.get( + "%s/tag/?slug=%s" % (self.api_url, slug), headers=self.headers + ) tag = None if exists.ok: results = exists.json() - if results['count'] > 0: - return results['results'][0] + if results["count"] > 0: + return results["results"][0] else: - print "Tag %s not found" % slug + print("Tag %s not found" % slug) return None else: - print "Error finding tag %s through API" % slug + print("Error finding tag %s through API" % slug) return None def change_tag(self, contenttype, object_id, old_tag, new_tag): - contenttype_id = self.get_contenttype(contenttype)["id"] - current = requests.get("%s/tagged_object/?content_type=%d&object_id=%d&tag__slug=%s" % ( - self.api_url, contenttype_id, object_id, old_tag), headers = self.headers).json() + current = requests.get( + "%s/tagged_object/?content_type=%d&object_id=%d&tag__slug=%s" + % (self.api_url, contenttype_id, object_id, old_tag), + headers=self.headers, + ).json() - if current['count'] == 0: + if current["count"] == 0: return False - taggedobject = current['results'][0] + taggedobject = current["results"][0] new_tag = self.get_tag(new_tag) taggedobject["tag"] = new_tag["url"] - result = requests.put(taggedobject['url'], headers = self.headers, data = taggedobject) + result = requests.put( + taggedobject["url"], headers=self.headers, data=taggedobject + ) if not result.ok: return False return True -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -138,12 +158,15 @@ def main(args = sys.argv): sys.exit(1) tagchange = TagChange(api_url, token) - result = tagchange.change_tag(poptions.content_type, poptions.object_id, poptions.old_tag, poptions.new_tag) + result = tagchange.change_tag( + poptions.content_type, poptions.object_id, poptions.old_tag, poptions.new_tag + ) if not result: sys.stderr.write("Tag not changed") sys.exit(1) + # This is the main body of the program that only runs when running this script # doesn't run when imported, 
so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py index 10cc72ef..c1ad3de4 100644 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -3,6 +3,7 @@ import argparse import logging + # Change to logging.DEBUG to see all messages logging.basicConfig(level=logging.WARN) @@ -12,49 +13,66 @@ from stamlims_api.rest import setup_api from stamlims_api.lims import aggregations, metrics -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) parser.add_argument("--aggregation", dest="aggregation_id", type=int) - parser.add_argument("-f", "--counts_file", nargs="+", help="Tab delimited file of count\tvalue") + parser.add_argument( + "-f", "--counts_file", nargs="+", help="Tab delimited file of count\tvalue" + ) parser.add_argument("--spot", dest="spot_file") - parser.set_defaults( quiet=False, debug=False ) + parser.set_defaults(quiet=False, debug=False) return parser + # aggregation def upload_stats(api, aggregation, stats={}): - data = [{ - "object_id": aggregation, - "content_type": "aggregation", - "stats": stats, - }] + data = [ + { + "object_id": aggregation, + "content_type": "aggregation", + "stats": stats, + } + ] response = api.post_single_result(url_addition="stat/create", json=data) if response is None: raise Exception("Upload failed") - + log.info(response) + def upload_spot(api, aggregation, spot_file): if not os.path.exists(spot_file): log.error("Cannot find spot file %s" % spot_file) return - spot = open(spot_file, 'r').read().strip() + spot = open(spot_file, "r").read().strip() try: spot = Decimal(spot) - upload_stat(api, aggregation, 'hotspot2-SPOT', spot) + upload_stat(api, aggregation, "hotspot2-SPOT", spot) except ValueError: log.error("Could not turn %s into decimal" % spot) + def upload_file(api, aggregation, counts_file): - count_content = open(counts_file, 'r') + count_content = open(counts_file, "r") log.info("uploading {}".format(counts_file)) stats = {} @@ -71,22 +89,25 @@ def upload_file(api, aggregation, counts_file): try: float(value) except ValueError: - log.warn("skipping stat-type '{}' with non-numeric value '{}'".format(stat_type_name, value)) + log.warn( + "skipping stat-type '{}' with non-numeric value '{}'".format( + stat_type_name, value + ) + ) continue - if not stat_type_name: log.warn("skipping {}".format(stat_type_name)) continue stats[stat_type_name] = value - log.debug( "{} : {}".format(stat_type_name, value)) + log.debug("{} : {}".format(stat_type_name, value)) count_content.close() upload_stats(api, aggregation, stats) -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -112,6 +133,7 @@ def main(args = sys.argv): for count_file in 
poptions.counts_file: upload_file(api, poptions.aggregation_id, count_file) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 4f60af32..389acac7 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -1,4 +1,4 @@ -#pylint disable=invalid-whitespace, invalid-name +# pylint disable=invalid-whitespace, invalid-name import argparse import datetime @@ -12,10 +12,8 @@ from zipfile import ZipFile sys.path.insert( - 1, os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "stamlims_api" -)) + 1, os.path.join(os.path.dirname(os.path.abspath(__file__)), "stamlims_api") +) from stamlims_api.lims import aggregations, content_types from stamlims_api import rest @@ -25,161 +23,259 @@ flowcell_contenttype = None log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -log = logging.getLogger('upload_data.py') +log = logging.getLogger("upload_data.py") script_options = { "base_api_url": None, "basedir": os.getcwd(), "quiet": False, "debug": False, - "aggregation_id": None, "start_aggregation": False, "complete_aggregation": False, "clear_aggregation_stats": False, - "alignment_id": None, "flowcell": None, "flowcell_lane_id": None, - "fastqc_counts": False, "fastqc_files": [], - "spot_file": None, "spot_dup_file": None, "dups_file": None, "counts_file": None, "rna_file": None, "barcode_report_file": None, - "version_file": None, "adapter_file": None, - "align_start_time": False, "align_complete_time": False, - "attach_file": None, "attach_directory": None, "attach_file_contenttype": None, "attach_file_objectid": None, "attach_file_purpose": None, "attach_file_type": None, - "clear_align_stats": False, - "skip_md5_check": False, } -def parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - parser.add_argument("-f", "--flowcell", dest="flowcell", - help="The flowcell we're working on. Enter it to clear cache after uploading.") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + parser.add_argument( + "-f", + "--flowcell", + dest="flowcell", + help="The flowcell we're working on. 
Enter it to clear cache after uploading.", + ) parser.add_argument("--aggregation_id", dest="aggregation_id", type=int) - parser.add_argument("--clear_aggregation_stats", dest="clear_aggregation_stats", action="store_true", - help="Clear the statistics/files for a given aggregation.") - parser.add_argument("--start_aggregation", dest="start_aggregation", action="store_true", - help="Set the current time for the alignment's start time.") - parser.add_argument("--complete_aggregation", dest="complete_aggregation", action="store_true", - help="Set the current time for the alignment's complete time.") - - parser.add_argument("--clear_align_stats", dest="clear_align_stats", action="store_true", - help="Clear the statistics/files for a given alignment.") + parser.add_argument( + "--clear_aggregation_stats", + dest="clear_aggregation_stats", + action="store_true", + help="Clear the statistics/files for a given aggregation.", + ) + parser.add_argument( + "--start_aggregation", + dest="start_aggregation", + action="store_true", + help="Set the current time for the alignment's start time.", + ) + parser.add_argument( + "--complete_aggregation", + dest="complete_aggregation", + action="store_true", + help="Set the current time for the alignment's complete time.", + ) + + parser.add_argument( + "--clear_align_stats", + dest="clear_align_stats", + action="store_true", + help="Clear the statistics/files for a given alignment.", + ) # these should go together parser.add_argument("--alignment_id", dest="alignment_id", type=int) - parser.add_argument("--spotfile", dest="spot_file", - help="The SPOT output file. Best paired with a dupfile. Needs alignment id.") - parser.add_argument("--spotdupfile", dest="spot_dup_file", - help="The Picard dup results file paired with a spotfile. Needs alignment id.") + parser.add_argument( + "--spotfile", + dest="spot_file", + help="The SPOT output file. Best paired with a dupfile. Needs alignment id.", + ) + parser.add_argument( + "--spotdupfile", + dest="spot_dup_file", + help="The Picard dup results file paired with a spotfile. Needs alignment id.", + ) # requires alignment_id - parser.add_argument("--start_alignment_progress", dest="align_start_time", action="store_true", - help="Set the current time for the alignment's start time.") - parser.add_argument("--finish_alignment", dest="align_complete_time", action="store_true", - help="Set the current time for the alignment's complete time.") + parser.add_argument( + "--start_alignment_progress", + dest="align_start_time", + action="store_true", + help="Set the current time for the alignment's start time.", + ) + parser.add_argument( + "--finish_alignment", + dest="align_complete_time", + action="store_true", + help="Set the current time for the alignment's complete time.", + ) # also needs alignment_id - parser.add_argument("--countsfile", dest="counts_file", - help="A tab delineated list of counts. Needs alignnment id.") + parser.add_argument( + "--countsfile", + dest="counts_file", + help="A tab delineated list of counts. 
Needs alignnment id.", + ) # requires alignment_id - parser.add_argument("--version_file", dest="version_file", - help="A version file for alignments.") - parser.add_argument("--adapter_file", dest="adapter_file", - help="An adapter file for alignments.") - - # A lane can have multiple fastQC files, one for each read - parser.add_argument("--flowcell_lane_id", dest="flowcell_lane_id", type=int, - help="The ID of the flowcell lane we're working on.") - parser.add_argument("--fastqcfile", dest="fastqc_files", action="append", - help="A FastQC ZIP file to upload.") - parser.add_argument("--insertsfile", dest="inserts_file", - help="A Picard CollectInsertSizeMetrics text file for an alignment.") - parser.add_argument("--dupsfile", dest="dups_file", - help="A Picard MarkDuplicates text file for an alignment.") - parser.add_argument("--rnafile", dest="rna_file", - help="The RNA metric output file") - parser.add_argument("--fastqc_counts", dest="fastqc_counts", action="store_true", - help="Use the given fastqc files to create total/pf/qc counts. Must have an alignment id.") - parser.add_argument("--barcode_report", dest="barcode_report_file", - help="The barcode report JSON file") - - parser.add_argument("--attach_file", dest="attach_file", - help="The full path to a file to attach to a LIMS object.") - parser.add_argument("--attach_directory", dest="attach_directory", - help="The full path to a directory to attach to a LIMS object.") - parser.add_argument("--attach_file_contenttype", dest="attach_file_contenttype", - help="The content type to attach to, aka SequencingData.flowcelllanealignment") - parser.add_argument("--attach_file_objectid", dest="attach_file_objectid", type=int, - help="The object ID to attach to.") - parser.add_argument("--attach_file_purpose", dest="attach_file_purpose", - help="The file's purpose slug.") - parser.add_argument("--attach_file_type", dest="attach_file_type", - help="The file's type slug.") - - parser.add_argument("--skip_md5_check", dest="skip_md5_check", action="store_true", - help="If file exists and path/size match, don't check md5sum.") - - parser.set_defaults( **script_options ) - parser.set_defaults( quiet=False, debug=False ) + parser.add_argument( + "--version_file", dest="version_file", help="A version file for alignments." + ) + parser.add_argument( + "--adapter_file", dest="adapter_file", help="An adapter file for alignments." + ) + + # A lane can have multiple fastQC files, one for each read + parser.add_argument( + "--flowcell_lane_id", + dest="flowcell_lane_id", + type=int, + help="The ID of the flowcell lane we're working on.", + ) + parser.add_argument( + "--fastqcfile", + dest="fastqc_files", + action="append", + help="A FastQC ZIP file to upload.", + ) + parser.add_argument( + "--insertsfile", + dest="inserts_file", + help="A Picard CollectInsertSizeMetrics text file for an alignment.", + ) + parser.add_argument( + "--dupsfile", + dest="dups_file", + help="A Picard MarkDuplicates text file for an alignment.", + ) + parser.add_argument("--rnafile", dest="rna_file", help="The RNA metric output file") + parser.add_argument( + "--fastqc_counts", + dest="fastqc_counts", + action="store_true", + help="Use the given fastqc files to create total/pf/qc counts. 
Must have an alignment id.", + ) + parser.add_argument( + "--barcode_report", + dest="barcode_report_file", + help="The barcode report JSON file", + ) + + parser.add_argument( + "--attach_file", + dest="attach_file", + help="The full path to a file to attach to a LIMS object.", + ) + parser.add_argument( + "--attach_directory", + dest="attach_directory", + help="The full path to a directory to attach to a LIMS object.", + ) + parser.add_argument( + "--attach_file_contenttype", + dest="attach_file_contenttype", + help="The content type to attach to, aka SequencingData.flowcelllanealignment", + ) + parser.add_argument( + "--attach_file_objectid", + dest="attach_file_objectid", + type=int, + help="The object ID to attach to.", + ) + parser.add_argument( + "--attach_file_purpose", + dest="attach_file_purpose", + help="The file's purpose slug.", + ) + parser.add_argument( + "--attach_file_type", dest="attach_file_type", help="The file's type slug." + ) + + parser.add_argument( + "--skip_md5_check", + dest="skip_md5_check", + action="store_true", + help="If file exists and path/size match, don't check md5sum.", + ) + + parser.set_defaults(**script_options) + parser.set_defaults(quiet=False, debug=False) return parser -def split_sample_name(samplename): - m = re.match(r'(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])', samplename) +def split_sample_name(samplename): + m = re.match( + r"(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])", + samplename, + ) if not m: log.error("Could not parse sample name: %s" % samplename) return None - return { "sample": m.group('sample'), "barcode": m.group('barcode'), "lane": m.group('lane') } + return { + "sample": m.group("sample"), + "barcode": m.group("barcode"), + "lane": m.group("lane"), + } + def get_spot_score(spot_file): - - contents = open(spot_file, 'r').read() + contents = open(spot_file, "r").read() stats = contents.split("\n")[1].split() - return {"total_tags": int(stats[0]), "tags_in_hotspots": int(stats[1]), "spot_score": stats[2]} + return { + "total_tags": int(stats[0]), + "tags_in_hotspots": int(stats[1]), + "spot_score": stats[2], + } + def get_dup_score(spotdup_file): if not spotdup_file: return None - infile = open(spotdup_file, 'r') + infile = open(spotdup_file, "r") try: for line in infile: @@ -195,40 +291,42 @@ def get_dup_score(spotdup_file): return None -def get_fastqc_counts(fastqc_input): - total_m = re.search(r'Total Sequences\t(?P\d+)', fastqc_input) +def get_fastqc_counts(fastqc_input): + total_m = re.search(r"Total Sequences\t(?P\d+)", fastqc_input) if not total_m: log.error("Could not get total sequences from fastqc_input") return None - filtered_m = re.search(r'Filtered Sequences\t(?P\d+)', fastqc_input) + filtered_m = re.search(r"Filtered Sequences\t(?P\d+)", fastqc_input) if not filtered_m: log.error("Could not get filtered sequences from fastqc_input") return None return { - 'total': int(total_m.group('total')), - 'filtered': int(filtered_m.group('filtered')), + "total": int(total_m.group("total")), + "filtered": int(filtered_m.group("filtered")), } + def md5sum_file(path): md5sum = hashlib.md5() - with open(path, 'rb') as f: - for chunk in iter(lambda: f.read(1024*1024), b''): + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): md5sum.update(chunk) return md5sum.hexdigest() + def url_join(*args): - url = "/".join([ x.rstrip('/') for x in args ]) + url = "/".join([x.rstrip("/") for x in args]) return url -class UploadLIMS(object): +class UploadLIMS(object): def __init__(self, api_url, token): 
self.fastqc_tags = None self.count_types = {} @@ -239,9 +337,13 @@ def __init__(self, api_url, token): self.alignment_counts = {} self.picard_metrics = None self.fastqc_counts = {} - self.api = rest.setup_api({rest.LIMS_URL_OPT_VAR: api_url, - rest.LIMS_TOKEN_OPT_VAR: token, - rest.RAISE_ON_ERROR_VAR: True}) + self.api = rest.setup_api( + { + rest.LIMS_URL_OPT_VAR: api_url, + rest.LIMS_TOKEN_OPT_VAR: token, + rest.RAISE_ON_ERROR_VAR: True, + } + ) self.get_cache = {} def get(self, url): @@ -267,7 +369,9 @@ def get_single_result(self, fetch_url, query=None, field=None): """ Using a list API url that should bring up a single item, retrieve that single item if it exists. """ - result = self.api.get_single_list_result(url_addition=fetch_url, query_arguments=query) + result = self.api.get_single_list_result( + url_addition=fetch_url, query_arguments=query + ) if result is None: return None if field is not None: @@ -293,9 +397,9 @@ def patch(self, *args, **kwargs): return self.api.patch_single_result(*args, **kwargs) def get_flowcell_url_by_label(self, label): - return self.get_single_result('flowcell_run/', - field = 'url', - query={"label":label}) + return self.get_single_result( + "flowcell_run/", field="url", query={"label": label} + ) def clear_flowcell_cache(self, flowcell): url = self.get_flowcell_url_by_label(flowcell) @@ -316,7 +420,7 @@ def clear_aggregation_stats(self, aggregation_id): log.debug("Clearing stats: %s" % url) results = self.post(url) if results is None: - log.error("Could not clear aggregation stats for AGG%s" % aggregation_id) + log.error("Could not clear aggregation stats for AGG%s" % aggregation_id) def start_aggregation(self, aggregation_id): url = "aggregation/%d/" % aggregation_id @@ -341,24 +445,24 @@ def complete_aggregation(self, aggregation_id): log.error("Could not complete AGG%s" % aggregation_id) def get_fastqc_tags(self): - if not self.fastqc_tags: - tags = self.get_list_result('fastqc_tag/') + tags = self.get_list_result("fastqc_tag/") if tags is None: log.critical("Could not fetch fastqc tags from LIMS") - self.fastqc_tags = dict([(tag['slug'], tag) for tag in tags]) + self.fastqc_tags = dict([(tag["slug"], tag) for tag in tags]) return self.fastqc_tags def get_picard_metrics(self): - if not self.picard_metrics: - picard_metrics = self.get_list_result('picard_metric/') + picard_metrics = self.get_list_result("picard_metric/") if picard_metrics is None: log.critical("Could not fetch picard metrics from LIMS") - self.picard_metrics = dict([(metric['name'], metric) for metric in picard_metrics]) + self.picard_metrics = dict( + [(metric["name"], metric) for metric in picard_metrics] + ) return self.picard_metrics @@ -370,31 +474,32 @@ def get_contenttype(self, contenttype_name): (appname, modelname) = contenttype_name.split(".") query = { - 'app_label': appname, - 'model': modelname, + "app_label": appname, + "model": modelname, } - ct = self.get_single_result('content_type/', query=query) + ct = self.get_single_result("content_type/", query=query) if not ct: log.critical("Could not fetch content type %s" % contenttype_name) return ct def get_file_purpose_url(self, slug): - return self.get_single_result('file_purpose/', - query={"slug": slug}, - field="url") + return self.get_single_result( + "file_purpose/", query={"slug": slug}, field="url" + ) def get_file_type(self, slug): - return self.get_single_result('file_type/', - field="url", - query={"slug":slug}) + return self.get_single_result("file_type/", field="url", query={"slug": slug}) - - def 
upload_directory_attachment(self, path, contenttype_name, object_id, file_purpose=None): + def upload_directory_attachment( + self, path, contenttype_name, object_id, file_purpose=None + ): path = os.path.abspath(path) if not (contenttype_name and object_id): - log.error("Cannot attach file %s without both content type and object_id" % path) + log.error( + "Cannot attach file %s without both content type and object_id" % path + ) return False contenttype = self.get_contenttype(contenttype_name) @@ -406,28 +511,33 @@ def upload_directory_attachment(self, path, contenttype_name, object_id, file_pu purpose = self.get_file_purpose_url(file_purpose) if file_purpose and not purpose: - log.error("Could not find file purpose %s for uploading directory %s" % (file_purpose, path)) + log.error( + "Could not find file purpose %s for uploading directory %s" + % (file_purpose, path) + ) return False elif purpose: log.debug("File purpose: %s" % purpose) - exists = self.get_single_result('directory/', query={"path":path}) + exists = self.get_single_result("directory/", query={"path": path}) if exists: data = exists else: data = {} - data.update({ - 'path': path, - 'content_type': contenttype['url'], - 'object_id': object_id, - 'purpose': purpose - }) + data.update( + { + "path": path, + "content_type": contenttype["url"], + "object_id": object_id, + "purpose": purpose, + } + ) if exists: log.info("Updating information for directory %s" % path) - result = self.put(url=data['url'], data=data) + result = self.put(url=data["url"], data=data) else: log.info("Uploading information for directory %s" % path) result = self.post("directory/", data=data) @@ -440,13 +550,26 @@ def upload_directory_attachment(self, path, contenttype_name, object_id, file_pu return True - def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose=None, file_type=None, skip_md5_check=False): + def upload_file_attachment( + self, + path, + contenttype_name, + object_id, + file_purpose=None, + file_type=None, + skip_md5_check=False, + ): path = os.path.abspath(path) - log.info("Attaching file %s to object %d (contenttype %s)" % (path, object_id, contenttype_name)) + log.info( + "Attaching file %s to object %d (contenttype %s)" + % (path, object_id, contenttype_name) + ) if not (contenttype_name and object_id): - log.error("Cannot attach file %s without both content type and object_id" % path) + log.error( + "Cannot attach file %s without both content type and object_id" % path + ) return False contenttype = self.get_contenttype(contenttype_name) @@ -458,7 +581,10 @@ def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose purpose = self.get_file_purpose_url(file_purpose) if file_purpose and not purpose: - log.error("Could not find file purpose %s for uploading file %s" % (file_purpose, path)) + log.error( + "Could not find file purpose %s for uploading file %s" + % (file_purpose, path) + ) return False elif purpose: log.debug("File Purpose: %s" % purpose) @@ -466,22 +592,31 @@ def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose ftype = self.get_file_type(file_type) if file_type and not ftype: - log.error("Could not find file type %s for uploading file %s" % (file_type, path)) + log.error( + "Could not find file type %s for uploading file %s" % (file_type, path) + ) return False elif purpose: log.debug("File Type: %s" % ftype) - exists = self.get_single_result("file/", - query={"object_id": object_id, - "purpose__slug": file_purpose, - "content_type": 
contenttype['id']}) + exists = self.get_single_result( + "file/", + query={ + "object_id": object_id, + "purpose__slug": file_purpose, + "content_type": contenttype["id"], + }, + ) file_size = os.path.getsize(path) last_modified = datetime.datetime.fromtimestamp(os.path.getmtime(path)) if skip_md5_check and exists and exists["size_bytes"] == file_size: - recorded_mtime = datetime.datetime.fromtimestamp(time.mktime(time.strptime( - exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S"))) + recorded_mtime = datetime.datetime.fromtimestamp( + time.mktime( + time.strptime(exists["file_last_modified"], "%Y-%m-%dT%H:%M:%S") + ) + ) # Allow for sloppiness in NFS timestamps difference = recorded_mtime - last_modified if timedelta(minutes=-1) <= difference <= timedelta(minutes=1): @@ -490,24 +625,27 @@ def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose md5sum = md5sum_file(path) - log.info("MD5sum: %s\tFile size: %d\tLast modified: %s" % (md5sum, file_size, str(last_modified))) + log.info( + "MD5sum: %s\tFile size: %d\tLast modified: %s" + % (md5sum, file_size, str(last_modified)) + ) data = { - 'path': path, - 'content_type': contenttype["url"], - 'object_id': object_id, - 'purpose': purpose, - 'filetype': ftype, - 'md5sum': md5sum, - 'file_last_modified': last_modified, - 'size_bytes': file_size, + "path": path, + "content_type": contenttype["url"], + "object_id": object_id, + "purpose": purpose, + "filetype": ftype, + "md5sum": md5sum, + "file_last_modified": last_modified, + "size_bytes": file_size, } log.debug(data) if exists: log.info("Updating information for file %s" % path) - result = self.put(url=exists['url'], data=data) + result = self.put(url=exists["url"], data=data) else: log.info("Uploading information for file %s" % path) result = self.post("file/", data=data) @@ -520,26 +658,31 @@ def upload_file_attachment(self, path, contenttype_name, object_id, file_purpose def get_flowcelllane_contenttype(self): if not self.flowcelllane_contenttype: - self.flowcelllane_contenttype = self.get_contenttype('SequencingData.flowcelllane') + self.flowcelllane_contenttype = self.get_contenttype( + "SequencingData.flowcelllane" + ) return self.flowcelllane_contenttype def get_alignment_contenttype(self): - self.alignment_contenttype = self.get_contenttype('SequencingData.flowcelllanealignment') + self.alignment_contenttype = self.get_contenttype( + "SequencingData.flowcelllanealignment" + ) return self.alignment_contenttype def get_aggregation_contenttype(self): - self.aggregation_contenttype = self.get_contenttype('AggregationData.aggregation') + self.aggregation_contenttype = self.get_contenttype( + "AggregationData.aggregation" + ) return self.aggregation_contenttype def create_count_type(self, name): - log.info("Creating count type %s" % name) is_mapq = name.startswith("mapq") is_samflag = name.startswith("samflag") is_alignment = "readlength" in name - is_chromosome = not(is_mapq or is_samflag or is_alignment) + is_chromosome = not (is_mapq or is_samflag or is_alignment) data = { "is_chromosome": is_chromosome, @@ -559,37 +702,40 @@ def create_count_type(self, name): log.warn("Could not create count type %s (%s)" % (name, str(result))) return self.count_types[name] - # TODO : make sure that no more of one count type exists def get_alignment_counts(self, alignment_id): - log.info("Getting alignment counts for %d" % alignment_id) if not alignment_id in self.alignment_counts: - counts = self.get_list_result('flowcell_lane_count/', - query={"alignment":alignment_id}) + 
counts = self.get_list_result( + "flowcell_lane_count/", query={"alignment": alignment_id} + ) if counts is None: log.critical("Could not get counts for ALN%d" % alignment_id) - self.alignment_counts[alignment_id] = dict([(count['count_type_name'], count) for count in counts]) + self.alignment_counts[alignment_id] = dict( + [(count["count_type_name"], count) for count in counts] + ) return self.alignment_counts[alignment_id] def get_flowcell_lane(self, flowcell_lane_id): - return self.get_by_id('flowcell_lane', flowcell_lane_id) + return self.get_by_id("flowcell_lane", flowcell_lane_id) def get_library(self, library_id): - return self.get_by_id('library', library_id) + return self.get_by_id("library", library_id) def get_aggregation(self, aggregation_id): - return self.get_by_id('aggregation', aggregation_id) + return self.get_by_id("aggregation", aggregation_id) def get_rna_metrics(self, alignment_id): - exists = self.get_single_result('rna_alignment_metrics/', query={"alignment": alignment_id}) + exists = self.get_single_result( + "rna_alignment_metrics/", query={"alignment": alignment_id} + ) if not exists: log.error("Error finding RNA metrics for alignment %d" % alignment_id) return exists def upload_rna_metrics(self, alignment_id, rna_file): - content = open(rna_file, 'r') + content = open(rna_file, "r") metrics = dict() for line in content: values = line.split() @@ -605,22 +751,25 @@ def upload_rna_metrics(self, alignment_id, rna_file): else: data = {} - data.update({ - "alignment": "%s/flowcell_lane_alignment/%d/" % (self.api.api_url, alignment_id), - "input_reads": metrics[r'input_reads'], - "mapped_reads": metrics[r'mapped'], - "percent_rRNA": metrics[r'%rRNA'], - "percent_duplicates": metrics[r'%duplicates'], - "exon_intron": metrics[r'exon:intron'], - "percent_intergenic": metrics[r'%intergenic'], - "percent_chrM": metrics[r'%chrM'], - "percent_correct_strand": metrics[r'%correct_strand'] - }) + data.update( + { + "alignment": "%s/flowcell_lane_alignment/%d/" + % (self.api.api_url, alignment_id), + "input_reads": metrics[r"input_reads"], + "mapped_reads": metrics[r"mapped"], + "percent_rRNA": metrics[r"%rRNA"], + "percent_duplicates": metrics[r"%duplicates"], + "exon_intron": metrics[r"exon:intron"], + "percent_intergenic": metrics[r"%intergenic"], + "percent_chrM": metrics[r"%chrM"], + "percent_correct_strand": metrics[r"%correct_strand"], + } + ) if exists: # Currently (2014-12-22) this will fail, but that's a TODO on the LIMS side. 
log.info("Updating RNA metrics for alignment ID %d" % alignment_id) - result = self.put(url=data['url'], data=data) + result = self.put(url=data["url"], data=data) else: log.info("Uploading RNA metrics for alignment ID %d" % alignment_id) result = self.post("rna_alignment_metrics/", data=data) @@ -629,28 +778,29 @@ def upload_rna_metrics(self, alignment_id, rna_file): log.error("Could not upload RNA stats") def upload_barcode_report(self, barcode_file): - datastring = open(barcode_file, 'r').read() + datastring = open(barcode_file, "r").read() try: jsondata = json.loads(datastring) except ValueError: log.error("Barcode report %s is not valid JSON" % barcode_file) return - if jsondata['Sequencer'] == 'MiniSeq': - print(jsondata['BaseDir']) - flowcell_label = re.search( '.*_[AB](000[A-Z0-9]{6}).*$', jsondata['BaseDir'] ).group(1) + if jsondata["Sequencer"] == "MiniSeq": + print(jsondata["BaseDir"]) + flowcell_label = re.search( + ".*_[AB](000[A-Z0-9]{6}).*$", jsondata["BaseDir"] + ).group(1) print(flowcell_label) else: # make this more flexible eventually - flowcell_label = re.search( '.*_[AB]([A-Z0-9]{9})$', jsondata['BaseDir'] ).group(1) + flowcell_label = re.search( + ".*_[AB]([A-Z0-9]{9})$", jsondata["BaseDir"] + ).group(1) flowcell_url = self.get_flowcell_url_by_label(flowcell_label) - data = { - "flowcell": flowcell_url, - "json_data": datastring - } + data = {"flowcell": flowcell_url, "json_data": datastring} # TODO: Don't upload redundant barcodes. result = self.post("barcode_report/", data=data) @@ -658,11 +808,13 @@ def upload_barcode_report(self, barcode_file): def bulk_upload_counts(self, alignment_id, stats): # TODO: This isn't ready yet. - data = [{ - "object_id": alignment_id, - "content_type": "flowcelllanealignment", - "stats": stats, - }] + data = [ + { + "object_id": alignment_id, + "content_type": "flowcelllanealignment", + "stats": stats, + } + ] response = self.api.post_single_result(url_addition="stat/create", json=data) return response @@ -670,32 +822,37 @@ def upload_counts(self, alignment_id, counts_file): parsed = self.parse_counts(counts_file) response = self.bulk_upload_counts(alignment_id, self.parse_counts(counts_file)) if response is None: - log.error("Bulk upload failed: Counts file {} for ALN{}".format(counts_file, alignment_id)) + log.error( + "Bulk upload failed: Counts file {} for ALN{}".format( + counts_file, alignment_id + ) + ) else: log.info("Upload successful.") return # TODO: Remove below code - #log.warn("Counts: %s", self.get_list_result( + # log.warn("Counts: %s", self.get_list_result( # 'flowcell_lane_count/', query={"alignment":alignment_id} - #)) + # )) existing_counts = { - count['count_type_name']: (count['count'], count['url']) + count["count_type_name"]: (count["count"], count["url"]) for count in self.get_list_result( - 'flowcell_lane_count/', query={"alignment":alignment_id} + "flowcell_lane_count/", query={"alignment": alignment_id} ) } - #log.warn("Count types: %s", self.get_list_result("flowcell_lane_count_type")) + # log.warn("Count types: %s", self.get_list_result("flowcell_lane_count_type")) lane_count_types = { - ct['codename']: ct['url'] + ct["codename"]: ct["url"] for ct in self.get_list_result("flowcell_lane_count_type") } - for (key, value) in parsed.items(): + for key, value in parsed.items(): try: data = { - "alignment": "%s/flowcell_lane_alignment/%d/" % (self.api.api_url, alignment_id), + "alignment": "%s/flowcell_lane_alignment/%d/" + % (self.api.api_url, alignment_id), "count_type": lane_count_types[key], "count": value, 
} @@ -716,10 +873,9 @@ def upload_counts(self, alignment_id, counts_file): ) # else we don't need to do anything - def parse_counts(self, counts_file): stats = {} - with open(counts_file, 'r') as counts: + with open(counts_file, "r") as counts: for line in counts: values = line.split() count_type_name = values[0] @@ -729,8 +885,14 @@ def parse_counts(self, counts_file): stats[count_type_name] = count return stats - def upload_alignment_records(self, alignment_id, adapter_file=None, version_file=None, start_time = False, complete_time = False): - + def upload_alignment_records( + self, + alignment_id, + adapter_file=None, + version_file=None, + start_time=False, + complete_time=False, + ): log.info("Uploading alignment records for %d" % alignment_id) if not (adapter_file or version_file or start_time or complete_time): @@ -740,10 +902,10 @@ def upload_alignment_records(self, alignment_id, adapter_file=None, version_file alignment = self.get_by_id("flowcell_lane_alignment", alignment_id) if version_file: - alignment["versions"] = open(version_file, 'r').read() + alignment["versions"] = open(version_file, "r").read() if adapter_file: - alignment["trim_adapters"] = open(adapter_file, 'r').read() + alignment["trim_adapters"] = open(adapter_file, "r").read() if start_time: alignment["start_time"] = datetime.datetime.now() @@ -751,18 +913,19 @@ def upload_alignment_records(self, alignment_id, adapter_file=None, version_file if complete_time: alignment["complete_time"] = datetime.datetime.now() - result = self.patch(url=alignment['url'], data=alignment) + result = self.patch(url=alignment["url"], data=alignment) if result: log.info("Alignment %d updated" % alignment_id) log.debug(result) else: - log.debug("No result for uploading %s to %s" % (str(alignment), alignment['url'])) + log.debug( + "No result for uploading %s to %s" % (str(alignment), alignment["url"]) + ) return True def upload_spot(self, alignment_id, spot_file, dup_file): - if not spot_file and dup_file: log.error("Error, do not have both files for alignment %s" % alignment_id) @@ -770,7 +933,8 @@ def upload_spot(self, alignment_id, spot_file, dup_file): percent_dup = get_dup_score(dup_file) data = { - "alignment": "%s/flowcell_lane_alignment/%d/" % (self.api.api_url, alignment_id) + "alignment": "%s/flowcell_lane_alignment/%d/" + % (self.api.api_url, alignment_id) } if spot_stats: @@ -780,8 +944,9 @@ def upload_spot(self, alignment_id, spot_file, dup_file): log.debug(data["percent_duplication"]) - origspots = self.get_list_result("flowcell_lane_spot/", - query={"alignment": alignment_id}) + origspots = self.get_list_result( + "flowcell_lane_spot/", query={"alignment": alignment_id} + ) if len(origspots) > 1: log.error("Could not figure out which SPOT score to upload to!") elif len(origspots) == 0: @@ -791,33 +956,37 @@ def upload_spot(self, alignment_id, spot_file, dup_file): log.error("Could not upload SPOT") else: origspot = origspots[0] - if (data["spot_score"] != origspot["spot_score"] + if ( + data["spot_score"] != origspot["spot_score"] or data["total_tags"] != origspot["total_tags"] or data["tags_in_hotspots"] != origspot["tags_in_hotspots"] or data["percent_duplication"] != origspot["percent_duplication"] ): log.info("Updating SPOT score for %d" % alignment_id) - result = self.patch(url=origspot['url'], data=data) + result = self.patch(url=origspot["url"], data=data) if not result: log.error("Could not upload SPOT") def get_fastqc_contents(self, filename): - - file_in_zip = "%s/fastqc_data.txt" % 
os.path.splitext(os.path.basename(filename))[0] + file_in_zip = ( + "%s/fastqc_data.txt" % os.path.splitext(os.path.basename(filename))[0] + ) with ZipFile(filename) as fastqc_zip: with fastqc_zip.open(file_in_zip) as fastqc_report: - return fastqc_report.read() + return fastqc_report.read() return None def upload_fastqc(self, flowcell_lane_id, filename): - if not self.fastqc_tags: self.fastqc_tags = self.get_fastqc_tags() if not self.flowcelllane_contenttype: self.flowcelllane_contenttype = self.get_flowcelllane_contenttype() - m = re.search(r'(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])_(?PR[12])', filename) + m = re.search( + r"(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])_(?PR[12])", + filename, + ) if not m: log.error("Could not figure out information for %s" % filename) @@ -828,11 +997,11 @@ def upload_fastqc(self, flowcell_lane_id, filename): fastqc_report = self.get_fastqc_contents(filename) if not fastqc_report: - log.error("Could not read fastqc report %s" % filename) - return False + log.error("Could not read fastqc report %s" % filename) + return False - samplename = m.group('samplename') - read = m.group('read') + samplename = m.group("samplename") + read = m.group("read") lane_info = self.get_flowcell_lane(flowcell_lane_id) @@ -843,33 +1012,40 @@ def upload_fastqc(self, flowcell_lane_id, filename): upload = dict() - upload['tags'] = [tag['url']] - upload['raw_data'] = fastqc_report - upload['content_type'] = self.flowcelllane_contenttype["url"] - upload['object_id'] = lane_info['id'] - upload['label'] = "FC%s %s %s %s %s" % (lane_info['flowcell_label'], samplename, str(lane_info["lane"]), lane_info["barcode_index"], read) + upload["tags"] = [tag["url"]] + upload["raw_data"] = fastqc_report + upload["content_type"] = self.flowcelllane_contenttype["url"] + upload["object_id"] = lane_info["id"] + upload["label"] = "FC%s %s %s %s %s" % ( + lane_info["flowcell_label"], + samplename, + str(lane_info["lane"]), + lane_info["barcode_index"], + read, + ) # does this report already exist? 
report = self.get_single_result( - 'fastqc_report/', + "fastqc_report/", query={ - "label": upload['label'], - "object_id": upload['object_id'], - "content_type": self.get_flowcelllane_contenttype()['id'] - }) + "label": upload["label"], + "object_id": upload["object_id"], + "content_type": self.get_flowcelllane_contenttype()["id"], + }, + ) if report: # replace content - if 'raw_data' not in report or report['raw_data'] != upload['raw_data']: - log.info("Updating report %s" % upload['label']) - result = self.patch(url=report['url'], data=upload) + if "raw_data" not in report or report["raw_data"] != upload["raw_data"]: + log.info("Updating report %s" % upload["label"]) + result = self.patch(url=report["url"], data=upload) if result: log.debug(result) else: - log.error("Could not update FastQC report %s" % report['url']) + log.error("Could not update FastQC report %s" % report["url"]) else: - log.info("Uploading new fastqc report %s" % upload['label']) + log.info("Uploading new fastqc report %s" % upload["label"]) result = self.post("fastqc_report/", data=upload) if result: @@ -878,9 +1054,10 @@ def upload_fastqc(self, flowcell_lane_id, filename): log.error("Could not upload new FastQC report") def upload_fastqc_counts(self, alignment_id): - if not alignment_id: - logging.critical("Could not upload fastqc_counts without an alignment id given") + logging.critical( + "Could not upload fastqc_counts without an alignment id given" + ) return self.get_alignment_counts(alignment_id) @@ -889,7 +1066,6 @@ def upload_fastqc_counts(self, alignment_id): filtered = 0 for fastqc_file, fastqc_counts in self.fastqc_counts.items(): - if not fastqc_counts: log.error("Could not get counts from %s for uploading" % fastqc_file) return @@ -898,23 +1074,20 @@ def upload_fastqc_counts(self, alignment_id): filtered += fastqc_counts["filtered"] # FastQC's definition of total differs from ours - counts = { - "total": total + filtered, - "qc": filtered, - "pf": total - } + counts = {"total": total + filtered, "qc": filtered, "pf": total} if not self.bulk_upload_counts(alignment_id, counts): log.error("Could not upload FastQC counts") - def upload_picard_metric(self, alignment_id, flowcell_lane_id, aggregation_id, filename, metric_name): - + def upload_picard_metric( + self, alignment_id, flowcell_lane_id, aggregation_id, filename, metric_name + ): if not self.picard_metrics: self.picard_metrics = self.get_picard_metrics() - picard_metric = None + picard_metric = None try: - picard_metric = open(filename, 'r').read() + picard_metric = open(filename, "r").read() except: log.error("Could not read picard metric file %s" % filename) return None @@ -938,9 +1111,13 @@ def upload_picard_metric(self, alignment_id, flowcell_lane_id, aggregation_id, f if not lane_info: return False - label = "FC%s %s %s %s %s" % (lane_info['flowcell_label'], - lane_info["samplesheet_name"], str(lane_info["lane"]), - lane_info["barcode_index"], metric_name) + label = "FC%s %s %s %s %s" % ( + lane_info["flowcell_label"], + lane_info["samplesheet_name"], + str(lane_info["lane"]), + lane_info["barcode_index"], + metric_name, + ) elif aggregation_id: object_id = aggregation_id if not self.aggregation_contenttype: @@ -948,41 +1125,50 @@ def upload_picard_metric(self, alignment_id, flowcell_lane_id, aggregation_id, f content_type = self.aggregation_contenttype aggregation_info = self.get_aggregation(aggregation_id) log.debug(aggregation_info) - library_info = self.get_by_full_url(aggregation_info['library']) + library_info = 
self.get_by_full_url(aggregation_info["library"]) if library_info: log.debug(library_info) else: - log.error("Could not fetch %s" % aggregation_info['library']) + log.error("Could not fetch %s" % aggregation_info["library"]) return False - label = "AGG%d LN%d %s" % (aggregation_id, library_info['number'], metric_name) + label = "AGG%d LN%d %s" % ( + aggregation_id, + library_info["number"], + metric_name, + ) # does this report already exist? log.debug("Checking for existing report...") existing = self.get_single_result( - 'picard_report/', + "picard_report/", query={ "object_id": object_id, - "content_type": content_type['id'], - "metric": metric['id'], + "content_type": content_type["id"], + "metric": metric["id"], "label": label, - }) + }, + ) - if existing and 'raw_data' in existing and existing['raw_data'] == picard_metric: + if ( + existing + and "raw_data" in existing + and existing["raw_data"] == picard_metric + ): log.info("Picard report is the same, not uploading") return upload = dict() - upload['metrics'] = [metric['url']] - upload['raw_data'] = picard_metric - upload['content_type'] = content_type["url"] - upload['object_id'] = object_id - upload['label'] = label + upload["metrics"] = [metric["url"]] + upload["raw_data"] = picard_metric + upload["content_type"] = content_type["url"] + upload["object_id"] = object_id + upload["label"] = label if existing is not None: - result = self.patch(url=existing['url'], json=upload) + result = self.patch(url=existing["url"], json=upload) else: - log.info("Uploading new picard report %s" % upload['label']) + log.info("Uploading new picard report %s" % upload["label"]) result = self.post("picard_report/", json=upload) if not result: @@ -990,9 +1176,10 @@ def upload_picard_metric(self, alignment_id, flowcell_lane_id, aggregation_id, f else: log.debug(result) -def main(args = sys.argv): + +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -1018,7 +1205,6 @@ def main(args = sys.argv): sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) - if not poptions.token and "LIMS_API_TOKEN" in os.environ: token = os.environ["LIMS_API_TOKEN"] elif poptions.token: @@ -1036,13 +1222,27 @@ def main(args = sys.argv): uploader.upload_fastqc_counts(poptions.alignment_id) if poptions.inserts_file: - uploader.upload_picard_metric(poptions.alignment_id, poptions.flowcell_lane_id, poptions.aggregation_id, poptions.inserts_file, "CollectInsertSizeMetrics") + uploader.upload_picard_metric( + poptions.alignment_id, + poptions.flowcell_lane_id, + poptions.aggregation_id, + poptions.inserts_file, + "CollectInsertSizeMetrics", + ) if poptions.dups_file: - uploader.upload_picard_metric(poptions.alignment_id, poptions.flowcell_lane_id, poptions.aggregation_id, poptions.dups_file, "MarkDuplicates") + uploader.upload_picard_metric( + poptions.alignment_id, + poptions.flowcell_lane_id, + poptions.aggregation_id, + poptions.dups_file, + "MarkDuplicates", + ) if poptions.spot_file or poptions.spot_dup_file: - uploader.upload_spot(poptions.alignment_id, poptions.spot_file, poptions.spot_dup_file) + uploader.upload_spot( + poptions.alignment_id, poptions.spot_file, poptions.spot_dup_file + ) if poptions.counts_file: uploader.upload_counts(poptions.alignment_id, poptions.counts_file) @@ -1056,10 +1256,19 @@ def main(args = sys.argv): if poptions.alignment_id and poptions.clear_align_stats: 
uploader.clear_alignment_stats(poptions.alignment_id) - if poptions.alignment_id and (poptions.version_file or poptions.adapter_file or poptions.align_start_time or poptions.align_complete_time): - uploader.upload_alignment_records(poptions.alignment_id, - version_file=poptions.version_file, adapter_file=poptions.adapter_file, - start_time=poptions.align_start_time, complete_time=poptions.align_complete_time) + if poptions.alignment_id and ( + poptions.version_file + or poptions.adapter_file + or poptions.align_start_time + or poptions.align_complete_time + ): + uploader.upload_alignment_records( + poptions.alignment_id, + version_file=poptions.version_file, + adapter_file=poptions.adapter_file, + start_time=poptions.align_start_time, + complete_time=poptions.align_complete_time, + ) if poptions.aggregation_id and poptions.clear_aggregation_stats: uploader.clear_aggregation_stats(poptions.aggregation_id) @@ -1074,12 +1283,23 @@ def main(args = sys.argv): uploader.clear_flowcell_cache(poptions.flowcell) if poptions.attach_file: - uploader.upload_file_attachment(poptions.attach_file, poptions.attach_file_contenttype, poptions.attach_file_objectid, - file_type=poptions.attach_file_type, file_purpose=poptions.attach_file_purpose, skip_md5_check=poptions.skip_md5_check) + uploader.upload_file_attachment( + poptions.attach_file, + poptions.attach_file_contenttype, + poptions.attach_file_objectid, + file_type=poptions.attach_file_type, + file_purpose=poptions.attach_file_purpose, + skip_md5_check=poptions.skip_md5_check, + ) if poptions.attach_directory: - uploader.upload_directory_attachment(poptions.attach_directory, poptions.attach_file_contenttype, poptions.attach_file_objectid, - file_purpose=poptions.attach_file_purpose) + uploader.upload_directory_attachment( + poptions.attach_directory, + poptions.attach_file_contenttype, + poptions.attach_file_objectid, + file_purpose=poptions.attach_file_purpose, + ) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index c94415d6..0c017e4c 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,4 +1,4 @@ -#import csv +# import csv import argparse import functools import json @@ -18,20 +18,27 @@ # Globals for storing our mapping (saves LIMS hits) POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} -LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} -LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} +LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} +LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} LANES_WITH_DIRECT_POOL = {} LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # Keys to copy directly from the api/locus/ endpoint -LOCUS_KEYS = ["genome_label", "chromosome_name", "genes", "name", - "genomic_feature", "genomic_coordinate_genome_label", - "genomic_coordinate_chromosome_name", "genomic_coordinate_start", - "genomic_coordinate_end"] - -STAMPIPES = os.getenv('STAMPIPES', '~/stampipes') +LOCUS_KEYS = [ + "genome_label", + "chromosome_name", + "genes", + "name", + "genomic_feature", + "genomic_coordinate_genome_label", + "genomic_coordinate_chromosome_name", + "genomic_coordinate_start", + "genomic_coordinate_end", +] + +STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") SCRIPT_OPTIONS = { "quiet": False, @@ -51,57 +58,112 @@ "auto_aggregate": False, } -def 
parser_setup(): +def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument("-a", "--api", dest="base_api_url", - help="The base API url, if not the default live LIMS.") - parser.add_argument("-t", "--token", dest="token", - help="Your authentication token. Required.") - - #parser.add_argument("--alignment", dest="align_ids", type=int, action="append", + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "-a", + "--api", + dest="base_api_url", + help="The base API url, if not the default live LIMS.", + ) + parser.add_argument( + "-t", "--token", dest="token", help="Your authentication token. Required." + ) + + # parser.add_argument("--alignment", dest="align_ids", type=int, action="append", # help="Run for this particular alignment.") - parser.add_argument("--flowcell", dest="flowcell_label", - help="Run for this particular flowcell label.") - #parser.add_argument("--pool", dest="pool", + parser.add_argument( + "--flowcell", + dest="flowcell_label", + help="Run for this particular flowcell label.", + ) + # parser.add_argument("--pool", dest="pool", # help="Run for this particular pool.") - #parser.add_argument("--tag", dest="tag", + # parser.add_argument("--tag", dest="tag", # help="Run for alignments tagged here.") - #parser.add_argument("--project", dest="project", + # parser.add_argument("--project", dest="project", # help="Run for alignments in this project.") - parser.add_argument("--script_template", dest="script_template", - help="The script template to use.") - parser.add_argument("--qsub_priority", dest="qsub_priority", type=int, - help="The priority to give scripts we are submitting.") - - parser.add_argument("-o", "--outfile", dest="outfile", - help="Append commands to run this alignment to this file.") - parser.add_argument("-b", "--sample-script-basename", dest="sample_script_basename", - help="Name of the script that goes after the sample name.") - parser.add_argument("--qsub-prefix", dest="qsub_prefix", - help="Name of the qsub prefix in the qsub job name. Use a . 
in front to make it non-cluttery.") - parser.add_argument("--qsub-queue", dest="qsub_queue", - help="Name of the SLURM partition") - parser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", - help="Take no action, only print messages.") - parser.add_argument("--no-mask", dest="no_mask", action="store_true", - help="Don't use any barcode mask.") - parser.add_argument("--redo_completed", dest="redo_completed", action="store_true", - help="Redo alignments marked as completed.") - #parser.add_argument("--auto_aggregate", dest="auto_aggregate", help="Script created will also run auto-aggregations after alignments finished.", - #action="store_true") - parser.add_argument("--align_base_dir", dest="align_base_dir", - help="Create the alignment directory in this directory") - - parser.add_argument("--listout", dest="simple_output", action="store_true", - help="Write only a list of alignments to run, rather than a script to submit them") + parser.add_argument( + "--script_template", dest="script_template", help="The script template to use." + ) + parser.add_argument( + "--qsub_priority", + dest="qsub_priority", + type=int, + help="The priority to give scripts we are submitting.", + ) + + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + help="Append commands to run this alignment to this file.", + ) + parser.add_argument( + "-b", + "--sample-script-basename", + dest="sample_script_basename", + help="Name of the script that goes after the sample name.", + ) + parser.add_argument( + "--qsub-prefix", + dest="qsub_prefix", + help="Name of the qsub prefix in the qsub job name. Use a . in front to make it non-cluttery.", + ) + parser.add_argument( + "--qsub-queue", dest="qsub_queue", help="Name of the SLURM partition" + ) + parser.add_argument( + "-n", + "--dry-run", + dest="dry_run", + action="store_true", + help="Take no action, only print messages.", + ) + parser.add_argument( + "--no-mask", + dest="no_mask", + action="store_true", + help="Don't use any barcode mask.", + ) + parser.add_argument( + "--redo_completed", + dest="redo_completed", + action="store_true", + help="Redo alignments marked as completed.", + ) + # parser.add_argument("--auto_aggregate", dest="auto_aggregate", help="Script created will also run auto-aggregations after alignments finished.", + # action="store_true") + parser.add_argument( + "--align_base_dir", + dest="align_base_dir", + help="Create the alignment directory in this directory", + ) + + parser.add_argument( + "--listout", + dest="simple_output", + action="store_true", + help="Write only a list of alignments to run, rather than a script to submit them", + ) parser.set_defaults(**SCRIPT_OPTIONS) parser.set_defaults(quiet=False, debug=False) @@ -110,9 +172,7 @@ def parser_setup(): class ProcessSetUp(object): - def __init__(self, args, api_url, token): - self.token = token self.api_url = api_url self.qsub_scriptname = args.sample_script_basename @@ -124,21 +184,20 @@ def __init__(self, args, api_url, token): self.script_template = args.script_template self.qsub_priority = args.qsub_priority self.qsub_queue = args.qsub_queue - #self.auto_aggregate = args.auto_aggregate + # self.auto_aggregate = args.auto_aggregate self.align_base_dir = args.align_base_dir self.simple_output = args.simple_output self.session = requests.Session() - self.session.headers.update({'Authorization': "Token %s" % self.token}) + self.session.headers.update({"Authorization": "Token %s" % self.token}) self.pool = ThreadPoolExecutor(max_workers=10) 
@functools.lru_cache(maxsize=None) def api_single_result(self, url_addition=None, url=None): - if url_addition: - url = "%s/%s" % (self.api_url, url_addition) + url = "%s/%s" % (self.api_url, url_addition) request = self.session.get(url) @@ -152,7 +211,6 @@ def api_single_result(self, url_addition=None, url=None): @functools.lru_cache(maxsize=None) def api_list_result(self, url_addition=None, url=None): - more = True results = [] @@ -160,7 +218,6 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) request = self.session.get(url) @@ -178,18 +235,20 @@ def api_list_result(self, url_addition=None, url=None): return results def get_align_process_info(self, alignment_id): - - process_info = self.api_single_result("flowcell_lane_alignment/%d/processing_information/" % alignment_id) + process_info = self.api_single_result( + "flowcell_lane_alignment/%d/processing_information/" % alignment_id + ) if not process_info: - logging.critical("Could not find processing info for alignment %d\n" % alignment_id) + logging.critical( + "Could not find processing info for alignment %d\n" % alignment_id + ) logging.critical(process_info) sys.exit(1) return process_info def get_process_template(self, align_id, process_template_id): - if not process_template_id: logging.critical("No process template for alignment %d\n" % align_id) return None @@ -197,7 +256,9 @@ def get_process_template(self, align_id, process_template_id): info = self.api_single_result("process_template/%d/" % (process_template_id)) if not info: - logging.critical("Could not find processing template for ID %d\n" % process_template_id) + logging.critical( + "Could not find processing template for ID %d\n" % process_template_id + ) sys.exit(1) return info @@ -214,20 +275,23 @@ def setup_alignments(self, align_ids, parallel=True): else: logging.debug("ALN%d result received, OK" % id) if not all_okay: - #logging.critical("Errors during setup, exiting") - logging.error("Errors during setup, but continuing with other alignments") + # logging.critical("Errors during setup, exiting") + logging.error( + "Errors during setup, but continuing with other alignments" + ) # Sequential version, helpful for debugging else: for aln_id in align_ids: self.setup_alignment(aln_id) def setup_alignment(self, align_id): - try: processing_info = self.get_align_process_info(align_id) - alignment = self.api_single_result("flowcell_lane_alignment/%d/" % (align_id)) + alignment = self.api_single_result( + "flowcell_lane_alignment/%d/" % (align_id) + ) - if self.redo_completed or not alignment['complete_time']: + if self.redo_completed or not alignment["complete_time"]: self.create_script(processing_info, alignment["id"]) return (align_id, None) else: @@ -238,7 +302,9 @@ def setup_alignment(self, align_id): return (align_id, e) def get_lane_file(self, lane_id, purpose): - candidates = self.api_list_result("file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id)) + candidates = self.api_list_result( + "file/?content_type=40&purpose__slug=%s&object_id=%d" % (purpose, lane_id) + ) if not candidates: return None @@ -248,14 +314,17 @@ def get_lane_file(self, lane_id, purpose): return candidates[0] def setup_tag(self, tag_slug): - - align_tags = self.api_list_result("tagged_object/?content_type=47&tag__slug=%s" % tag_slug) + align_tags = self.api_list_result( + "tagged_object/?content_type=47&tag__slug=%s" % tag_slug + ) 
self.setup_alignments([align_tag["object_id"] for align_tag in align_tags]) def setup_project(self, project_id): logging.info("Setting up project #%s" % project_id) - alignments = self.api_list_result("flowcell_lane_alignment/?lane__sample__project=%s" % project_id) + alignments = self.api_list_result( + "flowcell_lane_alignment/?lane__sample__project=%s" % project_id + ) self.setup_alignments([alignment["id"] for alignment in alignments]) def setup_flowcell(self, flowcell_label): @@ -263,7 +332,7 @@ def setup_flowcell(self, flowcell_label): align_ids = self.get_alignment_ids(flowcell_label) logging.debug("align ids: %s", align_ids) - #alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + # alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) # Disable parallelism so that caching works self.setup_alignments(align_ids, parallel=False) self.add_stats_upload(flowcell_label) @@ -278,7 +347,7 @@ def get_alignment_ids(self, flowcell_label: str) -> [int]: def extract_id_from_url(url): if url is None: return None - return int(re.findall(r'\d+', url)[-1]) + return int(re.findall(r"\d+", url)[-1]) # Storage for the 3 layers of mapping between alignments and pools global POOL_KEY_TO_LIB_IDS @@ -287,82 +356,92 @@ def extract_id_from_url(url): global LANES_WITH_DIRECT_POOL POOL_KEY_TO_LIB_IDS = defaultdict(list) # {(pool_id, lane_number): [lib_id]} - LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} - LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} + LIB_ID_TO_LANE_IDS = defaultdict(list) # {lib_id: [lane_ids]} + LANE_ID_TO_ALN_IDS = defaultdict(list) # {lane_id: [aln_ids]} library_info = set() - for lane in self.api_list_result("flowcell_lane/?flowcell__label=%s&page_size=1000" % flowcell_label): - lib_url = lane['library'] - lane_lane = lane['lane'] + for lane in self.api_list_result( + "flowcell_lane/?flowcell__label=%s&page_size=1000" % flowcell_label + ): + lib_url = lane["library"] + lane_lane = lane["lane"] if lib_url is not None: library_info.add((lib_url, lane_lane)) lib_id = extract_id_from_url(lib_url) - LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) + LIB_ID_TO_LANE_IDS[lib_id].append(lane["id"]) else: # HACKS BELOW # Get pool manually - pool_url = lane['library_pool'] + pool_url = lane["library_pool"] pool_id = extract_id_from_url(pool_url) - LANES_WITH_DIRECT_POOL[lane['id']] = pool_id + LANES_WITH_DIRECT_POOL[lane["id"]] = pool_id pool_key = (pool_id, lane_lane) - pool_number = int(lane['library_pool__number']) + pool_number = int(lane["library_pool__number"]) # Get Library info lp_info = self.api_single_result(url=pool_url) - sl_info = self.api_single_result(url=lp_info['sublibrary']) - cl_info = self.api_single_result(url=sl_info['cell_library']) - lib_ids = [extract_id_from_url(lib_url) for lib_url in cl_info["libraries"]] + sl_info = self.api_single_result(url=lp_info["sublibrary"]) + cl_info = self.api_single_result(url=sl_info["cell_library"]) + lib_ids = [ + extract_id_from_url(lib_url) for lib_url in cl_info["libraries"] + ] for lib_url in cl_info["libraries"]: library_info.add((lib_url, lane_lane)) for lib_id in lib_ids: POOL_KEY_TO_LIB_IDS[pool_key].append(lib_id) - LIB_ID_TO_LANE_IDS[lib_id].append(lane['id']) - + LIB_ID_TO_LANE_IDS[lib_id].append(lane["id"]) # Set of poolnums + lane pool_info = set() for info in library_info: lib_info = self.api_single_result(url=info[0]) - for pool in lib_info['librarypools']: + for 
pool in lib_info["librarypools"]: pool_info.add((pool["number"], info[1])) key = (pool["id"], info[1]) - POOL_KEY_TO_LIB_IDS[key].append(lib_info['id']) + POOL_KEY_TO_LIB_IDS[key].append(lib_info["id"]) - all_alignments = self.api_list_result("flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label) + all_alignments = self.api_list_result( + "flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" + % flowcell_label + ) direct_alns = set() for aln in all_alignments: - lane_id = extract_id_from_url(aln['lane']) - LANE_ID_TO_ALN_IDS[lane_id].append(aln['id']) + lane_id = extract_id_from_url(aln["lane"]) + LANE_ID_TO_ALN_IDS[lane_id].append(aln["id"]) if lane_id in LANES_WITH_DIRECT_POOL: - direct_alns.add(aln['id']) - + direct_alns.add(aln["id"]) # Find the minimum alignment ID for each pool/lane combination - lowest_aln_for_pool = {pool_key: None for pool_key in POOL_KEY_TO_LIB_IDS.keys()} - for (pool_key, lib_ids) in POOL_KEY_TO_LIB_IDS.items(): + lowest_aln_for_pool = { + pool_key: None for pool_key in POOL_KEY_TO_LIB_IDS.keys() + } + for pool_key, lib_ids in POOL_KEY_TO_LIB_IDS.items(): for lib_id in lib_ids: for lane_id in LIB_ID_TO_LANE_IDS[lib_id]: for aln_id in LANE_ID_TO_ALN_IDS[lane_id]: cur_aln = lowest_aln_for_pool[pool_key] - logging.debug("%s, %d, %d, %d < %s?", - pool_key, lib_id, lane_id, aln_id, cur_aln) + logging.debug( + "%s, %d, %d, %d < %s?", + pool_key, + lib_id, + lane_id, + aln_id, + cur_aln, + ) if cur_aln is None or cur_aln > aln_id: lowest_aln_for_pool[pool_key] = aln_id - align_ids = set(lowest_aln_for_pool.values()).union( direct_alns ) + align_ids = set(lowest_aln_for_pool.values()).union(direct_alns) logging.debug("POOL_KEY_TO_LIB_IDS %s", POOL_KEY_TO_LIB_IDS) logging.debug("LIB_ID_TO_LANE_IDS %s", LIB_ID_TO_LANE_IDS) logging.debug("LANE_ID_TO_ALN_IDS %s", LANE_ID_TO_ALN_IDS) logging.debug("ALN IDS %s", align_ids) return list(align_ids) - - - - #def auto_aggregation_script(self,flowcell_label,alignments): + # def auto_aggregation_script(self,flowcell_label,alignments): # aaname_sentinel = "auto_agg_sentinel.%s" % (flowcell_label) # if not self.outfile: @@ -390,7 +469,6 @@ def extract_id_from_url(url): # outfile.close() def add_script(self, align_id, processing_info, script_file, sample_name): - ram_megabytes = 4000 if not self.outfile: @@ -398,17 +476,31 @@ def add_script(self, align_id, processing_info, script_file, sample_name): outfile = sys.stdout else: logging.debug("Logging script to %s" % self.outfile) - outfile = open(self.outfile, 'a') + outfile = open(self.outfile, "a") if self.simple_output: outfile.write(script_file + "\n") else: outfile.write("cd %s && " % os.path.dirname(script_file)) - fullname = "%s%s-%s-ALIGN#%d" % (self.qsub_prefix,sample_name,processing_info['flowcell']['label'],align_id) - outfile.write("jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING=\"$PROCESSING,$jobid\"\n\n" % (fullname, fullname, fullname, self.qsub_queue, ram_megabytes, script_file)) + fullname = "%s%s-%s-ALIGN#%d" % ( + self.qsub_prefix, + sample_name, + processing_info["flowcell"]["label"], + align_id, + ) + outfile.write( + 'jobid=$(sbatch --export=ALL -J %s -o %s.o%%A -e %s.e%%A --partition=%s --cpus-per-task=1 --ntasks=1 --mem-per-cpu=%d --parsable --oversubscribe <<__ALIGNPROC__\n#!/bin/bash\nbash %s\n__ALIGNPROC__\n)\nPROCESSING="$PROCESSING,$jobid"\n\n' + % ( + 
fullname, + fullname, + fullname, + self.qsub_queue, + ram_megabytes, + script_file, + ) + ) outfile.close() - def add_stats_upload(self, flowcell_label): job_name = ".upload-altcode-%s" % flowcell_label template = textwrap.dedent( @@ -418,23 +510,25 @@ def add_stats_upload(self, flowcell_label): sbatch --export=ALL -J {job_name} -o {job_name}.o%A -e {job_name}.e%A --partition={queue} --cpus-per-task=1 --ntasks=1 $sentinel_dependencies --mem-per-cpu=1000 --parsable --oversubscribe <<__UPLOAD_POOL_DATA__ #!/bin/bash python $STAMPIPES/scripts/altcode/upload_stats.py "$PWD" - __UPLOAD_POOL_DATA__""") + __UPLOAD_POOL_DATA__""" + ) content = template.format( label=flowcell_label, job_name=job_name, queue=self.qsub_queue, ) - with open(self.outfile, 'a') as outfile: - outfile.write(content) + with open(self.outfile, "a") as outfile: + outfile.write(content) def get_script_template(self, process_template): - if self.script_template: script_path = self.script_template else: - script_path = os.path.expandvars(process_template["process_version"]["script_location"]) - return open(script_path, 'r').read() + script_path = os.path.expandvars( + process_template["process_version"]["script_location"] + ) + return open(script_path, "r").read() def create_script(self, processing_info, align_id): logging.debug("Creating script for ALN%d", align_id) @@ -446,39 +540,58 @@ def create_script(self, processing_info, align_id): logging.error("Alignment %d has no process template" % align_id) return False - process_template = self.get_process_template(align_id, alignment["process_template"]) + process_template = self.get_process_template( + align_id, alignment["process_template"] + ) if not process_template: return False - flowcell_directory = processing_info['flowcell']['directory'] + flowcell_directory = processing_info["flowcell"]["directory"] share_dir = lane.get("project_share_directory") if share_dir: flowcell_directory = os.path.join(share_dir, "alignments") if not flowcell_directory: - logging.error("Alignment %d has no flowcell directory for flowcell %s" % (align_id, processing_info['flowcell']['label'])) + logging.error( + "Alignment %d has no flowcell directory for flowcell %s" + % (align_id, processing_info["flowcell"]["label"]) + ) return False - if lane.get('library'): - lib_info_response = self.api_single_result("library/?number=%d" % int(lane["library"]))["results"] + if lane.get("library"): + lib_info_response = self.api_single_result( + "library/?number=%d" % int(lane["library"]) + )["results"] assert len(lib_info_response) == 1 lib_info = lib_info_response[0] logging.debug("lib info is %s", lib_info) pool_name = lib_info["librarypools"][0]["object_name"] logging.debug("pool is %s", pool_name) else: - pool_name = lane['samplesheet_name'] + pool_name = lane["samplesheet_name"] - fastq_directory = os.path.join(flowcell_directory, "Project_%s" % lane['project'], "LibraryPool_%s" % pool_name) + fastq_directory = os.path.join( + flowcell_directory, + "Project_%s" % lane["project"], + "LibraryPool_%s" % pool_name, + ) # Reset the alignment's sample name if we decied not to use the barcode index mask if self.no_mask: - alignment['sample_name'] = "%s_%s_L00%d" % (lane['samplesheet_name'], lane['barcode_index'], lane['lane']) - - align_dir = "align_%d_%s_%s" % (alignment['id'], alignment['genome_index'], alignment['aligner']) - if alignment['aligner_version']: - align_dir = "%s-%s" % (align_dir, alignment['aligner_version']) + alignment["sample_name"] = "%s_%s_L00%d" % ( + lane["samplesheet_name"], + 
lane["barcode_index"], + lane["lane"], + ) + + align_dir = "align_%d_%s_%s" % ( + alignment["id"], + alignment["genome_index"], + alignment["aligner"], + ) + if alignment["aligner_version"]: + align_dir = "%s-%s" % (align_dir, alignment["aligner_version"]) script_directory = os.path.join(fastq_directory, align_dir) if self.align_base_dir: @@ -487,89 +600,109 @@ def create_script(self, processing_info, align_id): r1_fastq = self.get_lane_file(lane["id"], "r1-fastq") if not r1_fastq: - logging.error("Missing r1-fastq for lane %d (alignment %d) - check dir %s" % (lane["id"], alignment["id"], fastq_directory)) + logging.error( + "Missing r1-fastq for lane %d (alignment %d) - check dir %s" + % (lane["id"], alignment["id"], fastq_directory) + ) return False - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: r2_fastq = self.get_lane_file(lane["id"], "r2-fastq") if not r2_fastq: - logging.error("Missing r2-fastq for lane %d (alignment %d)" % (lane["id"], alignment["id"])) + logging.error( + "Missing r2-fastq for lane %d (alignment %d)" + % (lane["id"], alignment["id"]) + ) return False - script_file = os.path.join( script_directory, "%s-%s" % (alignment['sample_name'], self.qsub_scriptname) ) + script_file = os.path.join( + script_directory, "%s-%s" % (alignment["sample_name"], self.qsub_scriptname) + ) logging.info("Will write to %s" % script_file) - # Set up & add environment variables env_vars = OrderedDict() - env_vars["SAMPLE_NAME"] = alignment['sample_name'] - env_vars["BWAINDEX"] = alignment['genome_index_location'] - env_vars["GENOME"] = alignment['genome_index'] - env_vars["ASSAY"] = lane['assay'] - env_vars["READLENGTH"] = processing_info['flowcell']['read_length'] + env_vars["SAMPLE_NAME"] = alignment["sample_name"] + env_vars["BWAINDEX"] = alignment["genome_index_location"] + env_vars["GENOME"] = alignment["genome_index"] + env_vars["ASSAY"] = lane["assay"] + env_vars["READLENGTH"] = processing_info["flowcell"]["read_length"] try: - env_vars["LIBRARY_KIT"] = '"' + processing_info['libraries'][0]['library_kit_method'] + '"' + env_vars["LIBRARY_KIT"] = ( + '"' + processing_info["libraries"][0]["library_kit_method"] + '"' + ) except: env_vars["LIBRARY_KIT"] = None - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: env_vars["PAIRED"] = "True" else: env_vars["PAIRED"] = None - env_vars["FLOWCELL_LANE_ID"] = lane['id'] - env_vars["ALIGNMENT_ID"] = alignment['id'] - env_vars["ALIGN_DIR"] = os.path.join(fastq_directory, align_dir) - env_vars["R1_FASTQ"] = r1_fastq["path"] + env_vars["FLOWCELL_LANE_ID"] = lane["id"] + env_vars["ALIGNMENT_ID"] = alignment["id"] + env_vars["ALIGN_DIR"] = os.path.join(fastq_directory, align_dir) + env_vars["R1_FASTQ"] = r1_fastq["path"] - if processing_info['flowcell']['paired_end']: + if processing_info["flowcell"]["paired_end"]: env_vars["R2_FASTQ"] = r2_fastq["path"] env_vars["FASTQ_DIR"] = fastq_directory - env_vars["FLOWCELL"] = processing_info['flowcell']['label'] + env_vars["FLOWCELL"] = processing_info["flowcell"]["label"] if "barcode1" in lane and lane["barcode1"]: - p7_adapter = lane['barcode1']['adapter7'] - p5_adapter = lane['barcode1']['adapter5'] - if "barcode2" in lane and lane['barcode2']: + p7_adapter = lane["barcode1"]["adapter7"] + p5_adapter = lane["barcode1"]["adapter5"] + if "barcode2" in lane and lane["barcode2"]: # Override the "default" end adapter from barcode1 - p5_adapter = lane['barcode2']['adapter5_reverse_complement'] + p5_adapter = 
lane["barcode2"]["adapter5_reverse_complement"] if not p7_adapter or not p5_adapter: - logging.warn("Alignment %d missing adapters, some processes might not work" % alignment['id']) + logging.warn( + "Alignment %d missing adapters, some processes might not work" + % alignment["id"] + ) env_vars["ADAPTER_P7"] = p7_adapter env_vars["ADAPTER_P5"] = p5_adapter # Process with UMI if the barcode has one and this is a dual index # flowcell - if lane['barcode1']['umi'] and processing_info['flowcell']['dual_index']: + if lane["barcode1"]["umi"] and processing_info["flowcell"]["dual_index"]: env_vars["UMI"] = "True" else: env_vars["UMI"] = None - env_vars["UMI_METHOD"] = lane['barcode1']['umi_method'] + env_vars["UMI_METHOD"] = lane["barcode1"]["umi_method"] # Set process template env var overrides - if 'process_variables' in process_template and process_template['process_variables']: + if ( + "process_variables" in process_template + and process_template["process_variables"] + ): try: - process_template_variables = json.loads(process_template['process_variables'], - object_pairs_hook=OrderedDict) + process_template_variables = json.loads( + process_template["process_variables"], object_pairs_hook=OrderedDict + ) for var, value in process_template_variables.items(): env_vars[var] = value except ValueError as e: - logging.error("Could not parse process variables for align %d (template %d): '%s'" % - ( - alignment['id'], - process_template['id'], - process_template['process_variables'] - )) + logging.error( + "Could not parse process variables for align %d (template %d): '%s'" + % ( + alignment["id"], + process_template["id"], + process_template["process_variables"], + ) + ) return False if self.dry_run: logging.info("Dry run, would have created: %s" % script_file) logging.debug(env_vars) - self.create_sample_config(processing_info, alignment, script_directory, pool_name) + self.create_sample_config( + processing_info, alignment, script_directory, pool_name + ) return True if not os.path.exists(script_directory): @@ -577,10 +710,12 @@ def create_script(self, processing_info, align_id): os.makedirs(script_directory) # Append to master script - self.add_script(align_id, processing_info, script_file, alignment['sample_name']) + self.add_script( + align_id, processing_info, script_file, alignment["sample_name"] + ) # Write file - outfile = open(script_file, 'w') + outfile = open(script_file, "w") outfile.write("set -e -o pipefail\n") # Set env vars @@ -597,38 +732,41 @@ def create_script(self, processing_info, align_id): outfile.close() # Create the config file as well - self.create_sample_config(processing_info, alignment, script_directory, pool_name) + self.create_sample_config( + processing_info, alignment, script_directory, pool_name + ) - def create_sample_config(self, processing_info, alignment, script_directory, pool_name): + def create_sample_config( + self, processing_info, alignment, script_directory, pool_name + ): alignment_id = int(alignment["id"]) logging.debug("Creating sample config for ALN%d", alignment_id) def get_libraries_in_pool(alignment_id): - # Get all lane ids # Go up to the pool then down to the lanes # Note: This is inefficient but probably doesnt matter in practice lanes = [] lanes_with_align = set() - for (lane_id, aln_ids) in LANE_ID_TO_ALN_IDS.items(): + for lane_id, aln_ids in LANE_ID_TO_ALN_IDS.items(): if alignment_id in aln_ids: lanes_with_align.add(lane_id) assert len(lanes_with_align) == 1, "Alignment must have exactly 1 lane" align_lane_id = lanes_with_align.pop() 
libs_with_align = set() - for (lib_id, lane_ids) in LIB_ID_TO_LANE_IDS.items(): + for lib_id, lane_ids in LIB_ID_TO_LANE_IDS.items(): if align_lane_id in lane_ids: libs_with_align.add(lib_id) - #assert len(libs_with_align) == 1, "Lane must have exactly 1 library" + # assert len(libs_with_align) == 1, "Lane must have exactly 1 library" align_lib_id = libs_with_align.pop() pools_with_align = set() - for (pool_key, lib_ids) in POOL_KEY_TO_LIB_IDS.items(): + for pool_key, lib_ids in POOL_KEY_TO_LIB_IDS.items(): if align_lib_id in lib_ids: pools_with_align.add(pool_key) # TODO: This is broken because the pool can be in more than one lane!!! - #assert len(pools_with_align) == 1, "Lib must have exactly one pool" + # assert len(pools_with_align) == 1, "Lib must have exactly one pool" align_poolkey = pools_with_align.pop() logging.debug("Alignment ALN%d - poolkey %s", alignment_id, align_poolkey) @@ -640,6 +778,7 @@ def get_libraries_in_pool(alignment_id): def build_library_info(lib_id, flowcell_label): errors = [] + def add_error(fmt, *args): err_msg = fmt % args errors.append(err_msg) @@ -655,14 +794,22 @@ def add_error(fmt, *args): barcode += bc2 sample_info = self.api_single_result(url=lib_info["sample"]) - project_info = self.api_single_result(url=sample_info["project"]) if sample_info.get("project") else {"name": None} - - taggedobject_infos = self.api_list_result("tagged_object/?object_id=%d&content_type=%d" - % (lib_info["id"], lib_info["object_content_type"])) + project_info = ( + self.api_single_result(url=sample_info["project"]) + if sample_info.get("project") + else {"name": None} + ) + + taggedobject_infos = self.api_list_result( + "tagged_object/?object_id=%d&content_type=%d" + % (lib_info["id"], lib_info["object_content_type"]) + ) cycle = None for taggedobject_info in taggedobject_infos: # TODO: It may be better to check membership in the Insights tag - if taggedobject_info["tag_slug"].startswith("megamap-run-mmap") or taggedobject_info["tag_slug"].startswith("epicapdev-run-ecd"): + if taggedobject_info["tag_slug"].startswith( + "megamap-run-mmap" + ) or taggedobject_info["tag_slug"].startswith("epicapdev-run-ecd"): if cycle is None: tag_slug = str(taggedobject_info["tag_slug"]) match = re.search(r"\d+$", tag_slug) @@ -685,7 +832,6 @@ def build_effector_info(effectortopool): "strand": eff["strand"], "working_name": eff["working_name"], "talen": talen, - "n_terminus": { "name": eff["effector__n_terminus__name"], "nucleotide": eff["effector__n_terminus__nucleotide"], @@ -708,7 +854,7 @@ def build_effector_info(effectortopool): "well": well["label"], } for well in eff["plate_wells"] - ] + ], } pool_info = [] @@ -717,7 +863,9 @@ def build_effector_info(effectortopool): try: tc_info = self.api_single_result(url=sample_info["tissue_culture"]) for effector_pool in tc_info["effector_pools"]: - effector_pool_info = self.api_single_result(url=effector_pool["url"]) + effector_pool_info = self.api_single_result( + url=effector_pool["url"] + ) loci_info = [] if effector_pool_info.get("loci", False): for locus_url in effector_pool_info["loci"]: @@ -729,18 +877,25 @@ def build_effector_info(effectortopool): locus_dict[key] = locus_info.get(key, None) loci_info.append(locus_dict) - pool_info.append({ - "effector_pool": effector_pool_info["object_name"], - "name": effector_pool_info["name"], - "purpose": effector_pool_info["purpose__name"], - "loci": loci_info, - "effectors": [ - build_effector_info(efftopool) - for efftopool in effector_pool_info["effectortopool_set"] - ], - }) + pool_info.append( 
+ { + "effector_pool": effector_pool_info["object_name"], + "name": effector_pool_info["name"], + "purpose": effector_pool_info["purpose__name"], + "loci": loci_info, + "effectors": [ + build_effector_info(efftopool) + for efftopool in effector_pool_info[ + "effectortopool_set" + ] + ], + } + ) except: - add_error("Could not get effector information for sample DS%s", sample_info['number']) + add_error( + "Could not get effector information for sample DS%s", + sample_info["number"], + ) def extract_lenti_from_tc_notes(notes): def match_notes(regex): @@ -748,6 +903,7 @@ def match_notes(regex): if match is None: return None return match.group(1) + # Example notes field below: # Talen Number: TL120935 # Original TALE name: IL2RA-TL52068-Z @@ -786,6 +942,7 @@ def info_to_data(well_info): parent_info = self.api_single_result(url=well_info["parent"]) well_data["well_parent"] = info_to_data(parent_info) return well_data + wells = [] for well in sample_info["plate_wells"]: well_info = self.api_single_result("plate_well/%d/" % well["id"]) @@ -796,7 +953,7 @@ def info_to_data(well_info): def reverse_complement(bc: "Optional[str]") -> "Optional[str]": if bc is None: return None - lookup = {"A":"T", "T":"A", "C":"G", "G":"C"} + lookup = {"A": "T", "T": "A", "C": "G", "G": "C"} return "".join(reversed([lookup[c] for c in bc])) lenti_from_tc = extract_lenti_from_tc_notes(tc_info["notes"]) @@ -831,17 +988,25 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]": tc_well_label = None tc_well_plate = None try: - seq_well_label = lib_plate_wells[0]["well_label"] - seq_well_plate = "PL%d" % lib_plate_wells[0]["plate_id"] + seq_well_label = lib_plate_wells[0]["well_label"] + seq_well_plate = "PL%d" % lib_plate_wells[0]["plate_id"] sample_well_label = lib_plate_wells[0]["well_parent"]["well_label"] - sample_well_plate = "PL%d" % lib_plate_wells[0]["well_parent"]["plate_id"] - tc_well_label = lib_plate_wells[0]["well_parent"]["well_parent"]["well_label"] - tc_well_plate = "PL%d" % lib_plate_wells[0]["well_parent"]["well_parent"]["plate_id"] + sample_well_plate = ( + "PL%d" % lib_plate_wells[0]["well_parent"]["plate_id"] + ) + tc_well_label = lib_plate_wells[0]["well_parent"]["well_parent"][ + "well_label" + ] + tc_well_plate = ( + "PL%d" + % lib_plate_wells[0]["well_parent"]["well_parent"]["plate_id"] + ) except Exception as e: add_error("Could not find well info in %s", lib_plate_wells) def sort_talens(tls): - """ Sort talens by number """ + """Sort talens by number""" + def get_num(tl): match = re.search(r"TL(\d+)", tl) if match: @@ -849,10 +1014,11 @@ def get_num(tl): else: logging.warning("Weird talen: '%s'" % tl) return 0 + return sorted(tls, key=get_num) if pool_info: - #talen_name = None #TODO + # talen_name = None #TODO talen_names = [] for pool in deep_info["effector_pools"]: for effector in pool.get("effectors", []): @@ -908,10 +1074,14 @@ def get_sbl_and_cl(pool_name): try: m = re.match(r"LP(\d+)", pool_name) if not m: - add_error("Pool name '%s' not valid, can't get SBL&CL", pool_name) + add_error( + "Pool name '%s' not valid, can't get SBL&CL", pool_name + ) return (None, None) pool_id = int(m.group(1)) - pool_info = self.api_list_result("library_pool/?number=%d" % pool_id)[0] + pool_info = self.api_list_result( + "library_pool/?number=%d" % pool_id + )[0] if not pool_info.get("sublibrary"): return (None, None) sbl_info = self.api_single_result(url=pool_info.get("sublibrary")) @@ -923,6 +1093,7 @@ def get_sbl_and_cl(pool_name): except Exception as e: add_error("Error finding SBL or CL: 
%s", e) return (sbl, cl) + (sbl_name, cl_name) = get_sbl_and_cl(pool_name) info = { @@ -949,13 +1120,11 @@ def get_sbl_and_cl(pool_name): "TL#_new": lenti_from_tc["talen_number"], "sample_barcode": reverse_complement(bc2), "lenti_qc_passed": True, - "script_errors": errors, "additional_information": deep_info, } return info - flowcell_label = "FC%s" % processing_info["flowcell"]["label"] libraries = [] @@ -964,20 +1133,22 @@ def get_sbl_and_cl(pool_name): data = {"libraries": libraries} if self.dry_run: - logging.info("dry_run, would have written %s/pool_info.json", script_directory) + logging.info( + "dry_run, would have written %s/pool_info.json", script_directory + ) return # do stuff with open("%s/pool_info.json" % script_directory, "w") as out: json.dump(data, out, indent=2, sort_keys=True) - #writer = csv.DictWriter(out, fieldnames=fieldnames, dialect="excel-tab", restval="") - #writer.writeheader() - #for row in rows: - #writer.writerow(row) + # writer = csv.DictWriter(out, fieldnames=fieldnames, dialect="excel-tab", restval="") + # writer.writeheader() + # for row in rows: + # writer.writerow(row) -def main(args = sys.argv): +def main(args=sys.argv): """This is the main body of the program that by default uses the arguments -from the command line.""" + from the command line.""" parser = parser_setup() poptions = parser.parse_args() @@ -1009,17 +1180,17 @@ def main(args = sys.argv): process = ProcessSetUp(poptions, api_url, token) - #process.setup_alignments(poptions.align_ids) + # process.setup_alignments(poptions.align_ids) if poptions.flowcell_label: process.setup_flowcell(poptions.flowcell_label) else: logging.critical("Non-flowcell setup not yet supported") - #if poptions.tag: + # if poptions.tag: # process.setup_tag(poptions.tag) - #if poptions.project: + # if poptions.project: # process.setup_project(poptions.project) diff --git a/scripts/umi/extract_umt.py b/scripts/umi/extract_umt.py index 36858590..9d101e95 100755 --- a/scripts/umi/extract_umt.py +++ b/scripts/umi/extract_umt.py @@ -18,14 +18,14 @@ def parseArgs(): - parser = argparse.ArgumentParser( - description='Annotate read names with UMT') - parser.add_argument('--mismatches', type=int, default=1, - help='number of mismatches') - parser.add_argument('r1_fastq') - parser.add_argument('r2_fastq') - parser.add_argument('out_r1') - parser.add_argument('out_r2') + parser = argparse.ArgumentParser(description="Annotate read names with UMT") + parser.add_argument( + "--mismatches", type=int, default=1, help="number of mismatches" + ) + parser.add_argument("r1_fastq") + parser.add_argument("r2_fastq") + parser.add_argument("out_r1") + parser.add_argument("out_r2") args = parser.parse_args() return args @@ -50,7 +50,7 @@ def mismatch(word, mismatches): # mismatched_stems, stem_lengths, and UMI_LEN def find_stem_len(read): for len in stem_lengths: - if str(read.seq[UMI_LEN:len + UMI_LEN]) in mismatched_stems: + if str(read.seq[UMI_LEN : len + UMI_LEN]) in mismatched_stems: return len return 0 @@ -78,12 +78,12 @@ def attach_umt(r1, r2): # Check for presence of UMT in mate - this indicates a short fragment that needs trimmed # Save stem & UMT for trimming use - stem1 = r1[:UMI_LEN + r1_len] - stem2 = r2[:UMI_LEN + r2_len] + stem1 = r1[: UMI_LEN + r1_len] + stem2 = r2[: UMI_LEN + r2_len] # Trim UMT & stem out of start of read - r1 = r1[UMI_LEN + r1_len:] - r2 = r2[UMI_LEN + r2_len:] + r1 = r1[UMI_LEN + r1_len :] + r2 = r2[UMI_LEN + r2_len :] # Trim ends, if necessary rev_stem1 = str(stem1.seq.reverse_complement()) @@ -98,11 
+98,14 @@ def attach_umt(r1, r2): r2 = r2[:-x2] return (r1, r2) # Check specifically for "we didn't trim off one base of adapter sequence" - x1 = UMI_LEN+r2_len - x2 = UMI_LEN+r1_len - if str_r1[-x1-1:-1] == rev_stem2[:x1] and str_r2[-x2-1:-1] == rev_stem1[:x2]: - r1 = r1[:-x1-1] - r2 = r2[:-x2-1] + x1 = UMI_LEN + r2_len + x2 = UMI_LEN + r1_len + if ( + str_r1[-x1 - 1 : -1] == rev_stem2[:x1] + and str_r2[-x2 - 1 : -1] == rev_stem1[:x2] + ): + r1 = r1[: -x1 - 1] + r2 = r2[: -x2 - 1] return (r1, r2) @@ -119,11 +122,9 @@ def main(argv): logging.basicConfig(level=logging.WARN, format=log_format) setup_mismatches(args.mismatches) - with open(args.r1_fastq) as r1_in, \ - open(args.r2_fastq) as r2_in, \ - open(args.out_r1, 'wt') as r1_out, \ - open(args.out_r2, 'wt') as r2_out: - + with open(args.r1_fastq) as r1_in, open(args.r2_fastq) as r2_in, open( + args.out_r1, "wt" + ) as r1_out, open(args.out_r2, "wt") as r2_out: r1_seqIO = SeqIO.parse(r1_in, "fastq") r2_seqIO = SeqIO.parse(r2_in, "fastq") try: @@ -136,5 +137,6 @@ def main(argv): except StopIteration: logging.info("EOF reached") + if __name__ == "__main__": main(sys.argv) diff --git a/scripts/umi/fastq_umi_add.py b/scripts/umi/fastq_umi_add.py index c6f34afc..413a15f2 100644 --- a/scripts/umi/fastq_umi_add.py +++ b/scripts/umi/fastq_umi_add.py @@ -3,32 +3,31 @@ import sys import gzip -def transform_line(line): +def transform_line(line): line = line.strip() if line.startswith("@"): - - insert_loc = line.find(' ') - umi_loc = line.find('+') + 1 + insert_loc = line.find(" ") + umi_loc = line.find("+") + 1 umi = line[umi_loc:] if umi_loc else "" line = "%s#%s%s" % (line[:insert_loc], umi, line[insert_loc:]) return line -def transform_file(infilename, outfilename): - outfile = gzip.open(outfilename, 'wt') - infile = gzip.open(infilename, 'rt') +def transform_file(infilename, outfilename): + outfile = gzip.open(outfilename, "wt") + infile = gzip.open(infilename, "rt") for line in infile: - outfile.write( transform_line(line) ) - outfile.write('\n') + outfile.write(transform_line(line)) + outfile.write("\n") outfile.close() infile.close() -if __name__ == "__main__": +if __name__ == "__main__": transform_file(sys.argv[1], sys.argv[2]) diff --git a/scripts/utility/lorentz.py b/scripts/utility/lorentz.py index cdb6ddf1..a87fccae 100755 --- a/scripts/utility/lorentz.py +++ b/scripts/utility/lorentz.py @@ -24,8 +24,8 @@ def lorentz(seen): width += w height += delta_h if delta_h > slope and robinhood_pos == 0: - robinhood_pos = (width-1) / total_width - robinhood_idx = (slope*(width-1) - height) / total_height + robinhood_pos = (width - 1) / total_width + robinhood_idx = (slope * (width - 1) - height) / total_height area += w * (height - delta_h / 2.0) ideal = total_width * total_height / 2.0 @@ -46,8 +46,9 @@ def main(): seen[int(count)] = int(times) # Print results - for (key, value) in sorted(lorentz(seen).items()): + for key, value in sorted(lorentz(seen).items()): print("%s\t%.4f" % (key, value)) + if __name__ == "__main__": main() diff --git a/scripts/utility/md5check.py b/scripts/utility/md5check.py index fde28cce..b2fc6b9e 100644 --- a/scripts/utility/md5check.py +++ b/scripts/utility/md5check.py @@ -5,26 +5,34 @@ Check that each file matches the needed MD5SUM """ + import logging import subprocess import sys infile = sys.argv[1] + def check_md5sum(filename, md5sum): logging.debug("Checking file %s matches %s" % (filename, md5sum)) - current_md5sum = subprocess.check_output(["md5sum", filename], stderr=subprocess.STDOUT, 
universal_newlines=True).split()[0] + current_md5sum = subprocess.check_output( + ["md5sum", filename], stderr=subprocess.STDOUT, universal_newlines=True + ).split()[0] match = md5sum == current_md5sum if not match: - logging.error("md5sum for file %s does not match: %s recorded, %s as exists" % (filename, md5sum, current_md5sum)) + logging.error( + "md5sum for file %s does not match: %s recorded, %s as exists" + % (filename, md5sum, current_md5sum) + ) return match + MATCHING = True -with open(infile, 'r') as filelist: +with open(infile, "r") as filelist: for line in filelist: filename, md5sum = line.strip().split() MATCHING = check_md5sum(filename, md5sum) and MATCHING diff --git a/scripts/utility/movesymlinks.py b/scripts/utility/movesymlinks.py index f1311f4b..54010b54 100644 --- a/scripts/utility/movesymlinks.py +++ b/scripts/utility/movesymlinks.py @@ -13,24 +13,46 @@ def parser_setup(): parser = argparse.ArgumentParser() - parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", - help="Don't print info messages to standard out.") - parser.add_argument("-d", "--debug", dest="debug", action="store_true", - help="Print all debug messages to standard out.") - - parser.add_argument('--fromdir', default=os.getcwd(), - help="The directory we're changing all the symlinks in, defaults to current working directory") - parser.add_argument('--olddir', required=True, - help="The old directory we don't want symlinks going to") - parser.add_argument('--newdir', required=True, - help="The new base directory the symlinks should go to") - parser.add_argument('--report', help="The outfile to write down all symlinks", default=None) - parser.add_argument("--move", dest="move", action="store_true", - help="Actually perform the move") + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + help="Don't print info messages to standard out.", + ) + parser.add_argument( + "-d", + "--debug", + dest="debug", + action="store_true", + help="Print all debug messages to standard out.", + ) + + parser.add_argument( + "--fromdir", + default=os.getcwd(), + help="The directory we're changing all the symlinks in, defaults to current working directory", + ) + parser.add_argument( + "--olddir", + required=True, + help="The old directory we don't want symlinks going to", + ) + parser.add_argument( + "--newdir", + required=True, + help="The new base directory the symlinks should go to", + ) + parser.add_argument( + "--report", help="The outfile to write down all symlinks", default=None + ) + parser.add_argument( + "--move", dest="move", action="store_true", help="Actually perform the move" + ) return parser + class SymlinkMover(object): - def __init__(self, fromdir, olddir, newdir, move=False, report=None): self.fromdir = fromdir self.olddir = olddir @@ -44,8 +66,8 @@ def detect(self, path): path = path.rstrip("/") if not os.path.islink(path): - logging.debug("%s not a symlink" % path) - return + logging.debug("%s not a symlink" % path) + return target_path = os.readlink(path) broken = False @@ -64,13 +86,19 @@ def detect(self, path): self.brokenlinks.append(path) if self.report: - self.report.write("%s\t%s\t%s\t%s\n" % (path, target_path, target_path_absolute, str(broken))) + self.report.write( + "%s\t%s\t%s\t%s\n" + % (path, target_path, target_path_absolute, str(broken)) + ) def move_link(self, linkpath): old_target_path = os.readlink(linkpath) new_target_path = old_target_path.replace(self.olddir, self.newdir) - logging.info("Moving %s pointer from %s to %s" % (linkpath, 
old_target_path, new_target_path)) + logging.info( + "Moving %s pointer from %s to %s" + % (linkpath, old_target_path, new_target_path) + ) try: if self.domove: os.unlink(linkpath) @@ -80,7 +108,7 @@ def move_link(self, linkpath): def walk(self, directory): for root, dirs, files in os.walk(directory): - if root.startswith('./.git'): + if root.startswith("./.git"): # Ignore the .git directory. continue logging.debug("walking through directories for %s" % root) @@ -89,9 +117,8 @@ def walk(self, directory): [self.detect(os.path.join(root, filename)) for filename in files] def run(self, report=None): - if report: - self.report = open(report, 'w') + self.report = open(report, "w") else: self.report = None @@ -110,8 +137,8 @@ def run(self, report=None): if self.report: self.report.close() -def main(args=sys.argv): +def main(args=sys.argv): parser = parser_setup() args = parser.parse_args() @@ -126,5 +153,6 @@ def main(args=sys.argv): mover = SymlinkMover(args.fromdir, args.olddir, args.newdir, args.move) mover.run(report=args.report) + if __name__ == "__main__": main() diff --git a/scripts/utility/picard_inserts_process.py b/scripts/utility/picard_inserts_process.py index 7fa871bf..49687c21 100644 --- a/scripts/utility/picard_inserts_process.py +++ b/scripts/utility/picard_inserts_process.py @@ -4,6 +4,7 @@ from datetime import datetime from sklearn.preprocessing import normalize + # returns the convolutional or statistical autocorrelation of the passed histogram # unused/untested currently def get_autocorr(hist, mode="conv"): @@ -14,14 +15,16 @@ def get_autocorr(hist, mode="conv"): x[int(k)] = v if mode == "conv": - result = numpy.correlate(x, x, mode='same') + result = numpy.correlate(x, x, mode="same") if mode == "stat": - result = numpy.array([1]+[numpy.corrcoef(x[:-i], x[i:])[0,1] \ - for i in range(1, len(x))]) + result = numpy.array( + [1] + [numpy.corrcoef(x[:-i], x[i:])[0, 1] for i in range(1, len(x))] + ) return result + # accepts a histogram as a dictionary of index-value pairs and returns it as an array # fills in all unseen values as 0 def get_arr(hist): @@ -30,6 +33,7 @@ def get_arr(hist): x[int(k)] = v return x + # returns a normalized a histogram # mode=max uses the maximum value (highest peak) as the normalization factor # mode=sum uses the sum of all values as the normalization factor @@ -38,49 +42,53 @@ def normalize_hist(hist, mode="max"): if mode not in ["max", "sum"]: raise ValueError("'mode' argument must be 'max' or 'sum'") if mode == "max": - result = hist.astype(float)/float(max(hist)) + result = hist.astype(float) / float(max(hist)) if mode == "sum": - result = hist.astype(float)/float(sum(hist)) + result = hist.astype(float) / float(sum(hist)) return result + # returns the normalized diff of a copy of a passed histogram # unused/untested currently -def diff_norm(df, idx, hist_name="agg_hist", mode="max", r=(50,140)): +def diff_norm(df, idx, hist_name="agg_hist", mode="max", r=(50, 140)): try: - c = copy(get_arr(df[hist_name].iloc[idx])[r[0]:r[1]]) + c = copy(get_arr(df[hist_name].iloc[idx])[r[0] : r[1]]) except: - c = copy(df[hist_name].iloc[idx][r[0]:r[1]]) + c = copy(df[hist_name].iloc[idx][r[0] : r[1]]) return numpy.diff(normalize_hist(c, mode=mode)) + # accepts a histogram and returns a sorted array of the n highest peaks in a given range # unused/untested currently -def get_peak_loc(hist, r=(100,106), npeaks=1): +def get_peak_loc(hist, r=(100, 106), npeaks=1): peaks = [] try: - n_highest = sorted(hist[r[0]:r[1]], reverse=True)[:npeaks] + n_highest = 
sorted(hist[r[0] : r[1]], reverse=True)[:npeaks] for n in n_highest: peaks.append(hist.index(n)) except: peaks = numpy.nan return peaks -def get_large_small_ratio(hist,r=(60,120)): + +def get_large_small_ratio(hist, r=(60, 120)): ratio = numpy.nan hist = get_arr(hist) try: - rt = (hist[r[1]])/(hist[r[0]]) + rt = (hist[r[1]]) / (hist[r[0]]) ratio = rt except: ratio = numpy.nan return ratio + # calculate: # pa = peak at x # pv = fft value at pa # vat = value at idx 11 # adp = mean-adjusted peak at x # adv = mean-adjusted value at idx 11 -def f_transform(hist,position=11): +def f_transform(hist, position=11): pa = numpy.nan pv = numpy.nan vat = numpy.nan @@ -89,7 +97,7 @@ def f_transform(hist,position=11): try: norm_hist = normalize_hist(get_arr(hist), mode="max") mx = list(norm_hist).index(max(norm_hist)) - norm_hist = norm_hist[mx:mx+105] + norm_hist = norm_hist[mx : mx + 105] ft = numpy.fft.rfft(numpy.diff(norm_hist)) m = numpy.argmax(ft) adj = numpy.mean(ft) @@ -106,20 +114,21 @@ def f_transform(hist,position=11): adv = numpy.nan return adv + def read_hist(file): d = {} with open(file) as f: for line in f: line = line.rstrip() try: - (key,val) = line.split() + (key, val) = line.split() d[int(key)] = int(val) except: next return d -def main(): +def main(): inputfile = sys.argv[1] hist = read_hist(inputfile) fourier_transform_eleven = f_transform(hist) @@ -128,6 +137,7 @@ def main(): print("insert-ls-ratio\t%.4f" % (large_small_ratio)) print("insert-ft-eleven\t%.4f" % (fourier_transform_eleven.real)) + # This is the main body of the program that only runs when running this script # doesn't run when imported, so you can use the functions above in the shell after importing # without automatically running it diff --git a/scripts/versions.py b/scripts/versions.py index eb9d61ec..f62520dd 100644 --- a/scripts/versions.py +++ b/scripts/versions.py @@ -5,7 +5,8 @@ import pip installed_packages = pip.get_installed_distributions() -installed_packages_list = sorted(["%s==%s" % (i.key, i.version) - for i in installed_packages]) +installed_packages_list = sorted( + ["%s==%s" % (i.key, i.version) for i in installed_packages] +) print(installed_packages_list) From a42333fa949fafafe8e6c70a304aa57b2e25962d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 10:20:37 -0700 Subject: [PATCH 149/172] style: Fix lint errors from ruff --- processes/altcode/bin/summarize_stats.py | 4 +- processes/altseq/bin/analyze.py | 1 - processes/altseq/bin/generate_counts_json.py | 1 - processes/bwa/aggregate/plot_footprints.py | 3 +- pyproject.toml | 51 ++++++++++++------- scripts/aggregatecollate.py | 19 +++---- scripts/aggregateprocess.py | 17 ++++--- scripts/alignprocess.py | 4 +- scripts/altcode/upload_fastq.py | 12 ++--- scripts/altcode/upload_stats.py | 9 ++-- scripts/altseq/upload_data.py | 8 +-- scripts/apilaneprocess.py | 10 ++-- scripts/bam/mark_dups.py | 14 ++--- scripts/bam/random_reads.py | 1 - .../browser/make_trackhubs_for_flowcell.py | 27 +++++----- .../browser/make_trackhubs_for_projects.py | 13 +++-- .../make_browser_load.py | 29 +++++------ scripts/browser/parse_all_projects.py | 10 ++-- scripts/bwa/aggregate/basic/sparse_motifs.py | 3 +- scripts/bwa/bamcounts.py | 5 +- scripts/bwa/filter_reads.py | 16 +++--- scripts/bwa/fix_bam_pairing.py | 7 ++- scripts/cluster/monitor_alignments.py | 11 ++-- scripts/copy_notify.py | 9 +++- scripts/create_processing.py | 16 +++--- scripts/flowcells/barcode_check.py | 6 +-- .../barcode_count_from_stats_file.py | 1 - scripts/flowcells/barcode_report.py | 41 
++++++++------- scripts/flowcells/demux_fastq.py | 2 - scripts/flowcells/link_rapidrun.py | 7 +-- scripts/flowcells/make_samplesheets.py | 4 -- scripts/flowcells/max_mismatch.py | 7 +-- scripts/helpers/expand_multiple_alignments.py | 6 +-- scripts/laneprocess.py | 3 +- scripts/lims/aggregation/get_files.py | 9 ++-- scripts/lims/alignment/get_files.py | 11 ++-- scripts/lims/create_altseq_sample_config.py | 3 -- scripts/lims/get_processing.py | 10 ++-- scripts/lims/movetag.py | 13 +++-- scripts/lims/upload_aggregation_stats.py | 20 ++------ scripts/lims/upload_data.py | 13 +++-- scripts/poolprocess.py | 17 +++---- scripts/umi/extract_umt.py | 3 +- scripts/utility/movesymlinks.py | 5 +- scripts/utility/picard_inserts_process.py | 27 ++++------ 45 files changed, 242 insertions(+), 266 deletions(-) diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index 64ffa9e5..28dfc550 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -6,8 +6,6 @@ import logging import math import os -import pathlib -import pprint import re from collections import defaultdict @@ -37,7 +35,7 @@ def parse_cellreads(filename): for k, v in row.items(): try: row[k] = int(v) - except: + except Exception: pass data.append(row) return data diff --git a/processes/altseq/bin/analyze.py b/processes/altseq/bin/analyze.py index f585c01b..e1ca590b 100755 --- a/processes/altseq/bin/analyze.py +++ b/processes/altseq/bin/analyze.py @@ -4,7 +4,6 @@ import csv import os import pathlib -import pprint def parse_args(): diff --git a/processes/altseq/bin/generate_counts_json.py b/processes/altseq/bin/generate_counts_json.py index 7b092470..5952bebf 100755 --- a/processes/altseq/bin/generate_counts_json.py +++ b/processes/altseq/bin/generate_counts_json.py @@ -3,7 +3,6 @@ import argparse import csv import os -import pathlib # import pprint import json diff --git a/processes/bwa/aggregate/plot_footprints.py b/processes/bwa/aggregate/plot_footprints.py index 45cb598c..09622358 100755 --- a/processes/bwa/aggregate/plot_footprints.py +++ b/processes/bwa/aggregate/plot_footprints.py @@ -4,11 +4,10 @@ # In[2]: -import sys, os +import sys import numpy as np import scipy import scipy.stats -from scipy import optimize from footprint_tools.modeling import dispersion diff --git a/pyproject.toml b/pyproject.toml index 254a3f88..1479e977 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,37 @@ -[tool.pyright] -include = ["scripts"] -exclude = ["**/__pycache__"] -#ignore = ["src/oldstuff"] -defineConstant = { STAMPIPES = "/home/nelsonjs/code/stampipes" } -#stubPath = "src/stubs" -#venv = "env367" +[project] +name = "stampipes" +version = "0.1.0" +description = "Add your description here" +authors = [ + { name = "Jemma Nelson", email = "nelsonjs@altiusinstitute.org" }, + { name = "Audra Johnson", email = "audrakj@altiusinstitute.org" }, + { name = "Richard Sandstrom", email = "rsandstrom@altiusinstitute.org" }, +] +dependencies = [] +readme = "README.md" +requires-python = ">= 3.5" -reportMissingImports = true -reportMissingTypeStubs = false +[tool.ruff] +# py37 is the lowest version +target-version = "py37" -pythonVersion = "3.6" -pythonPlatform = "Linux" +[tool.ruff.lint] +ignore = [] -executionEnvironments = [ - #{ root = "src/web", pythonVersion = "3.5", pythonPlatform = "Windows", extraPaths = [ "src/service_libs" ] }, - #{ root = "src/sdk", pythonVersion = "3.0", extraPaths = [ "src/backend" ] }, - #{ root = "src/tests", extraPaths = 
["src/tests/e2e", "src/sdk" ]}, - #{ root = "src" } -] + +#[build-system] +#requires = ["hatchling"] +#build-backend = "hatchling.build" +# +#[tool.rye] +#managed = true +#dev-dependencies = [ +# "ruff" +#] +# +#[tool.hatch.metadata] +#allow-direct-references = true +# +#[tool.hatch.build.targets.wheel] +#packages = ["src/stampipes"] +# diff --git a/scripts/aggregatecollate.py b/scripts/aggregatecollate.py index fa77ff39..bad9c687 100644 --- a/scripts/aggregatecollate.py +++ b/scripts/aggregatecollate.py @@ -1,16 +1,12 @@ -import json import os import sys import argparse import logging -import requests -import subprocess sys.path.append("/home/audrakj/stamlims_api") print(sys.path) - -from stamlims_api import rest -from stamlims_api.lims import files +from stamlims_api import rest # noqa: E402 +from stamlims_api.lims import files # noqa: E402 log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -209,8 +205,13 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): if aggregation_lane["include"]: included = aggregation_lane break - - lane = self.api.single_result(url=aggregation_lane["lane"]) + if included is None: + logging.error( + "No included aggregation lanes for aggregation %s", aggregation_id + ) + lane = None + else: + lane = self.api.single_result(url=included["lane"]) if not lane: logging.critical( @@ -301,7 +302,7 @@ def setup_aggregation(self, aggregation_id): % (lane_id, aggregation_id) ) - alignment_id = int(alignment_endpoint.strip("/").split("/")[-1]) + # alignment_id = int(alignment_endpoint.strip("/").split("/")[-1]) r1_fastq = self.get_lane_fastq_file(aggregation_id, lane_id, "r1-fastq") r2_fastq = self.get_lane_fastq_file(aggregation_id, lane_id, "r2-fastq") diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index f2bb1fe1..af62b9ae 100644 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -374,7 +374,7 @@ def get_genome_index_location(self, aggregation_id, aggregation_lanes): included = aggregation_lane break - if not "alignment" in aggregation_lane or not aggregation_lane["alignment"]: + if "alignment" not in aggregation_lane or not aggregation_lane["alignment"]: logging.critical( "No alignment set for included aggregation lane %s" % str(aggregation_lane) @@ -438,8 +438,13 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): if aggregation_lane["include"]: included = aggregation_lane break - - lane = self.api_single_result(url=aggregation_lane["lane"]) + if included is None: + logging.error( + "No included aggregation lanes for aggregation %s", aggregation_id + ) + lane = None + else: + lane = self.api.single_result(url=included["lane"]) if not lane: logging.critical( @@ -484,7 +489,7 @@ def get_category_for_assay(self, assay_url): assay_info = self.api_single_result(url=assay_url) category_url = assay_info["category"] if category_url is None: - logging.warn("Assay %s has no category" % (assay_name)) + logging.warn("Assay %s has no category" % (assay_info)) return None category_info = self.api_single_result(url=category_url) return category_info["slug"] @@ -663,7 +668,7 @@ def setup_aggregation(self, aggregation_id): ) for var, value in process_template_variables.items(): env_vars[var] = value - except ValueError as e: + except ValueError: logging.error( "Could not parse process variables for aggregation %d (template %d): '%s'" % ( @@ -686,7 +691,7 @@ def setup_aggregation(self, aggregation_id): try: os.makedirs(aggregation_folder) - except: + except Exception: pass file_record = 
open("%s/bamfiles.txt" % aggregation_folder, "w") diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 1ec5cb24..90ba5f91 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -388,7 +388,7 @@ def create_script(self, processing_info, align_id): lane = processing_info["libraries"][0] alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] - if not "process_template" in alignment: + if "process_template" not in alignment: logging.error("Alignment %d has no process template" % align_id) return False @@ -530,7 +530,7 @@ def create_script(self, processing_info, align_id): ) for var, value in process_template_variables.items(): env_vars[var] = value - except ValueError as e: + except ValueError: logging.error( "Could not parse process variables for align %d (template %d): '%s'" % ( diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index 40f06cfe..e2fde5a2 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -3,18 +3,14 @@ Uploads alt-code fastq files to LIMS """ -import pprint import re -import csv import argparse import datetime import hashlib -import json import logging import os import sys from functools import lru_cache -from collections import defaultdict # Make sure we can load our vendored stamlims_api dependency sys.path.insert( @@ -502,11 +498,11 @@ def extract_id_from_url(url): lane_info = self.get_list_result(lanes_query) lanes_in_pool = set() lanes_in_pool.add(int(lane_id)) - for l in lane_info: - if l.get("library"): - library_id = extract_id_from_url(l["library"]) + for lane in lane_info: + if lane.get("library"): + library_id = extract_id_from_url(lane["library"]) if library_id in lib_ids: - lanes_in_pool.add(l["id"]) + lanes_in_pool.add(lane["id"]) return list(lanes_in_pool) # def upload_flowcell_report(self, data): diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py index b1a58a44..d7042d49 100644 --- a/scripts/altcode/upload_stats.py +++ b/scripts/altcode/upload_stats.py @@ -3,9 +3,7 @@ Uploads all the results of alt-code processing to LIMS """ -import pprint import re -import csv import argparse import datetime import hashlib @@ -541,9 +539,10 @@ def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): LOG.debug("idx=%s, lane=%d, name=%s", idx, lane, name) # Get lane IDs for each file lane_ids = [ - l["id"] - for l in processing_dict["libraries"] - if l["barcode1"]["reverse_sequence"] == idx and int(l["lane"]) == lane + lane["id"] + for lane in processing_dict["libraries"] + if lane["barcode1"]["reverse_sequence"] == idx + and int(lane["lane"]) == lane ] r1_file = os.path.join(outdir, name, "R1.fq.gz") r2_file = os.path.join(outdir, name, "R2.fq.gz") diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index eb2bbfec..ccaf448d 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -3,7 +3,6 @@ Uploads all the results of alt-seq processing to LIMS """ -import pprint import re import csv import argparse @@ -536,9 +535,10 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): LOG.debug("idx=%s, lane=%d, name=%s", idx, lane, name) # Get lane IDs for each file lane_ids = [ - l["id"] - for l in processing_dict["libraries"] - if l["barcode1"]["reverse_sequence"] == idx and int(l["lane"]) == lane + lib["id"] + for lib in processing_dict["libraries"] + if lib["barcode1"]["reverse_sequence"] == idx + and int(lib["lane"]) == lane ] r1_file = os.path.join(outdir, name, 
"R1.fq.gz") r2_file = os.path.join(outdir, name, "R2.fq.gz") diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index 2c15e376..cd5a540d 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -1,10 +1,8 @@ -import json import os import sys import argparse import logging import re -import requests import collections try: @@ -12,7 +10,7 @@ except ImportError: from futures import ThreadPoolExecutor -from stamlims_api import rest +from stamlims_api import rest, lims log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -170,7 +168,7 @@ def setup_flowcell(self, flowcell_label): self.setup_lanes([lane["id"] for lane in lanes]) def setup_tag(self, tag_slug): - flowcelllane_contenttype = content_types.contenttype_from_model_name( + flowcelllane_contenttype = lims.content_types.contenttype_from_model_name( self.api, model_name="FlowcellLane" ) lane_tags = self.api.get_list_result( @@ -235,7 +233,7 @@ def setup_lane(self, lane_id): logging.debug("Lane %d is pool %s", lib_number, pool_name) else: logging.debug("Lane %d is not pool", lib_number) - except: + except Exception: pass global POOL_INFO @@ -291,7 +289,7 @@ def get_script_template(self): def create_script(self, processing_info, pool=None): lane = processing_info["libraries"][0] - if not "directory" in lane: + if "directory" not in lane: logging.critical("No directory for lane %d" % lane["id"]) return False fastq_directory = lane["directory"] diff --git a/scripts/bam/mark_dups.py b/scripts/bam/mark_dups.py index 578dcd53..acd8bd7f 100755 --- a/scripts/bam/mark_dups.py +++ b/scripts/bam/mark_dups.py @@ -54,7 +54,7 @@ def find_dups(reads): key = r.template_length lists[key].append(r) - return [sorted(l, key=sortQuality) for l in lists.values()] + return [sorted(sublist, key=sortQuality) for sublist in lists.values()] # Sets a read's duplicate flag, returns it @@ -90,19 +90,19 @@ def process_chunk(self, chunk): read_sets = find_dups(new_reads) - for l in read_sets: + for read_set in read_sets: if self.histo is not None: - self.read_histo[len(l)] += 1 + self.read_histo[len(read_set)] += 1 # Mark all but the highest-quality read as duplicates - l[0] = set_dup(l[0], False) - self.pair_map[l[0].query_name] = False - for r in l[1:]: + read_set[0] = set_dup(read_set[0], False) + self.pair_map[read_set[0].query_name] = False + for r in read_set[1:]: r = set_dup(r, True) self.pair_map[r.query_name] = True # Print the read set if self.output is not None: - for r in l: + for r in read_set: self.output.write(r) # Print out already seen reads diff --git a/scripts/bam/random_reads.py b/scripts/bam/random_reads.py index 1d02c038..d67fe777 100644 --- a/scripts/bam/random_reads.py +++ b/scripts/bam/random_reads.py @@ -3,7 +3,6 @@ def main(): import random import pysam import shutil - import datetime parser = argparse.ArgumentParser() parser.add_argument("infile") diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index 7d5c40fe..bb48ce77 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -6,7 +6,6 @@ import argparse import logging import re -import copy import requests log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -32,7 +31,7 @@ def foldercheck(*args): try: os.mkdir(folder) util_log.info("Created folder: %s" % folder) - except OSError as x: + except OSError: util_log.error("ERROR: Could not create directory: %s" % folder) util_log.warn( "Please make sure all 
nonexistant parent directories have been created." @@ -261,14 +260,14 @@ def prepare_tracks(self): for lane in self.data: logging.debug("preparing tracks for lane: " + str(lane)) - if not "hgdb" in lane: + if "hgdb" not in lane: logging.error("Not using lane %s: no hgdb value" % lane) continue if lane["Index"] == "": lane["Index"] = "NoIndex" - if not lane["hgdb"] in self.subtrack_sets: + if lane["hgdb"] not in self.subtrack_sets: self.subtrack_sets[lane["hgdb"]] = [] if lane["aligner"] == "bwa": @@ -464,7 +463,7 @@ def create_ra(self, hgdb): samples = dict() for subtrack in subtracks: - if not subtrack["SampleID"] in subtrack: + if subtrack["SampleID"] not in subtrack: samples[subtrack["SampleID"]] = "%s %s %s %s" % ( subtrack["SampleID"], subtrack["CellType"], @@ -501,15 +500,15 @@ def create_ra(self, hgdb): ra.write("\tvisibility hide\n\n") for subtrack in subtracks: - if not "wellmapping-no-mito" in subtrack: + if "wellmapping-no-mito" not in subtrack: logging.warn( "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] ) subtrack["wellmapping-no-mito"] = "N/A" - if not "wellmapping" in subtrack: + if "wellmapping" not in subtrack: logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" - if not "SPOT" in subtrack: + if "SPOT" not in subtrack: logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) subtrack["SPOT"] = "N/A" @@ -635,7 +634,7 @@ def get(self, query): return self.get_by_url("%s/%s" % (self.api_url, query)) def get_by_url(self, url): - if not url in self.cache: + if url not in self.cache: self.cache[url] = requests.get( url, headers={"Authorization": "Token %s" % self.api_token} ).json() @@ -765,7 +764,7 @@ def main(args=sys.argv): data = json.loads(open(poptions.process_config, "r").read()) trackhubconfig = poptions.trackhub_config - projects = [d["code_name"] for d in data["projects"]] + # projects = [d["code_name"] for d in data["projects"]] # get basedir basedir = data["alignment_group"]["directory"] @@ -789,12 +788,12 @@ def main(args=sys.argv): load_groups = dict() # find projects - for l in data["libraries"]: - for a in l["alignments"]: - align_data = get_alignment_data(l, a, lims) + for lib in data["libraries"]: + for a in lib["alignments"]: + align_data = get_alignment_data(lib, a, lims) if not align_data["failed_lane"]: p = align_data["project"] - if not p in load_groups: + if p not in load_groups: load_groups[p] = [] load_groups[p].append(align_data) diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index cc8591a3..92fedeb5 100644 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -6,7 +6,6 @@ import argparse import logging import re -import copy import requests import datetime @@ -22,7 +21,7 @@ def foldercheck(*args): try: os.mkdir(folder) util_log.info("Created folder: %s" % folder) - except OSError as x: + except OSError: util_log.error("ERROR: Could not create directory: %s" % folder) util_log.warn( "Please make sure all nonexistant parent directories have been created." 
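# Annotation (not part of the patch): the hunk below is one of many fixes in this
# commit that rewrite comparisons written as `== True` / `== None` into a plain
# truthiness test and an `is None` identity test (the usual pycodestyle/ruff rules
# E712 and E711). A minimal sketch, with a hypothetical `agg` record standing in
# for the aggregation dicts used here:
#
#     agg = {"needs_reprocessing": False, "processing_completed": None}
#     if agg["needs_reprocessing"]:               # instead of `== True` (E712)
#         pass
#     if agg["processing_completed"] is None:     # instead of `== None` (E711)
#         pass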
@@ -201,9 +200,9 @@ def prepare_tracks(self): for agg in self.data: # skip aggregations that are not completed - if agg["needs_reprocessing"] == True: + if agg["needs_reprocessing"]: continue - if agg["processing_completed"] == None: + if agg["processing_completed"] is None: continue tracks = {} @@ -280,7 +279,7 @@ def prepare_tracks(self): "Unknown template type, %s, for %s" % (agg["aggregation_process_template_id"], agg["id"]) ) - if not tracks["agg_genome"] in self.all_tracks: + if tracks["agg_genome"] not in self.all_tracks: self.all_tracks[tracks["agg_genome"]] = [] self.all_tracks[tracks["agg_genome"]].append(tracks) @@ -445,8 +444,8 @@ def main(args=sys.argv): poptions.priority, poptions.projectname, date, - poptions.base_api_url, - poptions.token, + api_url, + token, ) hubwriter.load() diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index 61594072..1bbbe36d 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -6,7 +6,6 @@ import argparse import logging import re -import copy import requests log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -32,7 +31,7 @@ def foldercheck(*args): try: os.mkdir(folder) util_log.info("Created folder: %s" % folder) - except OSError as x: + except OSError: util_log.error("ERROR: Could not create directory: %s" % folder) util_log.warn( "Please make sure all nonexistant parent directories have been created." @@ -230,14 +229,14 @@ def prepare_tracks(self): for lane in self.data: logging.debug("preparing tracks for lane: " + str(lane)) - if not "hgdb" in lane: + if "hgdb" not in lane: logging.error("Not using lane %s: no hgdb value" % lane) continue if lane["Index"] == "": lane["Index"] = "NoIndex" - if not lane["hgdb"] in self.subtrack_sets: + if lane["hgdb"] not in self.subtrack_sets: self.subtrack_sets[lane["hgdb"]] = [] if lane["aligner"] == "bwa": @@ -509,7 +508,7 @@ def create_subtrack_commands(self, subtrack, commandsout): # values (\"/usr/local/UW/bam-links/Rudensky/Rudensky_bams/$data.bam\");'" def create_genome_commands(self, hgdb, commandsout): - if not hgdb in self.genome_organisms: + if hgdb not in self.genome_organisms: logging.error(hgdb + " not in " + str(self.genome_organisms)) commandsout.write("\n ERROR: no " + hgdb + " genome\n") return @@ -575,7 +574,7 @@ def create_ra(self, hgdb): samples = dict() for subtrack in subtracks: - if not subtrack["SampleID"] in subtrack: + if subtrack["SampleID"] not in subtrack: samples[subtrack["SampleID"]] = "%s %s %s %s" % ( subtrack["SampleID"], subtrack["CellType"], @@ -612,15 +611,15 @@ def create_ra(self, hgdb): ra.write("\tvisibility hide\n\n") for subtrack in subtracks: - if not "wellmapping-no-mito" in subtrack: + if "wellmapping-no-mito" not in subtrack: logging.warn( "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] ) subtrack["wellmapping-no-mito"] = "N/A" - if not "wellmapping" in subtrack: + if "wellmapping" not in subtrack: logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" - if not "SPOT" in subtrack: + if "SPOT" not in subtrack: logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) subtrack["SPOT"] = "N/A" @@ -748,7 +747,7 @@ def get(self, query): return self.get_by_url("%s/%s" % (self.api_url, query)) def get_by_url(self, url): - if not url in self.cache: + if url not in self.cache: # print url self.cache[url] = 
requests.get( url, headers={"Authorization": "Token %s" % self.api_token} @@ -880,7 +879,7 @@ def main(args=sys.argv): data = json.loads(open(poptions.process_config, "r").read()) - projects = [d["code_name"] for d in data["projects"]] + # projects = [d["code_name"] for d in data["projects"]] # get basedir basedir = data["alignment_group"]["directory"] @@ -894,15 +893,15 @@ def main(args=sys.argv): browsers = set() load_groups = dict() - for l in data["libraries"]: - for a in l["alignments"]: + for lib in data["libraries"]: + for a in lib["alignments"]: if a["browsers"]: # Only process alignments that map to a browser - align_data = get_alignment_data(l, a, lims) + align_data = get_alignment_data(lib, a, lims) if not align_data["failed_lane"]: for b in a["browsers"]: browsers.add(b) key = (align_data["project"], b) - if not key in load_groups: + if key not in load_groups: load_groups[key] = [] load_groups[key].append(align_data) diff --git a/scripts/browser/parse_all_projects.py b/scripts/browser/parse_all_projects.py index d7cf1378..02a51d63 100644 --- a/scripts/browser/parse_all_projects.py +++ b/scripts/browser/parse_all_projects.py @@ -2,16 +2,12 @@ from __future__ import unicode_literals -import os, sys, logging, re +import os +import sys +import logging import requests -import json -import fileinput import argparse -try: - from concurrent.futures import ThreadPoolExecutor -except ImportError: - from futures import ThreadPoolExecutor log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/bwa/aggregate/basic/sparse_motifs.py b/scripts/bwa/aggregate/basic/sparse_motifs.py index a7da3741..5ae0d87c 100644 --- a/scripts/bwa/aggregate/basic/sparse_motifs.py +++ b/scripts/bwa/aggregate/basic/sparse_motifs.py @@ -1,5 +1,4 @@ import sys -import numpy as np from sklearn.datasets import dump_svmlight_file clusternames = sys.argv[1] @@ -23,7 +22,7 @@ motifs = line_split[9].split(";") newrow = [0] * len(fimos) for motif in motifs: - index = [n for n, l in enumerate(fimos) if l == motif] + index = [n for n, candidate in enumerate(fimos) if candidate == motif] newrow[index[0]] = 1 else: newrow = [0] * len(fimos) diff --git a/scripts/bwa/bamcounts.py b/scripts/bwa/bamcounts.py index 0260dd9a..434efcc5 100644 --- a/scripts/bwa/bamcounts.py +++ b/scripts/bwa/bamcounts.py @@ -7,7 +7,8 @@ Useful SAM flag reference: http://broadinstitute.github.io/picard/explain-flags.html """ -import os, sys, logging, re +import sys +import logging from collections import defaultdict from pysam import Samfile import argparse @@ -144,7 +145,7 @@ def process_read(self, read, inbam): if read.is_paired and not self.process_read_paired(read, inbam): return - nuclear = not chr in ("chrM", "chrC") + nuclear = chr not in ("chrM", "chrC") autosomal = nuclear and chr not in ("chrX", "chrY", "chrZ", "chrW") if nuclear: diff --git a/scripts/bwa/filter_reads.py b/scripts/bwa/filter_reads.py index 1c491dd5..e83fb6b2 100755 --- a/scripts/bwa/filter_reads.py +++ b/scripts/bwa/filter_reads.py @@ -10,10 +10,10 @@ face each other on the same reference and have an insert length wit """ +import argparse import sys import logging import pysam -import re """ Exception when a bad read is found @@ -32,7 +32,7 @@ class read_exception(Exception): def parse_umi(read): try: umi_loc = read.query_name.index("#") - except: + except Exception: pass else: read.set_tag("XD", read.query_name[umi_loc + 1 :]) @@ -85,8 +85,6 @@ def validate_read(read, min_mapq=1, max_mismatches=2): return read -import argparse - parser = 
argparse.ArgumentParser( prog="filter_reads", description="manual corrects the flags in a single- or pair-end BAM alignment file", @@ -150,12 +148,12 @@ def validate_read(read, min_mapq=1, max_mismatches=2): try: if not read1: read1 = parse_umi(next(raw_reads)) - except: + except Exception: break try: read2 = parse_umi(next(raw_reads)) - except: + except Exception: read2 = None # Continue in pair-end mode if their is two reads that are paired and that they have the same name @@ -225,12 +223,12 @@ def validate_read(read, min_mapq=1, max_mismatches=2): set_qc_fail(read1, qc_fail) set_proper_pair(read1, proper_pair) - if read1.reference_id != -1 and not read1.reference_name in nuclear_chrs: + if read1.reference_id != -1 and read1.reference_name not in nuclear_chrs: set_nonnuclear(read1, True) set_qc_fail(read2, qc_fail) set_proper_pair(read2, proper_pair) - if read2.reference_id != -1 and not read2.reference_name in nuclear_chrs: + if read2.reference_id != -1 and read2.reference_name not in nuclear_chrs: set_nonnuclear(read2, True) # Write file @@ -266,7 +264,7 @@ def validate_read(read, min_mapq=1, max_mismatches=2): set_qc_fail(read1, qc_fail) set_proper_pair(read1, False) - if read1.reference_id != -1 and not read1.reference_name in nuclear_chrs: + if read1.reference_id != -1 and read1.reference_name not in nuclear_chrs: set_nonnuclear(read1, True) filtered_alignment.write(read1) diff --git a/scripts/bwa/fix_bam_pairing.py b/scripts/bwa/fix_bam_pairing.py index 8fae86f3..56202b14 100755 --- a/scripts/bwa/fix_bam_pairing.py +++ b/scripts/bwa/fix_bam_pairing.py @@ -1,6 +1,5 @@ #!/bin/env python3 -import sys import pysam import argparse @@ -34,7 +33,7 @@ read2 = unfiltered_reads.next() (read1, read2) = (read1, read2) if read1.is_read1 else (read2, read1) - except: + except Exception: break # strip off the umi, and place it in a custom tag (if it exists) @@ -44,7 +43,7 @@ read1_umi_loc = read1.qname.index("#") read2_umi_loc = read2.qname.index("#") - except: + except Exception: pass else: @@ -73,7 +72,7 @@ if read1.isize > 750 or read2.isize > 750: raise - except: + except Exception: # failed a test above, not properly paired read1.flag &= ~(1 << 1) read2.flag &= ~(1 << 1) diff --git a/scripts/cluster/monitor_alignments.py b/scripts/cluster/monitor_alignments.py index 64f67bee..c93a2d20 100644 --- a/scripts/cluster/monitor_alignments.py +++ b/scripts/cluster/monitor_alignments.py @@ -1,8 +1,9 @@ -import os, sys, logging, re +import os +import sys +import logging +import re import requests -import json import argparse -import datetime import subprocess import xml.dom.minidom @@ -203,7 +204,7 @@ def lims_currently_processing(self): ) lims_process_align_ids = set() - if fetch_results == None: + if fetch_results is None: log.critical("Could not get list of currently processing alignments") sys.exit(1) @@ -220,7 +221,7 @@ def update_host_info(self): host = run_command("hostname").split(".")[0] key = "%s-usage" % host url = "%s/key_value/?key=%s" % (self.api_url, key) - key_value = self.get_single_result("%s/key_value/?key=%s" % (self.api_url, key)) + key_value = self.get_single_result(url) if not key_value: log.error("Cannot find '%s' key value" % key) return diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py index 41c0bf0d..2fb5a0fd 100755 --- a/scripts/copy_notify.py +++ b/scripts/copy_notify.py @@ -6,7 +6,12 @@ TODO: Generalize some of these functions into a library. 
TODO: This might be more robust as a cron job.""" -import os, re, time, logging, smtplib, json +import os +import re +import time +import logging +import smtplib +import json from email.mime.text import MIMEText import xml from xml.dom import minidom @@ -182,7 +187,7 @@ def get_folder_info(sequencer_folder): if sequencer_folder.find(server) > 0: info["server"] = server - if not "server" in info: + if "server" not in info: info["server"] = "UNKNOWN" match = folder_pattern_ga.search(sequencer_folder) diff --git a/scripts/create_processing.py b/scripts/create_processing.py index fa075448..f7cf8e8e 100644 --- a/scripts/create_processing.py +++ b/scripts/create_processing.py @@ -188,7 +188,7 @@ def __init__(self, args): self.template_script_content = open(self.template_script, "r").read() def include_lane(self, lane): - if self.umi_filter != None: + if self.umi_filter is not None: if lane["barcode1"] and lane["barcode1"]["umi"]: umi = True else: @@ -199,7 +199,7 @@ def include_lane(self, lane): if not self.umi_filter and umi: return False - if self.filter_lanes and not lane["lane"] in self.filter_lanes: + if self.filter_lanes and lane["lane"] not in self.filter_lanes: return False if self.ignore_failed_lanes and lane["failed"]: @@ -209,19 +209,19 @@ def include_lane(self, lane): ) return False - if self.project_filter and not (lane["project"] in self.project_filter): + if self.project_filter and lane["project"] not in self.project_filter: logging.debug( "Skipping %s, not in project filter" % lane["samplesheet_name"] ) return False - if self.library_filter and not (lane["library"] in self.library_filter): + if self.library_filter and lane["library"] not in self.library_filter: logging.debug( "Skipping %s, not in library filter" % lane["samplesheet_name"] ) return False - if self.sample_filter and not (lane["sample"] in self.sample_filter): + if self.sample_filter and lane["sample"] not in self.sample_filter: logging.debug( "Skipping %s, not in sample filter" % lane["samplesheet_name"] ) @@ -230,7 +230,7 @@ def include_lane(self, lane): if ( self.alignment_filter and lane["alignments"] - and not (lane["alignments"][0]["id"] in self.alignment_filter) + and lane["alignments"][0]["id"] not in self.alignment_filter ): logging.debug( "Skipping %s, not in alignment filter" % lane["samplesheet_name"] @@ -255,7 +255,7 @@ def create(self): self.run_scripts() def add_script(self, script_file, sample_name, priority): - if not priority in self.processing_scripts: + if priority not in self.processing_scripts: self.processing_scripts[priority] = list() self.processing_scripts[priority].append((sample_name, script_file)) @@ -294,7 +294,7 @@ def get_script_template(self, lane): base_script = alignment["aligner"] logging.info("# Aligning %s with %s" % (lane["sample"], base_script)) - if not base_script in script_contents: + if base_script not in script_contents: script_contents[base_script] = open(script_files[base_script], "r").read() return script_contents[base_script] diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py index 8600430f..c87aa382 100644 --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -126,9 +126,9 @@ def main(argv): bcs = bc1 lane = lib["lane"] - for l in barcodes["Lanes"]: - if lane == l["LaneIndex"]: - if bcs in l["Counts"]: + for lane in barcodes["Lanes"]: + if lane == lane["LaneIndex"]: + if bcs in lane["Counts"]: next else: print(lib) diff --git a/scripts/flowcells/barcode_count_from_stats_file.py 
b/scripts/flowcells/barcode_count_from_stats_file.py index f3cd372e..0e109b13 100644 --- a/scripts/flowcells/barcode_count_from_stats_file.py +++ b/scripts/flowcells/barcode_count_from_stats_file.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -import os, sys, re import json import argparse import logging diff --git a/scripts/flowcells/barcode_report.py b/scripts/flowcells/barcode_report.py index 5307d571..1901b90c 100755 --- a/scripts/flowcells/barcode_report.py +++ b/scripts/flowcells/barcode_report.py @@ -1,6 +1,11 @@ #!/usr/bin/env python3 -import os, sys, logging, re, json, argparse, glob +import os +import sys +import re +import json +import argparse +import glob script_options = { "quiet": False, @@ -56,7 +61,7 @@ def sum_barcodes(input_files): continue count, barcode = words barcode = barcode.replace("+", "-") - if not barcode in totals: + if barcode not in totals: totals[barcode] = 0 totals[barcode] += int(count) f.close() @@ -67,15 +72,15 @@ def sum_barcodes(input_files): def get_input_files_for_lane(data, lane, basedir): globs = [] files = [] - for l in data["libraries"]: + for lib in data["libraries"]: # TODO: Allow to work with NoIndex samples that span a lane - if l["lane"] == lane: + if lib["lane"] == lane: name = "Project_%s/Sample_%s/%s_%s_L%03d_R1_???.barcodes.txt" % ( - l["project"], - l["samplesheet_name"], - l["samplesheet_name"], - l["realbarcode"], - l["lane"], + lib["project"], + lib["samplesheet_name"], + lib["samplesheet_name"], + lib["realbarcode"], + lib["lane"], ) globs.append(os.path.join(basedir, name)) @@ -92,7 +97,7 @@ def apply_mask(mask, barcode_string): orig_barcodes = barcode_string.split("-") while len(orig_barcodes) < len(mask): orig_barcodes.append("") - barcodes = [orig_barcodes[i][:l] for (i, l) in enumerate(mask)] + barcodes = [orig_barcodes[i][:length] for (i, length) in enumerate(mask)] return barcodes @@ -104,16 +109,16 @@ def parse_bases_mask(mask_string): def get_expected_barcodes(data): mask = parse_bases_mask(data["alignment_group"]["bases_mask"]) libraries = {} - for l in data["libraries"]: - if l["barcode_index"] == "NoIndex": + for lib in data["libraries"]: + if lib["barcode_index"] == "NoIndex": barcode = None else: - barcode = "-".join(apply_mask(mask, l["barcode_index"])) - lane = l["lane"] - l["realbarcode"] = barcode - if not lane in libraries: + barcode = "-".join(apply_mask(mask, lib["barcode_index"])) + lane = lib["lane"] + lib["realbarcode"] = barcode + if lane not in libraries: libraries[lane] = {} - libraries[lane][barcode] = l + libraries[lane][barcode] = lib return libraries @@ -182,7 +187,7 @@ def main(args=sys.argv): if poptions.lane: lanes = [poptions.lane] else: - lanes = sorted(list(set([l["lane"] for l in processing_data["libraries"]]))) + lanes = sorted(list(set([lib["lane"] for lib in processing_data["libraries"]]))) # Get actual barcodes and merge with expected compiled_stats = {} diff --git a/scripts/flowcells/demux_fastq.py b/scripts/flowcells/demux_fastq.py index 16b4327a..ad4188f7 100644 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -6,13 +6,11 @@ import itertools import json import logging -import operator import os import re import subprocess import sys -from Bio import SeqIO from Bio.SeqIO.QualityIO import FastqGeneralIterator log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/flowcells/link_rapidrun.py b/scripts/flowcells/link_rapidrun.py index fff50909..6f6cf3e8 100644 --- a/scripts/flowcells/link_rapidrun.py +++ 
b/scripts/flowcells/link_rapidrun.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import os, sys, logging, re -import requests +import os +import sys +import logging +import re import json -import fileinput import argparse import glob diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index 7ce0a8db..5cb82c52 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -12,7 +12,6 @@ # requires BioPython which seems to be in our environment # but only to reverse complement which we could figure out # another way to do -from Bio.Seq import Seq # Usage: $0 -p processing.json @@ -57,9 +56,6 @@ def get_barcode_assignments( ) -> "[dict]": assignments = [] - # This will store our pool samplesheet lines - pool_assignment_set = set() - for libdata in data["libraries"]: assignment = { "lane": libdata.get("lane"), diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index 719be522..e663b538 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -8,7 +8,8 @@ # For each setting (most permissive to least), generate all possible barcodes for each lane # If there are any collisions, check the next tighter mismatch setting -import os, sys, logging, re, math +import sys +import re import argparse import itertools import json @@ -52,7 +53,7 @@ def gen_snps(word, mismatches): thisWord = [[char] for char in word] for loc in locs: origChar = word[loc] - thisWord[loc] = [l for l in "ACGTN" if l != origChar] + thisWord[loc] = [letter for letter in "ACGTN" if letter != origChar] for poss in itertools.product(*thisWord): yield "".join(poss) @@ -107,7 +108,7 @@ def apply_mask(mask, barcode_string): orig_barcodes = barcode_string.split("-") while len(orig_barcodes) < len(mask): orig_barcodes.append("") - barcodes = [orig_barcodes[i][:l] for (i, l) in enumerate(mask)] + barcodes = [orig_barcodes[i][:length] for (i, length) in enumerate(mask)] return barcodes diff --git a/scripts/helpers/expand_multiple_alignments.py b/scripts/helpers/expand_multiple_alignments.py index 53170e27..eb02ff8b 100755 --- a/scripts/helpers/expand_multiple_alignments.py +++ b/scripts/helpers/expand_multiple_alignments.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 + +from collections import namedtuple import sys import copy @@ -59,8 +61,6 @@ def get_secondary_aligns(read): ) -from collections import namedtuple - Alignment = namedtuple("Alignment", "chr pos cigar") @@ -170,7 +170,7 @@ def valid_pair(i, j): # print("r2", r2) # print("s1", s1) # print("s2", s2) - combinations = [] + # combinations = [] for i in s1: for j in s2: pass diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py index 24784f79..039de12b 100644 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -1,6 +1,5 @@ """This script is deprecated!""" -import json import os import sys import argparse @@ -253,7 +252,7 @@ def get_script_template(self): def create_script(self, processing_info): lane = processing_info["libraries"][0] - if not "directory" in lane: + if "directory" not in lane: logging.critical("No directory for lane %d" % lane["id"]) return False diff --git a/scripts/lims/aggregation/get_files.py b/scripts/lims/aggregation/get_files.py index 0b08ea88..6d857c8c 100644 --- a/scripts/lims/aggregation/get_files.py +++ b/scripts/lims/aggregation/get_files.py @@ -1,12 +1,9 @@ -import os, sys, logging, re +import os +import sys +import logging import requests -import json -import fileinput import argparse 
-import datetime -import hashlib import string -from zipfile import ZipFile token = None headers = None diff --git a/scripts/lims/alignment/get_files.py b/scripts/lims/alignment/get_files.py index 46699089..b4a08960 100644 --- a/scripts/lims/alignment/get_files.py +++ b/scripts/lims/alignment/get_files.py @@ -1,12 +1,9 @@ -import os, sys, logging, re +import os +import sys +import logging import requests -import json -import fileinput import argparse -import datetime -import hashlib import string -from zipfile import ZipFile token = None headers = None @@ -271,7 +268,7 @@ def retrieve(self, args): sys.exit(1) if args.alignment_id: - self.retrieve_file(alignment_id, file_purpose) + self.retrieve_file(args.alignment_id, file_purpose) lanes = self.find_lanes(args) diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index 21e8b413..f930fb0e 100644 --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -1,9 +1,6 @@ import json -import os -import sys import argparse import logging -import re from collections import defaultdict diff --git a/scripts/lims/get_processing.py b/scripts/lims/get_processing.py index da616987..2744a991 100644 --- a/scripts/lims/get_processing.py +++ b/scripts/lims/get_processing.py @@ -1,16 +1,12 @@ from __future__ import unicode_literals -import os, sys, logging, re +import os +import sys +import logging import requests import json -import fileinput import argparse -try: - from concurrent.futures import ThreadPoolExecutor -except ImportError: - from futures import ThreadPoolExecutor - log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" script_options = { diff --git a/scripts/lims/movetag.py b/scripts/lims/movetag.py index 035e5045..2e25e3c7 100644 --- a/scripts/lims/movetag.py +++ b/scripts/lims/movetag.py @@ -1,7 +1,7 @@ -import os, sys, logging, re +import os +import sys +import logging import requests -import json -import fileinput import argparse token = None @@ -73,7 +73,7 @@ def __init__(self, api_url, token): self.contenttypes = {} def get_contenttype(self, contenttype): - if not contenttype in self.contenttypes: + if contenttype not in self.contenttypes: contenttype_url = "%s/content_type/?model=%s" % (self.api_url, contenttype) contenttype_results = requests.get( contenttype_url, headers=self.headers @@ -88,16 +88,15 @@ def get_tag(self, slug): exists = requests.get( "%s/tag/?slug=%s" % (self.api_url, slug), headers=self.headers ) - tag = None if exists.ok: results = exists.json() if results["count"] > 0: return results["results"][0] else: - print("Tag %s not found" % slug) + logging.error("Tag %s not found", slug) return None else: - print("Error finding tag %s through API" % slug) + logging.error("Error finding tag %s through API", slug) return None def change_tag(self, contenttype, object_id, old_tag, new_tag): diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py index c1ad3de4..d402f268 100644 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -1,18 +1,16 @@ import sys -import json import argparse import logging +from stamlims_api.rest import setup_api + # Change to logging.DEBUG to see all messages logging.basicConfig(level=logging.WARN) log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" log = logging.getLogger(__name__) -from stamlims_api.rest import setup_api -from stamlims_api.lims import aggregations, metrics - def parser_setup(): 
parser = argparse.ArgumentParser() @@ -59,18 +57,6 @@ def upload_stats(api, aggregation, stats={}): log.info(response) -def upload_spot(api, aggregation, spot_file): - if not os.path.exists(spot_file): - log.error("Cannot find spot file %s" % spot_file) - return - spot = open(spot_file, "r").read().strip() - try: - spot = Decimal(spot) - upload_stat(api, aggregation, "hotspot2-SPOT", spot) - except ValueError: - log.error("Could not turn %s into decimal" % spot) - - def upload_file(api, aggregation, counts_file): count_content = open(counts_file, "r") @@ -128,7 +114,7 @@ def main(args=sys.argv): if poptions.aggregation_id is None: log.critical("No --aggregation specified") sys.exit(2) - aggregation = aggregations.get_aggregation(api, poptions.aggregation_id) + # aggregation = aggregations.get_aggregation(api, poptions.aggregation_id) if poptions.counts_file: for count_file in poptions.counts_file: upload_file(api, poptions.aggregation_id, count_file) diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 389acac7..2597390e 100644 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -15,7 +15,6 @@ 1, os.path.join(os.path.dirname(os.path.abspath(__file__)), "stamlims_api") ) -from stamlims_api.lims import aggregations, content_types from stamlims_api import rest lane_tags = None @@ -619,7 +618,11 @@ def upload_file_attachment( ) # Allow for sloppiness in NFS timestamps difference = recorded_mtime - last_modified - if timedelta(minutes=-1) <= difference <= timedelta(minutes=1): + if ( + datetime.timedelta(minutes=-1) + <= difference + <= datetime.timedelta(minutes=1) + ): log.info("File exists and matches recorded size, skipping %s" % path) return @@ -705,7 +708,7 @@ def create_count_type(self, name): # TODO : make sure that no more of one count type exists def get_alignment_counts(self, alignment_id): log.info("Getting alignment counts for %d" % alignment_id) - if not alignment_id in self.alignment_counts: + if alignment_id not in self.alignment_counts: counts = self.get_list_result( "flowcell_lane_count/", query={"alignment": alignment_id} ) @@ -1088,14 +1091,14 @@ def upload_picard_metric( picard_metric = None try: picard_metric = open(filename, "r").read() - except: + except Exception: log.error("Could not read picard metric file %s" % filename) return None log.debug("Uploading metric contents from: %s" % filename) log.debug(picard_metric) - if not metric_name in self.picard_metrics: + if metric_name not in self.picard_metrics: log.error("Could not find metrics type %s" % metric_name) return False diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 0c017e4c..c2d5eb23 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -376,7 +376,7 @@ def extract_id_from_url(url): pool_id = extract_id_from_url(pool_url) LANES_WITH_DIRECT_POOL[lane["id"]] = pool_id pool_key = (pool_id, lane_lane) - pool_number = int(lane["library_pool__number"]) + # pool_number = int(lane["library_pool__number"]) # Get Library info lp_info = self.api_single_result(url=pool_url) @@ -536,7 +536,7 @@ def create_script(self, processing_info, align_id): lane = processing_info["libraries"][0] alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] - if not "process_template" in alignment: + if "process_template" not in alignment: logging.error("Alignment %d has no process template" % align_id) return False @@ -632,7 +632,7 @@ def create_script(self, processing_info, align_id): env_vars["LIBRARY_KIT"] = ( '"' + 
processing_info["libraries"][0]["library_kit_method"] + '"' ) - except: + except Exception: env_vars["LIBRARY_KIT"] = None if processing_info["flowcell"]["paired_end"]: @@ -686,7 +686,7 @@ def create_script(self, processing_info, align_id): ) for var, value in process_template_variables.items(): env_vars[var] = value - except ValueError as e: + except ValueError: logging.error( "Could not parse process variables for align %d (template %d): '%s'" % ( @@ -746,7 +746,6 @@ def get_libraries_in_pool(alignment_id): # Get all lane ids # Go up to the pool then down to the lanes # Note: This is inefficient but probably doesnt matter in practice - lanes = [] lanes_with_align = set() for lane_id, aln_ids in LANE_ID_TO_ALN_IDS.items(): if alignment_id in aln_ids: @@ -891,7 +890,7 @@ def build_effector_info(effectortopool): ], } ) - except: + except Exception: add_error( "Could not get effector information for sample DS%s", sample_info["number"], @@ -950,7 +949,7 @@ def info_to_data(well_info): wells.append(well_data) return wells - def reverse_complement(bc: "Optional[str]") -> "Optional[str]": + def reverse_complement(bc: "Optional[str]") -> "Optional[str]": # noqa: F821 if bc is None: return None lookup = {"A": "T", "T": "A", "C": "G", "G": "C"} @@ -1001,7 +1000,7 @@ def reverse_complement(bc: "Optional[str]") -> "Optional[str]": "PL%d" % lib_plate_wells[0]["well_parent"]["well_parent"]["plate_id"] ) - except Exception as e: + except Exception: add_error("Could not find well info in %s", lib_plate_wells) def sort_talens(tls): @@ -1038,7 +1037,7 @@ def get_num(tl): else: talen_name = None - lenti_qc_passed = lenti_from_tc["effector_assembly_qc"] is None + # lenti_qc_passed = lenti_from_tc["effector_assembly_qc"] is None if sample_info["time_point_unit"] == 5: # harvest timepoint is in days diff --git a/scripts/umi/extract_umt.py b/scripts/umi/extract_umt.py index 9d101e95..ee66e634 100755 --- a/scripts/umi/extract_umt.py +++ b/scripts/umi/extract_umt.py @@ -4,7 +4,6 @@ import argparse import itertools import logging -import string import sys log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -40,7 +39,7 @@ def mismatch(word, mismatches): thisWord = [[char] for char in word] for loc in locs: origChar = word[loc] - thisWord[loc] = [l for l in "ACGTN" if l != origChar] + thisWord[loc] = [letter for letter in "ACGTN" if letter != origChar] for poss in itertools.product(*thisWord): yield "".join(poss) diff --git a/scripts/utility/movesymlinks.py b/scripts/utility/movesymlinks.py index 54010b54..9ae0fdff 100644 --- a/scripts/utility/movesymlinks.py +++ b/scripts/utility/movesymlinks.py @@ -5,7 +5,10 @@ base path of Y. 
""" -import sys, os, argparse, logging +import sys +import os +import argparse +import logging log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/utility/picard_inserts_process.py b/scripts/utility/picard_inserts_process.py index 49687c21..12be3a1b 100644 --- a/scripts/utility/picard_inserts_process.py +++ b/scripts/utility/picard_inserts_process.py @@ -1,8 +1,7 @@ -import numpy, re, sys +import numpy +import sys from copy import copy -from datetime import datetime -from sklearn.preprocessing import normalize # returns the convolutional or statistical autocorrelation of the passed histogram @@ -53,7 +52,7 @@ def normalize_hist(hist, mode="max"): def diff_norm(df, idx, hist_name="agg_hist", mode="max", r=(50, 140)): try: c = copy(get_arr(df[hist_name].iloc[idx])[r[0] : r[1]]) - except: + except Exception: c = copy(df[hist_name].iloc[idx][r[0] : r[1]]) return numpy.diff(normalize_hist(c, mode=mode)) @@ -66,7 +65,7 @@ def get_peak_loc(hist, r=(100, 106), npeaks=1): n_highest = sorted(hist[r[0] : r[1]], reverse=True)[:npeaks] for n in n_highest: peaks.append(hist.index(n)) - except: + except Exception: peaks = numpy.nan return peaks @@ -77,7 +76,7 @@ def get_large_small_ratio(hist, r=(60, 120)): try: rt = (hist[r[1]]) / (hist[r[0]]) ratio = rt - except: + except Exception: ratio = numpy.nan return ratio @@ -92,7 +91,7 @@ def f_transform(hist, position=11): pa = numpy.nan pv = numpy.nan vat = numpy.nan - adp = numpy.nan + _adp = numpy.nan adv = numpy.nan try: norm_hist = normalize_hist(get_arr(hist), mode="max") @@ -102,15 +101,11 @@ def f_transform(hist, position=11): m = numpy.argmax(ft) adj = numpy.mean(ft) pa = m - pv = ft[m] + pv = ft[pa] vat = ft[11] - adp = ft[m] - adj - adv = ft[11] - adj - except: - pa = numpy.nan - pv = numpy.nan - vat = numpy.nan - adp = numpy.nan + _adp = pv - adj + adv = vat - adj + except Exception: adv = numpy.nan return adv @@ -123,7 +118,7 @@ def read_hist(file): try: (key, val) = line.split() d[int(key)] = int(val) - except: + except Exception: next return d From 94e3c64157f07178a9cda2a704d6fbe328c44ff2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 12:15:19 -0700 Subject: [PATCH 150/172] style: order imports in python scripts with ruff --- .pre-commit-config.yaml | 9 +++++++++ processes/altcode/bin/mtx_to_h5.py | 2 +- processes/altcode/bin/summarize_stats.py | 1 - processes/altseq/bin/generate_counts_json.py | 2 +- processes/bwa/aggregate/plot_footprints.py | 15 ++++++--------- scripts/aggregatecollate.py | 5 ++--- scripts/aggregateprocess.py | 7 ++++--- scripts/alignprocess.py | 7 ++++--- scripts/altcode/upload_fastq.py | 2 +- scripts/altcode/upload_stats.py | 6 +++--- scripts/altseq/upload_data.py | 6 +++--- scripts/apilaneprocess.py | 8 ++++---- scripts/bam/bamfaiordercompare.py | 2 +- scripts/bam/mark_dups.py | 3 ++- scripts/bam/move_umt_to_tag.py | 1 + scripts/bam/random_reads.py | 3 ++- scripts/browser/make_trackhubs_for_flowcell.py | 7 ++++--- scripts/browser/make_trackhubs_for_projects.py | 9 +++++---- .../old_native_fc_loading/make_browser_load.py | 7 ++++--- scripts/browser/parse_all_projects.py | 6 +++--- scripts/bwa/aggregate/basic/sparse_motifs.py | 1 + scripts/bwa/bamcounts.py | 5 +++-- scripts/bwa/filter_reads.py | 3 ++- scripts/bwa/fix_bam_pairing.py | 3 ++- scripts/cluster/monitor_alignments.py | 9 +++++---- scripts/copy_notify.py | 8 ++++---- scripts/create_processing.py | 4 ++-- scripts/flowcells/barcode_check.py | 2 +- .../flowcells/barcode_count_from_stats_file.py | 2 +- 
scripts/flowcells/barcode_masks.py | 4 ++-- scripts/flowcells/barcode_report.py | 8 ++++---- scripts/flowcells/link_rapidrun.py | 10 +++++----- scripts/flowcells/make_samplesheets.py | 1 - scripts/flowcells/max_mismatch.py | 4 ++-- scripts/flowcells/test_barcode_masks.py | 3 +-- scripts/helpers/expand_multiple_alignments.py | 4 ++-- scripts/laneprocess.py | 5 +++-- scripts/lims/aggregation/get_files.py | 7 ++++--- scripts/lims/alignment/get_files.py | 7 ++++--- scripts/lims/create_altseq_sample_config.py | 3 +-- scripts/lims/get_processing.py | 7 ++++--- scripts/lims/movetag.py | 5 +++-- scripts/lims/upload_aggregation_stats.py | 3 +-- scripts/rename_by_prefix.py | 4 ++-- scripts/umi/extract_umt.py | 3 ++- scripts/umi/fastq_umi_add.py | 2 +- scripts/utility/movesymlinks.py | 4 ++-- scripts/utility/picard_inserts_process.py | 4 ++-- 48 files changed, 126 insertions(+), 107 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..f82d48ff --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.5.1 + hooks: + # Run the linter. + - id: ruff + # Run the formatter. + - id: ruff-format diff --git a/processes/altcode/bin/mtx_to_h5.py b/processes/altcode/bin/mtx_to_h5.py index 9cfe7120..864bb31b 100755 --- a/processes/altcode/bin/mtx_to_h5.py +++ b/processes/altcode/bin/mtx_to_h5.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 import argparse -import logging import json +import logging import scanpy as sc diff --git a/processes/altcode/bin/summarize_stats.py b/processes/altcode/bin/summarize_stats.py index 28dfc550..51fb230a 100755 --- a/processes/altcode/bin/summarize_stats.py +++ b/processes/altcode/bin/summarize_stats.py @@ -7,7 +7,6 @@ import math import os import re - from collections import defaultdict diff --git a/processes/altseq/bin/generate_counts_json.py b/processes/altseq/bin/generate_counts_json.py index 5952bebf..04fb5475 100755 --- a/processes/altseq/bin/generate_counts_json.py +++ b/processes/altseq/bin/generate_counts_json.py @@ -2,10 +2,10 @@ import argparse import csv -import os # import pprint import json +import os def parse_args(): diff --git a/processes/bwa/aggregate/plot_footprints.py b/processes/bwa/aggregate/plot_footprints.py index 09622358..35e06443 100755 --- a/processes/bwa/aggregate/plot_footprints.py +++ b/processes/bwa/aggregate/plot_footprints.py @@ -5,26 +5,23 @@ import sys + +import matplotlib +import matplotlib.gridspec as gridspec +import matplotlib.pyplot as plt import numpy as np import scipy import scipy.stats - from footprint_tools.modeling import dispersion - +from matplotlib.ticker import MaxNLocator +from pylab import rcParams # In[3]: # get_ipython().magic(u'matplotlib inline') -import matplotlib - matplotlib.use("agg") -import matplotlib.pyplot as plt -from matplotlib.ticker import MaxNLocator -import matplotlib.gridspec as gridspec - -from pylab import rcParams rcParams["pdf.fonttype"] = 42 diff --git a/scripts/aggregatecollate.py b/scripts/aggregatecollate.py index bad9c687..2dcc230a 100644 --- a/scripts/aggregatecollate.py +++ b/scripts/aggregatecollate.py @@ -1,14 +1,13 @@ -import os -import sys import argparse import logging +import os +import sys sys.path.append("/home/audrakj/stamlims_api") print(sys.path) from stamlims_api import rest # noqa: E402 from stamlims_api.lims import files # noqa: E402 - log_format = "%(asctime)s - %(name)s - %(levelname)s - 
%(message)s" STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index af62b9ae..cd777db1 100644 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -1,11 +1,12 @@ +import argparse import json +import logging import os import sys -import argparse -import logging -import requests from collections import OrderedDict +import requests + try: from concurrent.futures import ThreadPoolExecutor except ImportError: diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 90ba5f91..4a4a788f 100644 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -1,12 +1,13 @@ +import argparse import json +import logging import os import sys -import argparse -import logging -import requests import textwrap from collections import OrderedDict +import requests + try: from concurrent.futures import ThreadPoolExecutor except ImportError: diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index e2fde5a2..8bc27516 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -3,12 +3,12 @@ Uploads alt-code fastq files to LIMS """ -import re import argparse import datetime import hashlib import logging import os +import re import sys from functools import lru_cache diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py index d7042d49..62fc8ffc 100644 --- a/scripts/altcode/upload_stats.py +++ b/scripts/altcode/upload_stats.py @@ -3,18 +3,18 @@ Uploads all the results of alt-code processing to LIMS """ -import re import argparse import datetime +import glob import hashlib import json import logging import os +import re import sys -import glob -from functools import lru_cache from collections import defaultdict from distutils.version import LooseVersion +from functools import lru_cache # Make sure we can load our vendored stamlims_api dependency sys.path.insert( diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index ccaf448d..51db031c 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -3,17 +3,17 @@ Uploads all the results of alt-seq processing to LIMS """ -import re -import csv import argparse +import csv import datetime import hashlib import json import logging import os +import re import sys -from functools import lru_cache from collections import defaultdict +from functools import lru_cache # Make sure we can load our vendored stamlims_api dependency sys.path.insert( diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index cd5a540d..26dc51bb 100644 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -1,16 +1,16 @@ -import os -import sys import argparse +import collections import logging +import os import re -import collections +import sys try: from concurrent.futures import ThreadPoolExecutor except ImportError: from futures import ThreadPoolExecutor -from stamlims_api import rest, lims +from stamlims_api import lims, rest log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/bam/bamfaiordercompare.py b/scripts/bam/bamfaiordercompare.py index a521eff2..6ecb0635 100644 --- a/scripts/bam/bamfaiordercompare.py +++ b/scripts/bam/bamfaiordercompare.py @@ -4,8 +4,8 @@ file. Prints "ORDERED" if they are equal and "UNORDERED" otherwise. 
""" -import sys import os +import sys import pysam diff --git a/scripts/bam/mark_dups.py b/scripts/bam/mark_dups.py index acd8bd7f..7dbc2b92 100755 --- a/scripts/bam/mark_dups.py +++ b/scripts/bam/mark_dups.py @@ -2,9 +2,10 @@ import argparse import sys -import pysam from collections import defaultdict +import pysam + UMI_TAG = "XD:Z" diff --git a/scripts/bam/move_umt_to_tag.py b/scripts/bam/move_umt_to_tag.py index b19a0b7b..43c4c68f 100755 --- a/scripts/bam/move_umt_to_tag.py +++ b/scripts/bam/move_umt_to_tag.py @@ -7,6 +7,7 @@ """ import argparse + import pysam diff --git a/scripts/bam/random_reads.py b/scripts/bam/random_reads.py index d67fe777..6d562021 100644 --- a/scripts/bam/random_reads.py +++ b/scripts/bam/random_reads.py @@ -1,9 +1,10 @@ def main(): import argparse import random - import pysam import shutil + import pysam + parser = argparse.ArgumentParser() parser.add_argument("infile") parser.add_argument("outfile") diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index bb48ce77..34678ef2 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -import json -import os -import sys import argparse +import json import logging +import os import re +import sys + import requests log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index 92fedeb5..519c6e5e 100644 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 -import json -import os -import sys import argparse +import datetime +import json import logging +import os import re +import sys + import requests -import datetime log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging.getLogger("requests").setLevel(logging.WARNING) diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index 1bbbe36d..90016f73 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -import json -import os -import sys import argparse +import json import logging +import os import re +import sys + import requests log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/browser/parse_all_projects.py b/scripts/browser/parse_all_projects.py index 02a51d63..481438da 100644 --- a/scripts/browser/parse_all_projects.py +++ b/scripts/browser/parse_all_projects.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals +import argparse +import logging import os import sys -import logging -import requests -import argparse +import requests log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/bwa/aggregate/basic/sparse_motifs.py b/scripts/bwa/aggregate/basic/sparse_motifs.py index 5ae0d87c..942afb53 100644 --- a/scripts/bwa/aggregate/basic/sparse_motifs.py +++ b/scripts/bwa/aggregate/basic/sparse_motifs.py @@ -1,4 +1,5 @@ import sys + from sklearn.datasets import dump_svmlight_file clusternames = sys.argv[1] diff --git a/scripts/bwa/bamcounts.py b/scripts/bwa/bamcounts.py index 434efcc5..308803d6 100644 --- a/scripts/bwa/bamcounts.py +++ b/scripts/bwa/bamcounts.py @@ -7,11 +7,12 @@ Useful SAM flag reference: 
http://broadinstitute.github.io/picard/explain-flags.html """ -import sys +import argparse import logging +import sys from collections import defaultdict + from pysam import Samfile -import argparse log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/bwa/filter_reads.py b/scripts/bwa/filter_reads.py index e83fb6b2..164c8cf9 100755 --- a/scripts/bwa/filter_reads.py +++ b/scripts/bwa/filter_reads.py @@ -11,8 +11,9 @@ """ import argparse -import sys import logging +import sys + import pysam """ diff --git a/scripts/bwa/fix_bam_pairing.py b/scripts/bwa/fix_bam_pairing.py index 56202b14..3b787fa8 100755 --- a/scripts/bwa/fix_bam_pairing.py +++ b/scripts/bwa/fix_bam_pairing.py @@ -1,8 +1,9 @@ #!/bin/env python3 -import pysam import argparse +import pysam + parser = argparse.ArgumentParser(description="Set read pair status") parser.add_argument( "infile", diff --git a/scripts/cluster/monitor_alignments.py b/scripts/cluster/monitor_alignments.py index c93a2d20..c1c965c4 100644 --- a/scripts/cluster/monitor_alignments.py +++ b/scripts/cluster/monitor_alignments.py @@ -1,12 +1,13 @@ -import os -import sys +import argparse import logging +import os import re -import requests -import argparse import subprocess +import sys import xml.dom.minidom +import requests + token = None headers = None lane_tags = None diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py index 2fb5a0fd..d47ce734 100755 --- a/scripts/copy_notify.py +++ b/scripts/copy_notify.py @@ -6,14 +6,14 @@ TODO: Generalize some of these functions into a library. TODO: This might be more robust as a cron job.""" +import json +import logging import os import re -import time -import logging import smtplib -import json -from email.mime.text import MIMEText +import time import xml +from email.mime.text import MIMEText from xml.dom import minidom """ diff --git a/scripts/create_processing.py b/scripts/create_processing.py index f7cf8e8e..5d1250fb 100644 --- a/scripts/create_processing.py +++ b/scripts/create_processing.py @@ -1,8 +1,8 @@ +import argparse import json +import logging import os import sys -import argparse -import logging log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py index c87aa382..0239f193 100644 --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -1,6 +1,6 @@ -import sys import argparse import json +import sys MAX_BARCODE_LENGTH = 10 diff --git a/scripts/flowcells/barcode_count_from_stats_file.py b/scripts/flowcells/barcode_count_from_stats_file.py index 0e109b13..7aa54b66 100644 --- a/scripts/flowcells/barcode_count_from_stats_file.py +++ b/scripts/flowcells/barcode_count_from_stats_file.py @@ -1,7 +1,7 @@ #!/usr/bin/env python -import json import argparse +import json import logging default_options = { diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py index 7ae3e4ad..252c7cf2 100644 --- a/scripts/flowcells/barcode_masks.py +++ b/scripts/flowcells/barcode_masks.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -import sys -import json import argparse +import json import logging +import sys script_options = { "processing": "processing.json", diff --git a/scripts/flowcells/barcode_report.py b/scripts/flowcells/barcode_report.py index 1901b90c..feba5451 100755 --- a/scripts/flowcells/barcode_report.py +++ b/scripts/flowcells/barcode_report.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -import os -import sys -import re -import 
json import argparse import glob +import json +import os +import re +import sys script_options = { "quiet": False, diff --git a/scripts/flowcells/link_rapidrun.py b/scripts/flowcells/link_rapidrun.py index 6f6cf3e8..d1497077 100644 --- a/scripts/flowcells/link_rapidrun.py +++ b/scripts/flowcells/link_rapidrun.py @@ -1,12 +1,12 @@ from __future__ import unicode_literals -import os -import sys -import logging -import re -import json import argparse import glob +import json +import logging +import os +import re +import sys log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/flowcells/make_samplesheets.py b/scripts/flowcells/make_samplesheets.py index 5cb82c52..91a34b26 100755 --- a/scripts/flowcells/make_samplesheets.py +++ b/scripts/flowcells/make_samplesheets.py @@ -6,7 +6,6 @@ import re import sys import textwrap - from collections import defaultdict # requires BioPython which seems to be in our environment diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index e663b538..51ee8a00 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -8,11 +8,11 @@ # For each setting (most permissive to least), generate all possible barcodes for each lane # If there are any collisions, check the next tighter mismatch setting -import sys -import re import argparse import itertools import json +import re +import sys MAX_MISMATCH_LEVEL = 1 # Nextseq can allow 2, Hiseq 2500 allows only 1 POSSIBLE_MISMATCH_LEVELS = range(MAX_MISMATCH_LEVEL, -1, -1) diff --git a/scripts/flowcells/test_barcode_masks.py b/scripts/flowcells/test_barcode_masks.py index 65da9e52..bc8dd84e 100644 --- a/scripts/flowcells/test_barcode_masks.py +++ b/scripts/flowcells/test_barcode_masks.py @@ -1,8 +1,7 @@ import random - from typing import List, Tuple -import pytest +import pytest from barcode_masks import get_barcode_masks diff --git a/scripts/helpers/expand_multiple_alignments.py b/scripts/helpers/expand_multiple_alignments.py index eb02ff8b..0098e1f4 100755 --- a/scripts/helpers/expand_multiple_alignments.py +++ b/scripts/helpers/expand_multiple_alignments.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -from collections import namedtuple -import sys import copy +import sys +from collections import namedtuple def process_line(line): diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py index 039de12b..4dc207c2 100644 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -1,9 +1,10 @@ """This script is deprecated!""" -import os -import sys import argparse import logging +import os +import sys + import requests try: diff --git a/scripts/lims/aggregation/get_files.py b/scripts/lims/aggregation/get_files.py index 6d857c8c..fa6a72da 100644 --- a/scripts/lims/aggregation/get_files.py +++ b/scripts/lims/aggregation/get_files.py @@ -1,9 +1,10 @@ +import argparse +import logging import os +import string import sys -import logging + import requests -import argparse -import string token = None headers = None diff --git a/scripts/lims/alignment/get_files.py b/scripts/lims/alignment/get_files.py index b4a08960..b0560b26 100644 --- a/scripts/lims/alignment/get_files.py +++ b/scripts/lims/alignment/get_files.py @@ -1,9 +1,10 @@ +import argparse +import logging import os +import string import sys -import logging + import requests -import argparse -import string token = None headers = None diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py index f930fb0e..f5f23aec 100644 --- 
a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -1,7 +1,6 @@ -import json import argparse +import json import logging - from collections import defaultdict log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/lims/get_processing.py b/scripts/lims/get_processing.py index 2744a991..92435b2c 100644 --- a/scripts/lims/get_processing.py +++ b/scripts/lims/get_processing.py @@ -1,11 +1,12 @@ from __future__ import unicode_literals +import argparse +import json +import logging import os import sys -import logging + import requests -import json -import argparse log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/lims/movetag.py b/scripts/lims/movetag.py index 2e25e3c7..3a1a89e8 100644 --- a/scripts/lims/movetag.py +++ b/scripts/lims/movetag.py @@ -1,8 +1,9 @@ +import argparse +import logging import os import sys -import logging + import requests -import argparse token = None headers = None diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py index d402f268..02a3c9dc 100644 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -1,7 +1,6 @@ -import sys import argparse - import logging +import sys from stamlims_api.rest import setup_api diff --git a/scripts/rename_by_prefix.py b/scripts/rename_by_prefix.py index f741f462..fc30ac69 100644 --- a/scripts/rename_by_prefix.py +++ b/scripts/rename_by_prefix.py @@ -1,7 +1,7 @@ -import os import glob -import sys +import os import re +import sys if len(sys.argv) != 3: print("Usage: rename_by_prefix OLD_CONTENT NEW_CONTENT") diff --git a/scripts/umi/extract_umt.py b/scripts/umi/extract_umt.py index ee66e634..d3162355 100755 --- a/scripts/umi/extract_umt.py +++ b/scripts/umi/extract_umt.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -from Bio import SeqIO import argparse import itertools import logging import sys +from Bio import SeqIO + log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" UMI_LEN = 6 diff --git a/scripts/umi/fastq_umi_add.py b/scripts/umi/fastq_umi_add.py index 413a15f2..60377da7 100644 --- a/scripts/umi/fastq_umi_add.py +++ b/scripts/umi/fastq_umi_add.py @@ -1,7 +1,7 @@ #!/bin/env python3 -import sys import gzip +import sys def transform_line(line): diff --git a/scripts/utility/movesymlinks.py b/scripts/utility/movesymlinks.py index 9ae0fdff..eaf44466 100644 --- a/scripts/utility/movesymlinks.py +++ b/scripts/utility/movesymlinks.py @@ -5,10 +5,10 @@ base path of Y. 
""" -import sys -import os import argparse import logging +import os +import sys log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/scripts/utility/picard_inserts_process.py b/scripts/utility/picard_inserts_process.py index 12be3a1b..4cbc1db2 100644 --- a/scripts/utility/picard_inserts_process.py +++ b/scripts/utility/picard_inserts_process.py @@ -1,8 +1,8 @@ -import numpy import sys - from copy import copy +import numpy + # returns the convolutional or statistical autocorrelation of the passed histogram # unused/untested currently From 109d348b98f41818458f3e23acc9f3501eafb21e Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 12:37:18 -0700 Subject: [PATCH 151/172] style: fix some possible bugs identifed by ruff --- scripts/altcode/upload_fastq.py | 4 +++ scripts/altcode/upload_stats.py | 8 ++++-- scripts/altseq/upload_data.py | 8 ++++-- .../browser/make_trackhubs_for_flowcell.py | 6 ++--- .../make_browser_load.py | 6 ++--- scripts/flowcells/barcode_check.py | 6 +---- scripts/flowcells/demux_fastq.py | 2 +- scripts/helpers/expand_multiple_alignments.py | 25 ++++++++++--------- scripts/lims/upload_aggregation_stats.py | 2 +- scripts/poolprocess.py | 5 +++- scripts/utility/picard_inserts_process.py | 2 +- 11 files changed, 43 insertions(+), 31 deletions(-) diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py index 8bc27516..d57de079 100644 --- a/scripts/altcode/upload_fastq.py +++ b/scripts/altcode/upload_fastq.py @@ -3,6 +3,10 @@ Uploads alt-code fastq files to LIMS """ +# Ignore B019, because we don't care about the upload class leaking into +# memory after use, because we only construct one +# ruff: noqa: B019 + import argparse import datetime import hashlib diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py index 62fc8ffc..6f8ab799 100644 --- a/scripts/altcode/upload_stats.py +++ b/scripts/altcode/upload_stats.py @@ -3,6 +3,10 @@ Uploads all the results of alt-code processing to LIMS """ +# Ignore B019, because we don't care about the upload class leaking into +# memory after use, because we only construct one +# ruff: noqa: B019 + import argparse import datetime import glob @@ -505,7 +509,7 @@ def upload_flowcell_report(self, data): else: # Error! too many reports LOG.critical("Too many JSON reports exist") - raise "Too many JSON reports exist, exiting" + raise Exception("Too many JSON reports exist, exiting") def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): """ @@ -526,7 +530,7 @@ def upload_altcode_flowcell(self, sample_config, processing_dict, outdir): lane = int(row["lane"]) pool_name = row["pool_name"] sample_name = row["sample_name"] - for idx, lib in enumerate(processing_dict["libraries"]): + for lib in processing_dict["libraries"]: if int(lib["lane"]) == lane and lib["barcode_index"] == barcode_index: lib.update({"pool_name": pool_name, "sample_name": sample_name}) processing_info.append(lib) diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py index 51db031c..0a8a7710 100644 --- a/scripts/altseq/upload_data.py +++ b/scripts/altseq/upload_data.py @@ -3,6 +3,10 @@ Uploads all the results of alt-seq processing to LIMS """ +# Ignore B019, because we don't care about the upload class leaking into +# memory after use, because we only construct one +# ruff: noqa: B019 + import argparse import csv import datetime @@ -501,7 +505,7 @@ def upload_flowcell_report(self, data): else: # Error! 
too many reports LOG.critical("Too many JSON reports exist") - raise "Too many JSON reports exist, exiting" + raise Exception("Too many JSON reports exist, exiting") def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): """ @@ -522,7 +526,7 @@ def upload_altseq_flowcell(self, sample_config, processing_dict, outdir): lane = int(row["lane"]) pool_name = row["pool_name"] sample_name = row["sample_name"] - for idx, lib in enumerate(processing_dict["libraries"]): + for lib in processing_dict["libraries"]: if int(lib["lane"]) == lane and lib["barcode_index"] == barcode_index: lib.update({"pool_name": pool_name, "sample_name": sample_name}) processing_info.append(lib) diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index 34678ef2..fbddf8a1 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -247,7 +247,7 @@ def create_genomestxt(self): genomefile = os.path.join(self.outdir, "genomes.txt") logging.info("Creating genome.txt file: %s" % genomefile) genomes = open(genomefile, "w") - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): genomes.write("\ngenome %s\n" % hgdb) genomes.write( "trackDb %s/trackDb.%s.%s.txt\n" % (hgdb, self.project, self.main_label) @@ -401,7 +401,7 @@ def create_htmls(self): self.html_files = {} masterhtmlloc = os.path.join(self.outdir, "description.html") masterhtml = open(masterhtmlloc, "w") - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): self.html_files[hgdb] = os.path.join( self.outdir, hgdb, "%s.html" % self.main_label ) @@ -445,7 +445,7 @@ def create_html(self, hgdb, file): def create_ras(self): self.ra_files = {} - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): self.create_ra(hgdb) # write RA / track file diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index 90016f73..824b5d0a 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -368,7 +368,7 @@ def prepare_tracks(self): def create_htmls(self): self.html_files = {} - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): self.create_html(hgdb) def create_html(self, hgdb): @@ -416,7 +416,7 @@ def create_html(self, hgdb): def create_ras(self): self.ra_files = {} - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): self.create_ra(hgdb) def create_commands(self): @@ -439,7 +439,7 @@ def create_commands(self): % {"base_dir": self.basedir, "link_dir": self.link_dir} ) - for hgdb, subtracks in self.subtrack_sets.items(): + for hgdb in self.subtrack_sets.keys(): self.create_genome_commands(hgdb, commands) # commands.write("\ncat %s >> %s\n" % (self.excludes_file, self.browser_excludes_file)) diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py index 0239f193..f78f4a90 100644 --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -128,13 +128,9 @@ def main(argv): lane = lib["lane"] for lane in barcodes["Lanes"]: if lane == lane["LaneIndex"]: - if bcs in lane["Counts"]: - next - else: + if bcs not in lane["Counts"]: print(lib) success = "FALSE" - else: - next print(success) diff --git a/scripts/flowcells/demux_fastq.py 
b/scripts/flowcells/demux_fastq.py index ad4188f7..562455fe 100644 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -329,7 +329,7 @@ def main(argv): for label, info in labels.items(): print("%s\t%s" % (label, str(info))) - for label, info in labels.items(): + for info in labels.values(): info["out"].communicate() info["fh"].close() diff --git a/scripts/helpers/expand_multiple_alignments.py b/scripts/helpers/expand_multiple_alignments.py index 0098e1f4..37f7907f 100755 --- a/scripts/helpers/expand_multiple_alignments.py +++ b/scripts/helpers/expand_multiple_alignments.py @@ -165,23 +165,24 @@ def valid_pair(i, j): sys.stdout.write(r2) return - if s1 and s2: - # print("r1", r1) - # print("r2", r2) - # print("s1", s1) - # print("s2", s2) - # combinations = [] - for i in s1: - for j in s2: - pass - - return + # I'm not sure what this block was for. + # if s1 and s2: + # # print("r1", r1) + # # print("r2", r2) + # # print("s1", s1) + # # print("s2", s2) + # combinations = [] + # for i in s1: + # for j in s2: + # pass + + # return # print("r1", r1) # print("r2", r2) -def main(args=[]): +def main(): while True: try: r1 = next(sys.stdin) diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py index 02a3c9dc..9efbb5f6 100644 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -41,7 +41,7 @@ def parser_setup(): # aggregation -def upload_stats(api, aggregation, stats={}): +def upload_stats(api, aggregation, stats): data = [ { "object_id": aggregation, diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index c2d5eb23..013d5607 100644 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,4 +1,7 @@ -# import csv +# Ignore B019, because we don't care about the upload class leaking into +# memory after use, because we only construct one +# ruff: noqa: B019 + import argparse import functools import json diff --git a/scripts/utility/picard_inserts_process.py b/scripts/utility/picard_inserts_process.py index 4cbc1db2..f39a4dc9 100644 --- a/scripts/utility/picard_inserts_process.py +++ b/scripts/utility/picard_inserts_process.py @@ -119,7 +119,7 @@ def read_hist(file): (key, val) = line.split() d[int(key)] = int(val) except Exception: - next + pass return d From daee1337202d4870b8c6fae87372536d318147b2 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 12:39:39 -0700 Subject: [PATCH 152/172] style: Make sure to use raw strings for regex --- scripts/browser/make_trackhubs_for_flowcell.py | 2 +- scripts/browser/make_trackhubs_for_projects.py | 2 +- scripts/browser/old_native_fc_loading/make_browser_load.py | 2 +- scripts/copy_notify.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index fbddf8a1..bcedd887 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -44,7 +44,7 @@ def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, # after removing leading/trailing whitespace. 
- output = re.sub("[^\w$]", "_", input.strip()) + output = re.sub(r"[^\w$]", "_", input.strip()) return output diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index 519c6e5e..0623cd54 100644 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -46,7 +46,7 @@ def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, # after removing leading/trailing whitespace. - output = re.sub("[^\w$]", "_", input.strip()) + output = re.sub(r"[^\w$]", "_", input.strip()) return output diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index 824b5d0a..bb84b640 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -44,7 +44,7 @@ def mysql_clean(input): # Mysql names can contain only 0-9, a-z, A-Z, _, or $ # So we replace all other characters with an underscore, # after removing leading/trailing whitespace. - output = re.sub("[^\w$]", "_", input.strip()) + output = re.sub(r"[^\w$]", "_", input.strip()) return output diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py index d47ce734..4e9c32e5 100755 --- a/scripts/copy_notify.py +++ b/scripts/copy_notify.py @@ -40,16 +40,16 @@ # Format of folders: 090810_SOLEXA-1GA-1_0016_FC82IU folder_pattern_ga = re.compile( - "(?P\d{6})_SOLEXA-1GA-[12]_\d{4,5}_FC(?P[A-Z0-9]{5})" + r"(?P\d{6})_SOLEXA-1GA-[12]_\d{4,5}_FC(?P[A-Z0-9]{5})" ) # 140703_SN373_0524_BC6TATACXX # 140710_D00453_0080_AC5PBPANXX folder_pattern_hiseq = re.compile( - "(?P\d{6})_(?P(SN|D)\d+)_[0-9]+_(A|B)(?P[A-Z0-9]{5})[A-Z]{2}XX" + r"(?P\d{6})_(?P(SN|D)\d+)_[0-9]+_(A|B)(?P[A-Z0-9]{5})[A-Z]{2}XX" ) # 140808_NS500372_0009_AH115HBGXX folder_pattern_nextseq = re.compile( - "(?P\d{6})_(?PNS500\d+)_[0-9]+_A(?P[A-Z0-9]{5})[A-Z]{2}XX" + r"(?P\d{6})_(?PNS500\d+)_[0-9]+_A(?P[A-Z0-9]{5})[A-Z]{2}XX" ) # To use with datetime.strptime() to translate folder dates From 2738b493a23999e3bd4347d98acfb10557dd06e6 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 12:45:42 -0700 Subject: [PATCH 153/172] style: use pythonic capitalization consistently --- scripts/bam/mark_dups.py | 4 +-- .../browser/make_trackhubs_for_flowcell.py | 8 +++--- .../browser/make_trackhubs_for_projects.py | 8 +++--- .../make_browser_load.py | 18 ++++++------- scripts/bwa/filter_reads.py | 26 +++++++++---------- scripts/flowcells/barcode_check.py | 12 ++++----- scripts/flowcells/barcode_masks.py | 8 +++--- scripts/flowcells/demux_fastq.py | 4 +-- scripts/flowcells/link_rapidrun.py | 6 ++--- scripts/flowcells/max_mismatch.py | 8 +++--- scripts/umi/extract_umt.py | 18 ++++++------- 11 files changed, 60 insertions(+), 60 deletions(-) diff --git a/scripts/bam/mark_dups.py b/scripts/bam/mark_dups.py index 7dbc2b92..c0b46852 100755 --- a/scripts/bam/mark_dups.py +++ b/scripts/bam/mark_dups.py @@ -34,7 +34,7 @@ def parser_setup(): # Takes a list of reads, returns one sorted such that the "best" read is at the # top. 
That means highest mapping quality, with ties broken by query_name (in # lexicographic order) -def sortQuality(x): +def sort_quality(x): return (-1 * x.mapping_quality, x.query_name) @@ -55,7 +55,7 @@ def find_dups(reads): key = r.template_length lists[key].append(r) - return [sorted(sublist, key=sortQuality) for sublist in lists.values()] + return [sorted(sublist, key=sort_quality) for sublist in lists.values()] # Sets a read's duplicate flag, returns it diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index bcedd887..c44131c0 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -162,10 +162,10 @@ def __init__( def load_config(self, trackhubconfig): import configparser - Config = configparser.ConfigParser() - Config.read(trackhubconfig) - self.trackhubURL = Config.get("browser", "trackhub_url") - self.flowcell_link_folder = Config.get("browser", "flowcell_link_folder") + config = configparser.ConfigParser() + config.read(trackhubconfig) + self.trackhubURL = config.get("browser", "trackhub_url") + self.flowcell_link_folder = config.get("browser", "flowcell_link_folder") def load(self): self.basedir_name = os.path.basename(self.basedir) diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index 0623cd54..b5dae3f6 100644 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -155,10 +155,10 @@ def __init__( def load_config(self, trackhubconfig): import configparser - Config = configparser.ConfigParser() - Config.read(trackhubconfig) - self.trackhubURL = Config.get("browser", "trackhub_url") - self.aggregation_link_folder = Config.get("browser", "aggregation_link_folder") + config = configparser.ConfigParser() + config.read(trackhubconfig) + self.trackhubURL = config.get("browser", "trackhub_url") + self.aggregation_link_folder = config.get("browser", "aggregation_link_folder") def load(self): # set up folder diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index bb84b640..5a1574ad 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -156,15 +156,15 @@ def __init__( def load_config(self, browserconfig): import configparser - Config = configparser.ConfigParser() - Config.read(browserconfig) - self.server = Config.get("browser", "server") - self.browser_url = Config.get("browser", "browser_url") - self.flowcell_link_folder = Config.get("browser", "flowcell_link_folder") - self.track_basedir = Config.get("browser", "track_basedir") - self.browser_excludes_file = Config.get("browser", "browser_excludes_file") - self.group = Config.get("browser", "browser_group") - self.file_label = Config.get("browser", "file_label") + config = configparser.ConfigParser() + config.read(browserconfig) + self.server = config.get("browser", "server") + self.browser_url = config.get("browser", "browser_url") + self.flowcell_link_folder = config.get("browser", "flowcell_link_folder") + self.track_basedir = config.get("browser", "track_basedir") + self.browser_excludes_file = config.get("browser", "browser_excludes_file") + self.group = config.get("browser", "browser_group") + self.file_label = config.get("browser", "file_label") def load(self): # self.browsersheet = SampleSheet(file=self.browsersheet_file) diff --git 
a/scripts/bwa/filter_reads.py b/scripts/bwa/filter_reads.py index 164c8cf9..85281d5b 100755 --- a/scripts/bwa/filter_reads.py +++ b/scripts/bwa/filter_reads.py @@ -21,7 +21,7 @@ """ -class read_exception(Exception): +class ReadError(Exception): pass @@ -77,11 +77,11 @@ def set_nonnuclear(read, mark=True): def validate_read(read, min_mapq=1, max_mismatches=2): if read.mapping_quality < min_mapq: - raise read_exception("Read MAPQ < %d" % min_mapq) + raise ReadError("Read MAPQ < %d" % min_mapq) if read.is_unmapped: - raise read_exception("Read not mapped") + raise ReadError("Read not mapped") if read.get_tag("NM") > max_mismatches: - raise read_exception("Read mismatches > %d" % max_mismatches) + raise ReadError("Read mismatches > %d" % max_mismatches) return read @@ -175,22 +175,22 @@ def validate_read(read, min_mapq=1, max_mismatches=2): # Read pair must be in F-R configuration if read1.is_reverse == read2.is_reverse: - raise read_exception("Mates cannot align to same strand!") + raise ReadError("Mates cannot align to same strand!") # Both reads must map to same contig if read1.reference_id != read2.reference_id: - raise read_exception("Mates must align to the same reference contig!") + raise ReadError("Mates must align to the same reference contig!") # Insert size must be greater than 0 if read1.template_length == 0 or read2.template_length == 0: - raise read_exception("Insert size cannot be 0!") + raise ReadError("Insert size cannot be 0!") # Insert sizes must add up to zero (one must be positive and the other negative) if read1.template_length + read2.template_length != 0: - raise read_exception("Insert sizes must be equal!") + raise ReadError("Insert sizes must be equal!") # Insert sizes must less than the maximum @@ -198,9 +198,9 @@ def validate_read(read, min_mapq=1, max_mismatches=2): abs(read1.template_length) > args.max_insert_size or read2.template_length > args.max_insert_size ): - raise read_exception("Insert size > %d!" % args.max_insert_size) + raise ReadError("Insert size > %d!" 
% args.max_insert_size) - except read_exception as e: + except ReadError as e: # If we get a read exception, then set # QC fail flag, and unset proper pair flag @@ -247,12 +247,12 @@ def validate_read(read, min_mapq=1, max_mismatches=2): if read1.is_paired: if not read1.mate_is_unmapped: - raise read_exception("No mate found (incongruent flag)!") + raise ReadError("No mate found (incongruent flag)!") else: - raise read_exception("No mate found!") + raise ReadError("No mate found!") - except read_exception as e: + except ReadError as e: qc_fail = True logging.debug(e) diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py index f78f4a90..b050327e 100644 --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -5,7 +5,7 @@ MAX_BARCODE_LENGTH = 10 -def parseArgs(): +def parse_args(): parser = argparse.ArgumentParser(description="Split up fastq files by barcode") parser.add_argument( "--processing", @@ -83,7 +83,7 @@ def get_barcode_lengths(json_data): # Make sure only 1 report is run each for single/dual indexed barcodes until reporting is more flexible tempbc1, tempbc2 = [], [] - finalList = [] + final_list = [] for n in lengths: if n[2] == "0": @@ -91,15 +91,15 @@ def get_barcode_lengths(json_data): else: tempbc2.append(n) if tempbc1 != []: - finalList.append(sorted(tempbc1)[0]) + final_list.append(sorted(tempbc1)[0]) if tempbc2 != []: - finalList.append(sorted(tempbc2)[0]) + final_list.append(sorted(tempbc2)[0]) - return finalList + return final_list def main(argv): - args = parseArgs() + args = parse_args() barcodes = json.load(open(args.barcodes_file)) process = json.load(open(args.processing_file)) mask = args.barcodes_mask diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py index 252c7cf2..2dbaddeb 100644 --- a/scripts/flowcells/barcode_masks.py +++ b/scripts/flowcells/barcode_masks.py @@ -76,7 +76,7 @@ def format_length(x): # Make sure only 1 report is run each for single/dual indexed barcodes until reporting is more flexible tempbc1, tempbc2 = [], [] - finalList = [] + final_list = [] for n in lengths: if n[2] == "0": @@ -84,11 +84,11 @@ def format_length(x): else: tempbc2.append(n) if tempbc1 != []: - finalList.append(sorted(tempbc1)[0]) + final_list.append(sorted(tempbc1)[0]) if tempbc2 != []: - finalList.append(sorted(tempbc2)[0]) + final_list.append(sorted(tempbc2)[0]) - return finalList + return final_list # Detects if there are barcode collisions per lane diff --git a/scripts/flowcells/demux_fastq.py b/scripts/flowcells/demux_fastq.py index 562455fe..086bb85b 100644 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -18,7 +18,7 @@ lengths = set([]) -def parseArgs(): +def parse_args(): parser = argparse.ArgumentParser(description="Split up fastq files by barcode") parser.add_argument( "--mismatches", type=int, default=0, help="number of mismatches" @@ -290,7 +290,7 @@ def split_file(filename, barcodes, labels): def main(argv): - args = parseArgs() + args = parse_args() if args.debug: logging.basicConfig(level=logging.DEBUG, format=log_format) diff --git a/scripts/flowcells/link_rapidrun.py b/scripts/flowcells/link_rapidrun.py index d1497077..3d728ab1 100644 --- a/scripts/flowcells/link_rapidrun.py +++ b/scripts/flowcells/link_rapidrun.py @@ -75,16 +75,16 @@ def create_links(lane, read, base_dir, dry_run=False): lane1_fastq = glob.glob("%s_%s_*.fastq.gz" % (sample_name, read)) replace = re.compile(r"_L001$") - L2_sample_name = replace.sub("_L002", sample_name) 
+ l2_sample_name = replace.sub("_L002", sample_name) - lane2_fastq = glob.glob("%s_%s_*.fastq.gz" % (L2_sample_name, read)) + lane2_fastq = glob.glob("%s_%s_*.fastq.gz" % (l2_sample_name, read)) lane1_filecount = len(lane1_fastq) lane2_filecount = len(lane2_fastq) for lane2_filenum in range(1, lane2_filecount + 1): effective_filenum = lane1_filecount + lane2_filenum - orig_filename = "%s_%s_%03d.fastq.gz" % (L2_sample_name, read, lane2_filenum) + orig_filename = "%s_%s_%03d.fastq.gz" % (l2_sample_name, read, lane2_filenum) new_filename = "%s_%s_%03d.fastq.gz" % (sample_name, read, effective_filenum) print("Linking %s => %s" % (orig_filename, new_filename)) diff --git a/scripts/flowcells/max_mismatch.py b/scripts/flowcells/max_mismatch.py index 51ee8a00..e86a1675 100755 --- a/scripts/flowcells/max_mismatch.py +++ b/scripts/flowcells/max_mismatch.py @@ -50,11 +50,11 @@ def parser_setup(): def gen_snps(word, mismatches): for d in range(mismatches + 1): for locs in itertools.combinations(range(len(word)), d): - thisWord = [[char] for char in word] + this_word = [[char] for char in word] for loc in locs: - origChar = word[loc] - thisWord[loc] = [letter for letter in "ACGTN" if letter != origChar] - for poss in itertools.product(*thisWord): + orig_char = word[loc] + this_word[loc] = [letter for letter in "ACGTN" if letter != orig_char] + for poss in itertools.product(*this_word): yield "".join(poss) diff --git a/scripts/umi/extract_umt.py b/scripts/umi/extract_umt.py index d3162355..c0b5f105 100755 --- a/scripts/umi/extract_umt.py +++ b/scripts/umi/extract_umt.py @@ -17,7 +17,7 @@ mismatched_stems = set() -def parseArgs(): +def parse_args(): parser = argparse.ArgumentParser(description="Annotate read names with UMT") parser.add_argument( "--mismatches", type=int, default=1, help="number of mismatches" @@ -37,11 +37,11 @@ def parseArgs(): def mismatch(word, mismatches): for d in range(mismatches + 1): for locs in itertools.combinations(range(len(word)), d): - thisWord = [[char] for char in word] + this_word = [[char] for char in word] for loc in locs: - origChar = word[loc] - thisWord[loc] = [letter for letter in "ACGTN" if letter != origChar] - for poss in itertools.product(*thisWord): + orig_char = word[loc] + this_word[loc] = [letter for letter in "ACGTN" if letter != orig_char] + for poss in itertools.product(*this_word): yield "".join(poss) @@ -118,18 +118,18 @@ def setup_mismatches(num_mismatches): def main(argv): - args = parseArgs() + args = parse_args() logging.basicConfig(level=logging.WARN, format=log_format) setup_mismatches(args.mismatches) with open(args.r1_fastq) as r1_in, open(args.r2_fastq) as r2_in, open( args.out_r1, "wt" ) as r1_out, open(args.out_r2, "wt") as r2_out: - r1_seqIO = SeqIO.parse(r1_in, "fastq") - r2_seqIO = SeqIO.parse(r2_in, "fastq") + r1_seq_io = SeqIO.parse(r1_in, "fastq") + r2_seq_io = SeqIO.parse(r2_in, "fastq") try: while True: - (r1, r2) = attach_umt(next(r1_seqIO), next(r2_seqIO)) + (r1, r2) = attach_umt(next(r1_seq_io), next(r2_seq_io)) # Only write Fastq records for which we find stems if r1 is not None and r2 is not None: r1_out.write(r1.format("fastq")) From 1cbe3d614ba9c0e7f7dd59ba08ff81e408607a1b Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 12:51:49 -0700 Subject: [PATCH 154/172] style: executables are chmod +x and have shebang --- scripts/aggregatecollate.py | 1 + scripts/aggregateprocess.py | 1 + scripts/alignprocess.py | 1 + scripts/altcode/upload_fastq.py | 0 scripts/altcode/upload_stats.py | 0 scripts/altseq/upload_data.py 
| 0 scripts/apilaneprocess.py | 1 + scripts/bam/bamfaiordercompare.py | 1 + scripts/bam/move_umt_to_tag.py | 2 +- scripts/bam/random_reads.py | 1 + scripts/browser/make_trackhubs_for_projects.py | 0 scripts/browser/parse_all_projects.py | 1 + scripts/bwa/aggregate/basic/sparse_motifs.py | 1 + scripts/bwa/bamcounts.py | 0 scripts/bwa/filter_reads.py | 2 +- scripts/bwa/fix_bam_pairing.py | 2 +- scripts/cluster/monitor_alignments.py | 1 + scripts/create_processing.py | 1 + scripts/flowcells/barcode_check.py | 1 + scripts/flowcells/barcode_count_from_stats_file.py | 0 scripts/flowcells/barcode_masks.py | 0 scripts/flowcells/demux_fastq.py | 1 + scripts/flowcells/link_nextseq.py | 0 scripts/flowcells/link_rapidrun.py | 1 + scripts/flowcells/test_barcode_masks.py | 1 + scripts/laneprocess.py | 1 + scripts/lims/aggregation/get_files.py | 1 + scripts/lims/alignment/get_files.py | 1 + scripts/lims/create_altseq_sample_config.py | 1 + scripts/lims/get_processing.py | 1 + scripts/lims/movetag.py | 1 + scripts/lims/upload_aggregation_stats.py | 1 + scripts/lims/upload_data.py | 1 + scripts/poolprocess.py | 1 + scripts/rename_by_prefix.py | 1 + scripts/umi/fastq_umi_add.py | 2 +- scripts/utility/md5check.py | 1 + scripts/utility/movesymlinks.py | 0 scripts/utility/picard_inserts_process.py | 1 + scripts/versions.py | 1 + 40 files changed, 31 insertions(+), 4 deletions(-) mode change 100644 => 100755 scripts/aggregatecollate.py mode change 100644 => 100755 scripts/aggregateprocess.py mode change 100644 => 100755 scripts/alignprocess.py mode change 100644 => 100755 scripts/altcode/upload_fastq.py mode change 100644 => 100755 scripts/altcode/upload_stats.py mode change 100644 => 100755 scripts/altseq/upload_data.py mode change 100644 => 100755 scripts/apilaneprocess.py mode change 100644 => 100755 scripts/bam/bamfaiordercompare.py mode change 100644 => 100755 scripts/bam/random_reads.py mode change 100644 => 100755 scripts/browser/make_trackhubs_for_projects.py mode change 100644 => 100755 scripts/browser/parse_all_projects.py mode change 100644 => 100755 scripts/bwa/aggregate/basic/sparse_motifs.py mode change 100644 => 100755 scripts/bwa/bamcounts.py mode change 100644 => 100755 scripts/cluster/monitor_alignments.py mode change 100644 => 100755 scripts/create_processing.py mode change 100644 => 100755 scripts/flowcells/barcode_check.py mode change 100644 => 100755 scripts/flowcells/barcode_count_from_stats_file.py mode change 100644 => 100755 scripts/flowcells/barcode_masks.py mode change 100644 => 100755 scripts/flowcells/demux_fastq.py mode change 100644 => 100755 scripts/flowcells/link_nextseq.py mode change 100644 => 100755 scripts/flowcells/link_rapidrun.py mode change 100644 => 100755 scripts/flowcells/test_barcode_masks.py mode change 100644 => 100755 scripts/laneprocess.py mode change 100644 => 100755 scripts/lims/aggregation/get_files.py mode change 100644 => 100755 scripts/lims/alignment/get_files.py mode change 100644 => 100755 scripts/lims/create_altseq_sample_config.py mode change 100644 => 100755 scripts/lims/get_processing.py mode change 100644 => 100755 scripts/lims/movetag.py mode change 100644 => 100755 scripts/lims/upload_aggregation_stats.py mode change 100644 => 100755 scripts/lims/upload_data.py mode change 100644 => 100755 scripts/poolprocess.py mode change 100644 => 100755 scripts/rename_by_prefix.py mode change 100644 => 100755 scripts/umi/fastq_umi_add.py mode change 100644 => 100755 scripts/utility/md5check.py mode change 100644 => 100755 scripts/utility/movesymlinks.py mode 
change 100644 => 100755 scripts/utility/picard_inserts_process.py mode change 100644 => 100755 scripts/versions.py diff --git a/scripts/aggregatecollate.py b/scripts/aggregatecollate.py old mode 100644 new mode 100755 index 2dcc230a..a15e5299 --- a/scripts/aggregatecollate.py +++ b/scripts/aggregatecollate.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import os diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py old mode 100644 new mode 100755 index cd777db1..429dfed0 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import logging diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py old mode 100644 new mode 100755 index 4a4a788f..f0611535 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import logging diff --git a/scripts/altcode/upload_fastq.py b/scripts/altcode/upload_fastq.py old mode 100644 new mode 100755 diff --git a/scripts/altcode/upload_stats.py b/scripts/altcode/upload_stats.py old mode 100644 new mode 100755 diff --git a/scripts/altseq/upload_data.py b/scripts/altseq/upload_data.py old mode 100644 new mode 100755 diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py old mode 100644 new mode 100755 index 26dc51bb..6b019513 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import collections import logging diff --git a/scripts/bam/bamfaiordercompare.py b/scripts/bam/bamfaiordercompare.py old mode 100644 new mode 100755 index 6ecb0635..64ccdf2b --- a/scripts/bam/bamfaiordercompare.py +++ b/scripts/bam/bamfaiordercompare.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This quick script uses pysam's capabilities to compare the ordering of references in the header of a BAM file with the order in a FAI diff --git a/scripts/bam/move_umt_to_tag.py b/scripts/bam/move_umt_to_tag.py index 43c4c68f..24c32705 100755 --- a/scripts/bam/move_umt_to_tag.py +++ b/scripts/bam/move_umt_to_tag.py @@ -1,4 +1,4 @@ -#!/bin/env python3 +#!/usr/bin/env python3 """ move_umt_to_tag.py Takes a BAM file as input, and produces one as output. 
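A minimal sketch of the convention this patch enforces, not taken from any file above: an executable helper under scripts/ begins with the portable `#!/usr/bin/env python3` shebang (env is conventionally found under /usr/bin, not /bin) and is committed with mode 100755. The file name and body here are hypothetical.

    #!/usr/bin/env python3
    """Hypothetical skeleton showing the smallest shape of an executable script."""
    import sys


    def main():
        # A real script would parse arguments and do its work here.
        print("hello from an executable script")
        return 0


    if __name__ == "__main__":
        sys.exit(main())

Marking such a file executable with `chmod +x` is what produces the `old mode 100644` / `new mode 100755` lines recorded in this patch.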
diff --git a/scripts/bam/random_reads.py b/scripts/bam/random_reads.py old mode 100644 new mode 100755 index 6d562021..b3a3c0e6 --- a/scripts/bam/random_reads.py +++ b/scripts/bam/random_reads.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 def main(): import argparse import random diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py old mode 100644 new mode 100755 diff --git a/scripts/browser/parse_all_projects.py b/scripts/browser/parse_all_projects.py old mode 100644 new mode 100755 index 481438da..fc5e9ac4 --- a/scripts/browser/parse_all_projects.py +++ b/scripts/browser/parse_all_projects.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # pulls out top 1000 projects and their ids from __future__ import unicode_literals diff --git a/scripts/bwa/aggregate/basic/sparse_motifs.py b/scripts/bwa/aggregate/basic/sparse_motifs.py old mode 100644 new mode 100755 index 942afb53..869eba8b --- a/scripts/bwa/aggregate/basic/sparse_motifs.py +++ b/scripts/bwa/aggregate/basic/sparse_motifs.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import sys from sklearn.datasets import dump_svmlight_file diff --git a/scripts/bwa/bamcounts.py b/scripts/bwa/bamcounts.py old mode 100644 new mode 100755 diff --git a/scripts/bwa/filter_reads.py b/scripts/bwa/filter_reads.py index 85281d5b..f8f145d8 100755 --- a/scripts/bwa/filter_reads.py +++ b/scripts/bwa/filter_reads.py @@ -1,4 +1,4 @@ -#!/bin/env python3 +#!/usr/bin/env python3 """ filter_reads.py - Set SAM flag 0x200 (QC-fail) for reads failing various criteria. diff --git a/scripts/bwa/fix_bam_pairing.py b/scripts/bwa/fix_bam_pairing.py index 3b787fa8..f238ad14 100755 --- a/scripts/bwa/fix_bam_pairing.py +++ b/scripts/bwa/fix_bam_pairing.py @@ -1,4 +1,4 @@ -#!/bin/env python3 +#!/usr/bin/env python3 import argparse diff --git a/scripts/cluster/monitor_alignments.py b/scripts/cluster/monitor_alignments.py old mode 100644 new mode 100755 index c1c965c4..78f6c9a3 --- a/scripts/cluster/monitor_alignments.py +++ b/scripts/cluster/monitor_alignments.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import os diff --git a/scripts/create_processing.py b/scripts/create_processing.py old mode 100644 new mode 100755 index 5d1250fb..a76d2e67 --- a/scripts/create_processing.py +++ b/scripts/create_processing.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import logging diff --git a/scripts/flowcells/barcode_check.py b/scripts/flowcells/barcode_check.py old mode 100644 new mode 100755 index b050327e..4a4e412b --- a/scripts/flowcells/barcode_check.py +++ b/scripts/flowcells/barcode_check.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import sys diff --git a/scripts/flowcells/barcode_count_from_stats_file.py b/scripts/flowcells/barcode_count_from_stats_file.py old mode 100644 new mode 100755 diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py old mode 100644 new mode 100755 diff --git a/scripts/flowcells/demux_fastq.py b/scripts/flowcells/demux_fastq.py old mode 100644 new mode 100755 index 086bb85b..452a8062 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # This is a quick script to split up FASTQ files by barcode given # Used to rescue tags from undeterminde state diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py old mode 100644 new mode 100755 diff --git a/scripts/flowcells/link_rapidrun.py b/scripts/flowcells/link_rapidrun.py 
old mode 100644 new mode 100755 index 3d728ab1..cf2f1da8 --- a/scripts/flowcells/link_rapidrun.py +++ b/scripts/flowcells/link_rapidrun.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals import argparse diff --git a/scripts/flowcells/test_barcode_masks.py b/scripts/flowcells/test_barcode_masks.py old mode 100644 new mode 100755 index bc8dd84e..e16d5c46 --- a/scripts/flowcells/test_barcode_masks.py +++ b/scripts/flowcells/test_barcode_masks.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import random from typing import List, Tuple diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py old mode 100644 new mode 100755 index 4dc207c2..94314a38 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """This script is deprecated!""" import argparse diff --git a/scripts/lims/aggregation/get_files.py b/scripts/lims/aggregation/get_files.py old mode 100644 new mode 100755 index fa6a72da..3ee2ea2d --- a/scripts/lims/aggregation/get_files.py +++ b/scripts/lims/aggregation/get_files.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import os diff --git a/scripts/lims/alignment/get_files.py b/scripts/lims/alignment/get_files.py old mode 100644 new mode 100755 index b0560b26..c96e323b --- a/scripts/lims/alignment/get_files.py +++ b/scripts/lims/alignment/get_files.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import os diff --git a/scripts/lims/create_altseq_sample_config.py b/scripts/lims/create_altseq_sample_config.py old mode 100644 new mode 100755 index f5f23aec..1f3a7b9f --- a/scripts/lims/create_altseq_sample_config.py +++ b/scripts/lims/create_altseq_sample_config.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import logging diff --git a/scripts/lims/get_processing.py b/scripts/lims/get_processing.py old mode 100644 new mode 100755 index 92435b2c..adf14f02 --- a/scripts/lims/get_processing.py +++ b/scripts/lims/get_processing.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import unicode_literals import argparse diff --git a/scripts/lims/movetag.py b/scripts/lims/movetag.py old mode 100644 new mode 100755 index 3a1a89e8..57e43985 --- a/scripts/lims/movetag.py +++ b/scripts/lims/movetag.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import os diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py old mode 100644 new mode 100755 index 9efbb5f6..eae09eb8 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import logging import sys diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py old mode 100644 new mode 100755 index 2597390e..d786e05b --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # pylint disable=invalid-whitespace, invalid-name import argparse diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py old mode 100644 new mode 100755 index 013d5607..2a763d5c --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Ignore B019, because we don't care about the upload class leaking into # memory after use, because we only construct one # ruff: noqa: B019 diff --git a/scripts/rename_by_prefix.py b/scripts/rename_by_prefix.py old mode 100644 new mode 100755 index fc30ac69..293b91c9 --- a/scripts/rename_by_prefix.py +++ b/scripts/rename_by_prefix.py @@ 
-1,3 +1,4 @@ +#!/usr/bin/env python3 import glob import os import re diff --git a/scripts/umi/fastq_umi_add.py b/scripts/umi/fastq_umi_add.py old mode 100644 new mode 100755 index 60377da7..c7f50557 --- a/scripts/umi/fastq_umi_add.py +++ b/scripts/umi/fastq_umi_add.py @@ -1,4 +1,4 @@ -#!/bin/env python3 +#!/usr/bin/env python3 import gzip import sys diff --git a/scripts/utility/md5check.py b/scripts/utility/md5check.py old mode 100644 new mode 100755 index b2fc6b9e..2c7d01eb --- a/scripts/utility/md5check.py +++ b/scripts/utility/md5check.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ For a tab delineated file with rows of: diff --git a/scripts/utility/movesymlinks.py b/scripts/utility/movesymlinks.py old mode 100644 new mode 100755 diff --git a/scripts/utility/picard_inserts_process.py b/scripts/utility/picard_inserts_process.py old mode 100644 new mode 100755 index f39a4dc9..a2b91178 --- a/scripts/utility/picard_inserts_process.py +++ b/scripts/utility/picard_inserts_process.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import sys from copy import copy diff --git a/scripts/versions.py b/scripts/versions.py old mode 100644 new mode 100755 index f62520dd..88b16ac3 --- a/scripts/versions.py +++ b/scripts/versions.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This script uses pip to print out versions of all installed packages. """ From 58d776c2ba16041203b28d53633358a94f7f43bb Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 11 Jul 2024 13:36:50 -0700 Subject: [PATCH 155/172] style: logging-related changes The main thing changed here is to prefer the form `logging.info("msg %s %s", arg1, arg2)` This allows the logger to do the interpolation, which can save time if the message is not printed because it is below the current log level. --- .gitignore | 1 + pyproject.toml | 20 ++- scripts/aggregatecollate.py | 55 +++---- scripts/aggregateprocess.py | 140 ++++++++++-------- scripts/alignprocess.py | 68 ++++----- scripts/apilaneprocess.py | 36 ++--- .../browser/make_trackhubs_for_flowcell.py | 69 +++++---- .../browser/make_trackhubs_for_projects.py | 25 ++-- .../make_browser_load.py | 73 ++++----- scripts/cluster/monitor_alignments.py | 32 ++-- scripts/copy_notify.py | 23 +-- scripts/create_processing.py | 22 ++- scripts/flowcells/barcode_masks.py | 4 +- scripts/flowcells/demux_fastq.py | 26 ++-- scripts/flowcells/link_nextseq.py | 2 +- scripts/laneprocess.py | 24 +-- scripts/lims/aggregation/get_files.py | 52 ++++--- scripts/lims/alignment/get_files.py | 58 ++++---- scripts/lims/get_processing.py | 26 ++-- scripts/lims/upload_aggregation_stats.py | 16 +- scripts/lims/upload_data.py | 136 ++++++++--------- scripts/poolprocess.py | 70 ++++----- scripts/utility/md5check.py | 8 +- scripts/utility/movesymlinks.py | 24 +-- 24 files changed, 533 insertions(+), 477 deletions(-) diff --git a/.gitignore b/.gitignore index 1f03248a..ad9f65f9 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ environments/ !.gitattributes !.gitmodules !.pylintrc +!.pre-commit-config.yaml work genome_build diff --git a/pyproject.toml b/pyproject.toml index 1479e977..92f9c0fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,25 @@ requires-python = ">= 3.5" target-version = "py37" [tool.ruff.lint] -ignore = [] +select = [ + # Default ruff errors + "E4", + "E7", + "E9", + "F", + "B", # Bugs from flake8-bugbear + "W605", # Only warning that isn't fixed by formatting + "I", # Sorting of imports + "N", # misleading naming conventions + "EXE", # executable-related things + "G", # logging-related changes + 
"RUF200", # Check pyproject.toml for validity +] +ignore = [ + # We run on python 3.5, so disable suggestions to use f-strings + "UP031", + "UP032", +] #[build-system] diff --git a/scripts/aggregatecollate.py b/scripts/aggregatecollate.py index a15e5299..d34f1c08 100755 --- a/scripts/aggregatecollate.py +++ b/scripts/aggregatecollate.py @@ -140,7 +140,7 @@ def get_aggregation_info(self, aggregation_id): if not results: logging.error( - "Could not find information for aggregation %d" % aggregation_id + "Could not find information for aggregation %d", aggregation_id ) return None @@ -152,15 +152,14 @@ def get_aggregation_lanes(self, aggregation_id): ) if not results: - logging.error("Could not find lanes for aggregation %d" % aggregation_id) + logging.error("Could not find lanes for aggregation %d", aggregation_id) return [] return results def get_lane_fastq_file(self, aggregation_id, lane_id, file_purpose): logging.info( - "Fetching files for alignment %d (Aggregation %d)" - % (lane_id, aggregation_id) + "Fetching files for alignment %d (Aggregation %d)", lane_id, aggregation_id ) results = files.get_object_files( @@ -176,8 +175,10 @@ def get_lane_fastq_file(self, aggregation_id, lane_id, file_purpose): if len(results) != 1: logging.error( - "Found %d files for alignment %d, require 1 (Aggregation %d)" - % (len(results), lane_id, aggregation_id) + "Found %d files for alignment %d, require 1 (Aggregation %d)", + len(results), + lane_id, + aggregation_id, ) logging.error(results) return None @@ -189,14 +190,15 @@ def get_library_info(self, aggregation_info): library_info = self.api.single_result(url=aggregation_info["library"]) if not library_info: logging.critical( - "Cannot proceed without library! Could not get info from %s (Aggregation %d)" - % (aggregation_info["library"], aggregation_info["id"]) + "Cannot proceed without library! 
Could not get info from %s (Aggregation %d)", + aggregation_info["library"], + aggregation_info["id"], ) sys.exit(1) return library_info def get_script_template(self, script_template): - logging.info("Using script template %s" % script_template) + logging.info("Using script template %s", script_template) return open(script_template, "r").read() def get_example_flowcell(self, aggregation_id, aggregation_lanes): @@ -215,16 +217,18 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): if not lane: logging.critical( - "Was not able to fetch lane %s (Aggregation %d)" - % (aggregation_lane["lane"], aggregation_id) + "Was not able to fetch lane %s (Aggregation %d)", + aggregation_lane["lane"], + aggregation_id, ) sys.exit(1) flowcell = self.api.single_result(url=lane["flowcell"]) if not flowcell: logging.critical( - "Could not get flowcell at %d (Aggregation %d)" - % (lane["flowcell"], aggregation_id) + "Could not get flowcell at %d (Aggregation %d)", + lane["flowcell"], + aggregation_id, ) sys.exit(1) @@ -280,7 +284,7 @@ def setup_aggregation(self, aggregation_id): aggregation_folder = self.get_aggregation_directory(aggregation) # flowcell = self.get_example_flowcell(aggregation_id, aggregation_lanes) - logging.info("Aggregation %d folder: %s" % (aggregation_id, aggregation_folder)) + logging.info("Aggregation %d folder: %s", aggregation_id, aggregation_folder) logging.debug(aggregation) missing = False @@ -291,15 +295,16 @@ def setup_aggregation(self, aggregation_id): lane_id = int(aggregation_lane["lane"].strip("/").split("/")[-1]) if not aggregation_lane["include"]: logging.info( - "Not including lane %s (Aggregation %d)" % (lane_id, aggregation_id) + "Not including lane %s (Aggregation %d)", lane_id, aggregation_id ) continue alignment_endpoint = aggregation_lane["alignment"] if not alignment_endpoint: logging.info( - "Not including lane %s because no alignment set (Aggregation %d)" - % (lane_id, aggregation_id) + "Not including lane %s because no alignment set (Aggregation %d)", + lane_id, + aggregation_id, ) # alignment_id = int(alignment_endpoint.strip("/").split("/")[-1]) @@ -309,14 +314,12 @@ def setup_aggregation(self, aggregation_id): if not r1_fastq or not r2_fastq: logging.critical( - "Missing either R1: %s or R2: %s for alignment %s for lane %s, skipping (Aggregation %d)" - % ( - str(r1_fastq), - str(r2_fastq), - alignment_endpoint, - lane_id, - aggregation_id, - ) + "Missing either R1: %s or R2: %s for alignment %s for lane %s, skipping (Aggregation %d)", + str(r1_fastq), + str(r2_fastq), + alignment_endpoint, + lane_id, + aggregation_id, ) missing = True continue @@ -346,7 +349,7 @@ def setup_aggregation(self, aggregation_id): file_record.close() script_file = os.path.join(aggregation_folder, self.qsub_scriptname) - logging.info("Creating script file %s" % script_file) + logging.info("Creating script file %s", script_file) script = open(script_file, "w") script.write("export AGGREGATION_ID=%d\n" % aggregation_id) diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index 429dfed0..dbb28065 100755 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -174,7 +174,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -186,7 +186,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, 
url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = self.session.get(url) @@ -207,7 +207,7 @@ def get_aggregation_info(self, aggregation_id): if not results: logging.error( - "Could not find information for aggregation %d" % aggregation_id + "Could not find information for aggregation %d", aggregation_id ) return None @@ -232,8 +232,9 @@ def set_aggregation_folder(self, aggregation_info, library_info): if len(results) > 1: logging.error( - "Found %d folders for aggregation %d, require 1" - % (len(results), aggregation_info["id"]) + "Found %d folders for aggregation %d, require 1", + len(results), + aggregation_info["id"], ) if len(results) == 0: @@ -254,7 +255,7 @@ def set_aggregation_folder(self, aggregation_info, library_info): path = os.path.join(self.aggregation_base_directory, dir_name) - logging.info("Setting aggregation folder to %s" % path) + logging.info("Setting aggregation folder to %s", path) data = { "content_type": "%s/content_type/%d/" @@ -269,8 +270,8 @@ def set_aggregation_folder(self, aggregation_info, library_info): if not new_result.ok: logging.critical(new_result) logging.critical( - "Could not upload new aggregation folder path to LIMS: %s" - % json.dumps(data) + "Could not upload new aggregation folder path to LIMS: %s", + json.dumps(data), ) sys.exit(1) @@ -284,7 +285,7 @@ def get_aggregation_lanes(self, aggregation_id): ) if not results: - logging.error("Could not find lanes for aggregation %d" % aggregation_id) + logging.error("Could not find lanes for aggregation %d", aggregation_id) return [] return results @@ -297,8 +298,10 @@ def get_lane_alignments_file(self, aggregation_id, alignment_id): if len(results) != 1: logging.error( - "Found %d files for alignment %d, require 1 (Aggregation %d)" - % (len(results), alignment_id, aggregation_id) + "Found %d files for alignment %d, require 1 (Aggregation %d)", + len(results), + alignment_id, + aggregation_id, ) logging.error(results) return None @@ -314,8 +317,10 @@ def get_trimmed_fastq_r1(self, aggregation_id, alignment_id): if len(results) != 1: logging.error( - "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)" - % (len(results), alignment_id, aggregation_id) + "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)", + len(results), + alignment_id, + aggregation_id, ) logging.error(results) return None @@ -330,8 +335,10 @@ def get_trimmed_fastq_r2(self, aggregation_id, alignment_id): if len(results) != 1: logging.error( - "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)" - % (len(results), alignment_id, aggregation_id) + "Found %d trimmed FQ files for alignment %d, require 1 (Aggregation %d)", + len(results), + alignment_id, + aggregation_id, ) logging.error(results) return None @@ -342,8 +349,9 @@ def get_library_info(self, aggregation_info): library_info = self.api_single_result(url=aggregation_info["library"]) if not library_info: logging.critical( - "Cannot proceed without library! Could not get info from %s (Aggregation %d)" - % (aggregation_info["library"], aggregation_info["id"]) + "Cannot proceed without library! Could not get info from %s (Aggregation %d)", + aggregation_info["library"], + aggregation_info["id"], ) sys.exit(1) return library_info @@ -354,8 +362,9 @@ def get_sample_info(self, aggregation_info): ) if not sample_info: logging.critical( - "Cannot proceed without sample! 
Could not get info from %s (Aggregation %d)" - % (aggregation_info["sample"], aggregation_info["id"]) + "Cannot proceed without sample! Could not get info from %s (Aggregation %d)", + aggregation_info["sample"], + aggregation_info["id"], ) sys.exit(1) return sample_info @@ -364,7 +373,7 @@ def get_genome_index(self, aggregation_info): genome_info = self.api_single_result(url=aggregation_info["genome_index"]) if not genome_info: logging.critical( - "Could not get genome info! (Aggregation %d)" % aggregation_info["id"] + "Could not get genome info! (Aggregation %d)", aggregation_info["id"] ) sys.exit(1) return genome_info @@ -378,8 +387,7 @@ def get_genome_index_location(self, aggregation_id, aggregation_lanes): if "alignment" not in aggregation_lane or not aggregation_lane["alignment"]: logging.critical( - "No alignment set for included aggregation lane %s" - % str(aggregation_lane) + "No alignment set for included aggregation lane %s", aggregation_lane ) sys.exit(1) @@ -387,16 +395,18 @@ def get_genome_index_location(self, aggregation_id, aggregation_lanes): if not alignment: logging.critical( - "Was not able to fetch alignment %s! (Aggregation %d)" - % (aggregation_lane["alignment"], aggregation_id) + "Was not able to fetch alignment %s! (Aggregation %d)", + aggregation_lane["alignment"], + aggregation_id, ) sys.exit(1) genome_location = self.api_single_result(url=alignment["genome_index_location"]) if not genome_location: logging.critical( - "Could not get genome location from alignment %d! (Aggregation %d)" - % (included["id"], aggregation_id) + "Could not get genome location from alignment %d! (Aggregation %d)", + included["id"], + aggregation_id, ) sys.exit(1) @@ -410,22 +420,20 @@ def get_script_template( self, aggregation_id, process_template_url, script_template=None ): if script_template: - logging.info("Using script template %s" % script_template) + logging.info("Using script template %s", script_template) return (open(script_template, "r").read(), None) if not process_template_url: - logging.critical( - "No process template for aggregation %d\n" % aggregation_id - ) + logging.critical("No process template for aggregation %d\n", aggregation_id) return None - logging.info("Getting process template %s" % process_template_url) + logging.info("Getting process template %s", process_template_url) process_template = self.api_single_result(url=process_template_url) if not process_template: logging.critical( - "Could not find processing template for %s\n" % process_template_url + "Could not find processing template for %s\n", process_template_url ) return None @@ -450,16 +458,18 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): if not lane: logging.critical( - "Was not able to fetch lane %s (Aggregation %d)" - % (aggregation_lane["lane"], aggregation_id) + "Was not able to fetch lane %s (Aggregation %d)", + aggregation_lane["lane"], + aggregation_id, ) sys.exit(1) flowcell = self.api_single_result(url=lane["flowcell"]) if not flowcell: logging.critical( - "Could not get flowcell at %s (Aggregation %d)" - % (lane["flowcell"], aggregation_id) + "Could not get flowcell at %s (Aggregation %d)", + lane["flowcell"], + aggregation_id, ) sys.exit(1) @@ -472,15 +482,17 @@ def get_all_flowcell_paired(self, aggregation_id, aggregation_lanes): lane = self.api_single_result(url=aggregation_lane["lane"]) if not lane: logging.critical( - "Was not able to fetch lane %s (Aggregation %d)" - % (aggregation_lane["lane"], aggregation_id) + "Was not able to fetch lane %s (Aggregation %d)", 
+ aggregation_lane["lane"], + aggregation_id, ) sys.exit(1) flowcell = self.api_single_result(url=lane["flowcell"]) if not flowcell: logging.critical( - "Could not get flowcell at %s (Aggregation %d)" - % (lane["flowcell"], aggregation_id) + "Could not get flowcell at %s (Aggregation %d)", + lane["flowcell"], + aggregation_id, ) sys.exit(1) if not flowcell["paired_end"]: @@ -491,7 +503,7 @@ def get_category_for_assay(self, assay_url): assay_info = self.api_single_result(url=assay_url) category_url = assay_info["category"] if category_url is None: - logging.warn("Assay %s has no category" % (assay_info)) + logging.warning("Assay %s has no category", assay_info) return None category_info = self.api_single_result(url=category_url) return category_info["slug"] @@ -528,19 +540,19 @@ def setup_tag(self, tag_slug): ) def setup_project(self, project_id): - logging.info("Setting up project #%s" % project_id) + logging.info("Setting up project #%s", project_id) aggregations = self.api_list_result( "aggregation/?library__sample__project=%s" % project_id ) self.setup_aggregations([a["id"] for a in aggregations]) def setup_flowcell(self, flowcell_label): - logging.info("Setting up flowcell %s" % flowcell_label) + logging.info("Setting up flowcell %s", flowcell_label) aggregations = self.api_list_result( "aggregation/?in_flowcell=%s" % flowcell_label ) if not aggregations: - logging.error("%s has no aggregations" % flowcell_label) + logging.error("%s has no aggregations", flowcell_label) self.setup_aggregations([a["id"] for a in aggregations]) def setup_aggregations(self, aggregation_ids): @@ -554,7 +566,7 @@ def try_setup(agg_id): try: self.setup_aggregation(agg_id) except Exception: - logging.exception("Something went wrong for AG%d" % agg_id) + logging.exception("Something went wrong for AG%d", agg_id) list(self.pool.map(try_setup, aggregation_ids)) @@ -565,7 +577,7 @@ def setup_aggregation(self, aggregation_id): return False if aggregation["locked"]: - logging.warn("Refusing to set up locked aggregation %d" % (aggregation_id)) + logging.warning("Refusing to set up locked aggregation %d", aggregation_id) return False aggregation_lanes = self.get_aggregation_lanes(aggregation_id) @@ -585,7 +597,7 @@ def setup_aggregation(self, aggregation_id): assay_category = self.get_category_for_assay(sample_info["assay"]) - logging.info("Aggregation %d folder: %s" % (aggregation_id, aggregation_folder)) + logging.info("Aggregation %d folder: %s", aggregation_id, aggregation_folder) logging.debug(aggregation) missing = False @@ -594,16 +606,18 @@ def setup_aggregation(self, aggregation_id): for aggregation_lane in aggregation_lanes: if not aggregation_lane["include"]: logging.info( - "Not including lane %s (Aggregation %d)" - % (aggregation_lane["lane"], aggregation_id) + "Not including lane %s (Aggregation %d)", + aggregation_lane["lane"], + aggregation_id, ) continue alignment_endpoint = aggregation_lane["alignment"] if not alignment_endpoint: logging.info( - "Not including lane %s because no alignment set (Aggregation %d)" - % (aggregation_lane["lane"], aggregation_id) + "Not including lane %s because no alignment set (Aggregation %d)", + aggregation_lane["lane"], + aggregation_id, ) missing = True continue @@ -614,8 +628,10 @@ def setup_aggregation(self, aggregation_id): if not bamfile: logging.critical( - "No BAM alignment file for alignment %s for lane %s, skipping (Aggregation %d)" - % (alignment_endpoint, aggregation_lane["lane"], aggregation_id) + "No BAM alignment file for alignment %s for lane %s, 
skipping (Aggregation %d)", + alignment_endpoint, + aggregation_lane["lane"], + aggregation_id, ) missing = True continue @@ -672,23 +688,21 @@ def setup_aggregation(self, aggregation_id): env_vars[var] = value except ValueError: logging.error( - "Could not parse process variables for aggregation %d (template %d): '%s'" - % ( - aggregation_id, - self.script_template["id"], - self.script_template["process_variables"], - ) + "Could not parse process variables for aggregation %d (template %d): '%s'", + aggregation_id, + self.script_template["id"], + self.script_template["process_variables"], ) return False logging.debug( - "Environment Variables:\n%s" - % "\n".join(["\t%s=%s" % (e, env_vars[e]) for e in env_vars]) + "Environment Variables:\n%s", + "\n".join(["\t%s=%s" % (e, env_vars[e]) for e in env_vars]), ) script_file = os.path.join(aggregation_folder, self.qsub_scriptname) if self.dry_run: - logging.info("Dry run, would have created: %s" % script_file) + logging.info("Dry run, would have created: %s", script_file) return True try: @@ -700,7 +714,7 @@ def setup_aggregation(self, aggregation_id): file_record.write("\n".join(["\t".join(bamfile) for bamfile in files])) file_record.close() - logging.info("Creating script file %s" % script_file) + logging.info("Creating script file %s", script_file) script = open(script_file, "w") diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index f0611535..4083bbcd 100755 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -192,7 +192,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -204,7 +204,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = self.session.get(url) @@ -227,7 +227,7 @@ def get_align_process_info(self, alignment_id): if not process_info: logging.critical( - "Could not find processing info for alignment %d\n" % alignment_id + "Could not find processing info for alignment %d\n", alignment_id ) logging.critical(process_info) sys.exit(1) @@ -236,14 +236,14 @@ def get_align_process_info(self, alignment_id): def get_process_template(self, align_id, process_template_id): if not process_template_id: - logging.critical("No process template for alignment %d\n" % align_id) + logging.critical("No process template for alignment %d\n", align_id) return None info = self.api_single_result("process_template/%d/" % (process_template_id)) if not info: logging.critical( - "Could not find processing template for ID %d\n" % process_template_id + "Could not find processing template for ID %d\n", process_template_id ) sys.exit(1) @@ -251,11 +251,11 @@ def get_process_template(self, align_id, process_template_id): # Run alignment setup in parallel def setup_alignments(self, align_ids): - for id, error in self.pool.map(self.setup_alignment, align_ids): + for align_id, error in self.pool.map(self.setup_alignment, align_ids): if error: - logging.debug("ALN%d result received, error: %s" % (id, error)) + logging.warning("ALN%d result received, error: %s", align_id, error) else: - logging.debug("ALN%d result received, OK" % id) + logging.debug("ALN%d result received, OK", align_id) def setup_alignment(self, align_id): try: @@ -268,10 +268,10 @@ 
def setup_alignment(self, align_id): self.create_script(processing_info, alignment["id"]) return (align_id, None) else: - logging.info("Skipping completed alignment %d" % align_id) + logging.info("Skipping completed alignment %d", align_id) return (align_id, None) except Exception as e: - logging.exception("Could not set up alignment %d}: (%s)" % (align_id, e)) + logging.exception("Could not set up alignment %d}: (%s)", align_id, e) return (align_id, e) def get_lane_file(self, lane_id, purpose): @@ -294,14 +294,14 @@ def setup_tag(self, tag_slug): self.setup_alignments([align_tag["object_id"] for align_tag in align_tags]) def setup_project(self, project_id): - logging.info("Setting up project #%s" % project_id) + logging.info("Setting up project #%s", project_id) alignments = self.api_list_result( "flowcell_lane_alignment/?lane__sample__project=%s" % project_id ) self.setup_alignments([alignment["id"] for alignment in alignments]) def setup_flowcell(self, flowcell_label): - logging.info("Setting up flowcell for %s" % flowcell_label) + logging.info("Setting up flowcell for %s", flowcell_label) alignments = self.api_list_result( "flowcell_lane_alignment/?lane__flowcell__label=%s&page_size=1000" % flowcell_label @@ -320,7 +320,7 @@ def auto_aggregation_script(self, flowcell_label, alignments): logging.debug("Writing script to stdout") outfile = sys.stdout else: - logging.debug("Logging script to %s" % self.outfile) + logging.debug("Logging script to %s", self.outfile) outfile = open(self.outfile, "a") contents = textwrap.dedent( @@ -351,7 +351,7 @@ def add_script(self, align_id, processing_info, script_file, sample_name): logging.debug("Writing script to stdout") outfile = sys.stdout else: - logging.debug("Logging script to %s" % self.outfile) + logging.debug("Logging script to %s", self.outfile) outfile = open(self.outfile, "a") if self.simple_output: @@ -391,7 +391,7 @@ def create_script(self, processing_info, align_id): alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] if "process_template" not in alignment: - logging.error("Alignment %d has no process template" % align_id) + logging.error("Alignment %d has no process template", align_id) return False process_template = self.get_process_template( @@ -408,8 +408,9 @@ def create_script(self, processing_info, align_id): flowcell_directory = os.path.join(share_dir, "alignments") if not flowcell_directory: logging.error( - "Alignment %d has no flowcell directory for flowcell %s" - % (align_id, processing_info["flowcell"]["label"]) + "Alignment %d has no flowcell directory for flowcell %s", + align_id, + processing_info["flowcell"]["label"], ) return False @@ -443,8 +444,10 @@ def create_script(self, processing_info, align_id): if not r1_fastq: logging.error( - "Missing r1-fastq for lane %d (alignment %d) - check dir %s" - % (lane["id"], alignment["id"], fastq_directory) + "Missing r1-fastq for lane %d (alignment %d) - check dir %s", + lane["id"], + alignment["id"], + fastq_directory, ) return False @@ -452,15 +455,16 @@ def create_script(self, processing_info, align_id): r2_fastq = self.get_lane_file(lane["id"], "r2-fastq") if not r2_fastq: logging.error( - "Missing r2-fastq for lane %d (alignment %d)" - % (lane["id"], alignment["id"]) + "Missing r2-fastq for lane %d (alignment %d)", + lane["id"], + alignment["id"], ) return False script_file = os.path.join( script_directory, "%s-%s" % (alignment["sample_name"], self.qsub_scriptname) ) - logging.info("Will write to %s" % script_file) + logging.info("Will write to %s", 
script_file) # Set up & add environment variables env_vars = OrderedDict() @@ -505,9 +509,9 @@ def create_script(self, processing_info, align_id): p5_adapter = lane["barcode2"]["adapter5_reverse_complement"] if not p7_adapter or not p5_adapter: - logging.warn( - "Alignment %d missing adapters, some processes might not work" - % alignment["id"] + logging.warning( + "Alignment %d missing adapters, some processes might not work", + alignment["id"], ) env_vars["ADAPTER_P7"] = p7_adapter @@ -534,22 +538,20 @@ def create_script(self, processing_info, align_id): env_vars[var] = value except ValueError: logging.error( - "Could not parse process variables for align %d (template %d): '%s'" - % ( - alignment["id"], - process_template["id"], - process_template["process_variables"], - ) + "Could not parse process variables for align %d (template %d): '%s'", + alignment["id"], + process_template["id"], + process_template["process_variables"], ) return False if self.dry_run: - logging.info("Dry run, would have created: %s" % script_file) + logging.info("Dry run, would have created: %s", script_file) logging.debug(env_vars) return True if not os.path.exists(script_directory): - logging.info("Creating directory %s" % script_directory) + logging.info("Creating directory %s", script_directory) os.makedirs(script_directory) # Append to master script diff --git a/scripts/apilaneprocess.py b/scripts/apilaneprocess.py index 6b019513..626cd072 100755 --- a/scripts/apilaneprocess.py +++ b/scripts/apilaneprocess.py @@ -128,13 +128,13 @@ def get_lane_process_info(self, lane_id): if info: return info else: - logging.error("Could not find processing info for lane %d\n" % lane_id) + logging.error("Could not find processing info for lane %d\n", lane_id) sys.exit(1) def get_process_template(self, process_template_id): if not process_template_id: logging.critical( - "No process template for alignment %d\n" % self.alignment_id + "No process template for alignment %d\n", self.alignment_id ) sys.exit(1) @@ -146,7 +146,7 @@ def get_process_template(self, process_template_id): return info else: logging.error( - "Could not find processing template for ID %d\n" % process_template_id + "Could not find processing template for ID %d\n", process_template_id ) sys.exit(1) @@ -159,11 +159,11 @@ def setup_flowcell(self, flowcell_label): ) if not lanes: - logging.error("Flowcell %s has no lanes" % flowcell_label) + logging.error("Flowcell %s has no lanes", flowcell_label) return logging.debug( - "Setting up flowcell %s with %d lanes" % (flowcell_label, len(lanes)) + "Setting up flowcell %s with %d lanes", flowcell_label, len(lanes) ) self.setup_lanes([lane["id"] for lane in lanes]) @@ -181,23 +181,23 @@ def setup_tag(self, tag_slug): ) if not lane_tags: - logging.error("Tag %s has no lanes" % lane_tags) + logging.error("Tag %s has no lanes", lane_tags) - logging.debug("Setting up tag %s " % tag_slug) + logging.debug("Setting up tag %s ", tag_slug) self.setup_lanes([lane_tag["object_id"] for lane_tag in lane_tags]) def setup_lanes(self, lane_ids): - logging.debug("Setting up lane IDs %s" % str(lane_ids)) + logging.debug("Setting up lane IDs %s", lane_ids) if len(lane_ids) != len(set(lane_ids)): logging.warning( - "Duplicate lane IDs! %s " - % [ + "Duplicate lane IDs! 
%s ", + [ item for item, count in collections.Counter(lane_ids).items() if count > 1 - ] + ], ) # self.pool.map(self.setup_lane, lane_ids) @@ -205,7 +205,7 @@ def setup_lanes(self, lane_ids): self.setup_lane(lane_id) def setup_lane(self, lane_id): - logging.debug("Setting up lane %d" % lane_id) + logging.debug("Setting up lane %d", lane_id) processing_info = self.get_lane_process_info(lane_id) @@ -268,7 +268,7 @@ def add_script(self, script_file, lane_id, flowcell_label, sample_name): logging.debug("Writing script to stdout") outfile = sys.stdout else: - logging.debug("Logging script to %s" % self.outfile) + logging.debug("Logging script to %s", self.outfile) outfile = open(self.outfile, "a") outfile.write("cd %s && " % os.path.dirname(script_file)) @@ -291,7 +291,7 @@ def create_script(self, processing_info, pool=None): lane = processing_info["libraries"][0] if "directory" not in lane: - logging.critical("No directory for lane %d" % lane["id"]) + logging.critical("No directory for lane %d", lane["id"]) return False fastq_directory = lane["directory"] alt_dir = lane.get("project_share_directory", "") @@ -323,7 +323,7 @@ def create_script(self, processing_info, pool=None): lane["lane"], ) logging.warning( - "No alignment sample_name for lane, using %s instead" % spreadsheet_name + "No alignment sample_name for lane, using %s instead", spreadsheet_name ) if pool: @@ -334,7 +334,7 @@ def create_script(self, processing_info, pool=None): if not os.path.exists(fastq_directory): logging.critical( - "fastq directory %s does not exist, cannot continue" % fastq_directory + "fastq directory %s does not exist, cannot continue", fastq_directory ) return False @@ -343,13 +343,13 @@ def create_script(self, processing_info, pool=None): ) if self.dry_run: - logging.info("Dry run, would have created: %s" % script_file) + logging.info("Dry run, would have created: %s", script_file) return True try: outfile = open(script_file, "w") except FileNotFoundError: - logging.critical("Could not create script file %s" % script_file) + logging.critical("Could not create script file %s", script_file) return False self.add_script( diff --git a/scripts/browser/make_trackhubs_for_flowcell.py b/scripts/browser/make_trackhubs_for_flowcell.py index c44131c0..15ce9d94 100755 --- a/scripts/browser/make_trackhubs_for_flowcell.py +++ b/scripts/browser/make_trackhubs_for_flowcell.py @@ -31,9 +31,9 @@ def foldercheck(*args): if not os.path.isdir(folder): try: os.mkdir(folder) - util_log.info("Created folder: %s" % folder) + util_log.info("Created folder: %s", folder) except OSError: - util_log.error("ERROR: Could not create directory: %s" % folder) + util_log.error("ERROR: Could not create directory: %s", folder) util_log.warn( "Please make sure all nonexistant parent directories have been created." 
) @@ -150,7 +150,7 @@ def __init__( if len(self.projects) == 1 and project_dir: self.project_dirs[project] = project_dir - logging.info("Using project dir: %s" % self.project_dirs[project]) + logging.info("Using project dir: %s", self.project_dirs[project]) else: for project in self.projects: self.project_dirs[project] = os.path.join( @@ -186,7 +186,7 @@ def load(self): if not self.flowcell_date: self.flowcell_date = match.groups()[1] - logging.info("FLOWCELL DATE: %s" % self.flowcell_date) + logging.info("FLOWCELL DATE: %s", self.flowcell_date) self.main_label = "%s%son%s" % ( self.project, @@ -194,23 +194,22 @@ def load(self): self.flowcell_date, ) - logging.info("Main track name: %s" % self.main_label) + logging.info("Main track name: %s", self.main_label) self.excludes_file = os.path.join(self.outdir, "excludes.%s" % self.main_label) if self.flowcell_link_folder: logging.debug( - "link folder: " - + self.flowcell_link_folder - + " base folder: " - + self.basedir_name + "link folder: %s base_folder: %s", + self.flowcell_link_folder, + self.basedir_name, ) self.link_dir = os.path.join(self.flowcell_link_folder, self.basedir_name) else: self.link_dir = "" self.prepare_tracks() - logging.info("Main label: %s" % self.main_label) + logging.info("Main label: %s", self.main_label) # LIMS records early mm10 alignments as 'mm10-encode3-male' # change it back to just 'mm10' @@ -232,7 +231,7 @@ def load(self): # function for creating hub.txt def create_hubtxt(self): hubfile = os.path.join(self.outdir, "hub.txt") - logging.info("Creating hub.txt file: %s" % hubfile) + logging.info("Creating hub.txt file: %s", hubfile) hub = open(hubfile, "w") hub.write("hub %s\n" % self.flowcell_name) hub.write("shortLabel %s\n" % self.flowcell_name) @@ -245,7 +244,7 @@ def create_hubtxt(self): # function for creating genome.txt def create_genomestxt(self): genomefile = os.path.join(self.outdir, "genomes.txt") - logging.info("Creating genome.txt file: %s" % genomefile) + logging.info("Creating genome.txt file: %s", genomefile) genomes = open(genomefile, "w") for hgdb in self.subtrack_sets.keys(): genomes.write("\ngenome %s\n" % hgdb) @@ -260,9 +259,9 @@ def prepare_tracks(self): self.tracks = [] for lane in self.data: - logging.debug("preparing tracks for lane: " + str(lane)) + logging.debug("preparing tracks for lane: %s", lane) if "hgdb" not in lane: - logging.error("Not using lane %s: no hgdb value" % lane) + logging.error("Not using lane %s: no hgdb value", lane) continue if lane["Index"] == "": @@ -299,8 +298,8 @@ def prepare_tracks(self): "%sden%s" % (self.main_label, trackname_suffix) ) - logging.debug("tag track name: " + track["tagtrackname"]) - logging.debug("den track name: " + track["dentrackname"]) + logging.debug("tag track name: %s", track["tagtrackname"]) + logging.debug("den track name: %s", track["dentrackname"]) project = track["SampleProject"] @@ -372,26 +371,26 @@ def prepare_tracks(self): track["hasTags"] = True if not track["hasDensities"] or not track["hasTags"]: - logging.error("%s does not have all files" % track["SampleID"]) + logging.error("%s does not have all files", track["SampleID"]) if not track["hasDensities"]: logging.error("Missing densities") if self.bigwig: logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["bigwigfilename"]) + "Wanted: %s", + os.path.join(track["sampleDir"], track["bigwigfilename"]), ) else: logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["wigfilename"]) + "Wanted: %s", + os.path.join(track["sampleDir"], 
track["wigfilename"]), ) if not track["hasTags"]: logging.error("Missing tags") logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["bamfilename"]) + "Wanted: %s", + os.path.join(track["sampleDir"], track["bamfilename"]), ) - logging.info("%s" % str(track)) + logging.info("%s", str(track)) if track["hasDensities"] or track["hasTags"]: self.subtrack_sets[hgdb].append(track) @@ -450,7 +449,7 @@ def create_ras(self): # write RA / track file def create_ra(self, hgdb): - logging.info("CREATING RA FOR %s" % hgdb) + logging.info("CREATING RA FOR %s", hgdb) subtracks = self.subtrack_sets[hgdb] foldercheck(os.path.join(self.outdir, hgdb)) @@ -502,15 +501,15 @@ def create_ra(self, hgdb): for subtrack in subtracks: if "wellmapping-no-mito" not in subtrack: - logging.warn( - "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] + logging.warning( + "%s has no wellmapping-no-mito count", subtrack["dentrackname"] ) subtrack["wellmapping-no-mito"] = "N/A" if "wellmapping" not in subtrack: - logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) + logging.warning("%s has no wellmapping count", subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" if "SPOT" not in subtrack: - logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) + logging.warning("%s has no SPOT score", subtrack["dentrackname"]) subtrack["SPOT"] = "N/A" for subtrack in subtracks: @@ -665,8 +664,8 @@ def get_counts_for_alignment(self, alignment): # Check to see if we got all the types we wanted for count in self.count_types: if count not in counts: - logging.warn( - "Could not fetch count %s for alignment: %s" % (count, alignment) + logging.warning( + "Could not fetch count %s for alignment: %s", (count, alignment) ) return counts @@ -674,7 +673,7 @@ def get_counts_for_alignment(self, alignment): def get_rna_metrics_for_alignment(self, alignment): results = self.get("rna_alignment_metrics/?alignment=%s" % alignment) if not results["results"]: - logging.warn("Could not fetch RNA metrics for alignment: %s" % alignment) + logging.warning("Could not fetch RNA metrics for alignment: %s", alignment) return None return results["results"][0] @@ -692,7 +691,7 @@ def get_spot_for_alignment(self, alignment): def get_alignment_data(library, alignment, lims): # This is mainly a shim. 
- logging.debug("Fetching data for library: %s" % library) + logging.debug("Fetching data for library: %s", library) d = dict() d["project"] = library["project"] @@ -713,7 +712,7 @@ def get_alignment_data(library, alignment, lims): d["failed_lane"] = lims_lane["failed"] if d["failed_lane"]: - logging.warn("Lane marked as failed, not using: %s" % library["id"]) + logging.warning("Lane marked as failed, not using: %s", library["id"]) return d if d["aligner"] == "bwa": @@ -800,7 +799,7 @@ def main(args=sys.argv): for project in load_groups.keys(): lane_group = load_groups[project] - logging.info("the basedirectory is: %s" % basedir) + logging.info("the basedirectory is: %s", basedir) outdir = os.path.join(basedir, "browser-load-%s" % project) loader = MakeBrowserload( lane_group, diff --git a/scripts/browser/make_trackhubs_for_projects.py b/scripts/browser/make_trackhubs_for_projects.py index b5dae3f6..477c8be5 100755 --- a/scripts/browser/make_trackhubs_for_projects.py +++ b/scripts/browser/make_trackhubs_for_projects.py @@ -162,9 +162,9 @@ def load_config(self, trackhubconfig): def load(self): # set up folder - logging.info("Checking for trackhub folder: %s" % self.aggregation_link_folder) + logging.info("Checking for trackhub folder: %s", self.aggregation_link_folder) foldercheck(self.aggregation_link_folder) - logging.info("Checking for project trackhub folder: %s" % self.outdir) + logging.info("Checking for project trackhub folder: %s", self.outdir) foldercheck(self.outdir) # prepare tracks for writing @@ -178,7 +178,7 @@ def load(self): def create_hubtxt(self): hubfile = os.path.join(self.outdir, "hub.txt") - logging.info("Creating hub.txt file: %s" % hubfile) + logging.info("Creating hub.txt file: %s", hubfile) hub = open(hubfile, "w") hub.write("hub %s\n" % self.projectname) hub.write("shortLabel %s\n" % self.projectname) @@ -189,7 +189,7 @@ def create_hubtxt(self): def create_genometxt(self): genomefile = os.path.join(self.outdir, "genomes.txt") - logging.info("Creating genome.txt file: %s" % genomefile) + logging.info("Creating genome.txt file: %s", genomefile) genomes = open(genomefile, "w") for key in self.all_tracks: genomes.write("\ngenome %s\n" % key) @@ -207,7 +207,7 @@ def prepare_tracks(self): continue tracks = {} - logging.debug("Preparing tracks for AGG: %s" % agg["id"]) + logging.debug("Preparing tracks for AGG: %s", agg["id"]) tracks["agg_id"] = agg["id"] tracks["agg_ln"] = agg["library_name"] tracks["agg_taxonomy"] = agg["taxonomy_name"] @@ -249,7 +249,7 @@ def prepare_tracks(self): tracks["dnase_align"] = agg["files"]["all-alignments-bam"] tracks["dnase_cutconts"] = agg["files"]["cutcounts-bw"] else: - logging.info("Unable to locate AGG files for: %s" % (agg["id"])) + logging.info("Unable to locate AGG files for: %s", (agg["id"])) # rna (processes are seperate for each genome) elif ( agg["aggregation_process_template_id"] == 30 @@ -266,19 +266,20 @@ def prepare_tracks(self): tracks["rna_poscov"] = agg["files"]["pos-coverage-bigwig"] tracks["rna_negcov"] = agg["files"]["neg-coverage-bigwig"] else: - logging.info("Unable to locate AGG files for: %s" % (agg["id"])) + logging.info("Unable to locate AGG files for: %s", (agg["id"])) # coverage across both strands still new, seperate from the rest for now if "all-coverage-bigwig" in agg["files"]: tracks["rna_bothcov"] = agg["files"]["all-coverage-bigwig"] else: logging.info( - "Unable to locate combined stranded AGG files for: %s" - % (agg["id"]) + "Unable to locate combined stranded AGG files for: %s", + (agg["id"]), ) else: 
logging.info( - "Unknown template type, %s, for %s" - % (agg["aggregation_process_template_id"], agg["id"]) + "Unknown template type, %s, for %s", + agg["aggregation_process_template_id"], + agg["id"], ) if tracks["agg_genome"] not in self.all_tracks: self.all_tracks[tracks["agg_genome"]] = [] @@ -299,7 +300,7 @@ def create_ras(self): self.create_ra(key) def create_ra(self, genome): - logging.info("Creating RA file for genome, %s" % genome) + logging.info("Creating RA file for genome, %s", genome) subtracks = self.all_tracks[genome] diff --git a/scripts/browser/old_native_fc_loading/make_browser_load.py b/scripts/browser/old_native_fc_loading/make_browser_load.py index 5a1574ad..ff6202da 100755 --- a/scripts/browser/old_native_fc_loading/make_browser_load.py +++ b/scripts/browser/old_native_fc_loading/make_browser_load.py @@ -144,7 +144,7 @@ def __init__( if len(self.projects) == 1 and project_dir: self.project_dirs[project] = project_dir - logging.info("Using project dir: %s" % self.project_dirs[project]) + logging.info("Using project dir: %s", self.project_dirs[project]) else: for project in self.projects: self.project_dirs[project] = os.path.join( @@ -191,7 +191,7 @@ def load(self): if not self.flowcell_date: self.flowcell_date = match.groups()[1] - logging.info("FLOWCELL DATE: %s" % self.flowcell_date) + logging.info("FLOWCELL DATE: %s", self.flowcell_date) self.main_label = "%s%son%s" % ( self.file_label, @@ -199,23 +199,22 @@ def load(self): self.flowcell_date, ) - logging.info("Main track name: %s" % self.main_label) + logging.info("Main track name: %s", self.main_label) self.excludes_file = os.path.join(self.outdir, "excludes.%s" % self.main_label) if self.flowcell_link_folder: logging.debug( - "link folder: " - + self.flowcell_link_folder - + " base folder: " - + self.basedir_name + "link folder: %s base_folder: %s", + self.flowcell_link_folder, + self.basedir_name, ) self.link_dir = os.path.join(self.flowcell_link_folder, self.basedir_name) else: self.link_dir = "" self.prepare_tracks() - logging.info("Main label: %s" % self.main_label) + logging.info("Main label: %s", self.main_label) self.create_ras() self.create_htmls() @@ -229,9 +228,9 @@ def prepare_tracks(self): self.tracks = [] for lane in self.data: - logging.debug("preparing tracks for lane: " + str(lane)) + logging.debug("preparing tracks for lane: %s", str(lane)) if "hgdb" not in lane: - logging.error("Not using lane %s: no hgdb value" % lane) + logging.error("Not using lane %s: no hgdb value", lane) continue if lane["Index"] == "": @@ -268,8 +267,8 @@ def prepare_tracks(self): "%sden%s" % (self.main_label, trackname_suffix) ) - logging.debug("tag track name: " + track["tagtrackname"]) - logging.debug("den track name: " + track["dentrackname"]) + logging.debug("tag track name: %s", track["tagtrackname"]) + logging.debug("den track name: %s", track["dentrackname"]) project = track["SampleProject"] @@ -341,26 +340,26 @@ def prepare_tracks(self): track["hasTags"] = True if not track["hasDensities"] or not track["hasTags"]: - logging.error("%s does not have all files" % track["SampleID"]) + logging.error("%s does not have all files", track["SampleID"]) if not track["hasDensities"]: logging.error("Missing densities") if self.bigwig: logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["bigwigfilename"]) + "Wanted: %s", + os.path.join(track["sampleDir"], track["bigwigfilename"]), ) else: logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["wigfilename"]) + "Wanted: %s", + 
os.path.join(track["sampleDir"], track["wigfilename"]), ) if not track["hasTags"]: logging.error("Missing tags") logging.error( - "Wanted: " - + os.path.join(track["sampleDir"], track["bamfilename"]) + "Wanted: %s", + os.path.join(track["sampleDir"], track["bamfilename"]), ) - logging.info("%s" % str(track)) + logging.info("%s", str(track)) if track["hasDensities"] or track["hasTags"]: self.subtrack_sets[hgdb].append(track) @@ -421,7 +420,7 @@ def create_ras(self): def create_commands(self): makefile = os.path.join(self.outdir, "make.%s.doc" % self.main_label) - logging.info("Makefile: %s" % makefile) + logging.info("Makefile: %s", makefile) commands = open(makefile, "w") commands.write("# %s\n" % makefile) @@ -510,7 +509,7 @@ def create_subtrack_commands(self, subtrack, commandsout): def create_genome_commands(self, hgdb, commandsout): if hgdb not in self.genome_organisms: - logging.error(hgdb + " not in " + str(self.genome_organisms)) + logging.error("%s not in %s", hgdb, self.genome_organisms) commandsout.write("\n ERROR: no " + hgdb + " genome\n") return @@ -554,14 +553,14 @@ def create_excludes(self): for subtrack in self.tracks: for suffix in ["frm", "MYD", "MYI"]: - logging.debug("subtrack contents: " + str(subtrack)) + logging.debug("subtrack contents: %s", str(subtrack)) excludes.write("%s.%s\n" % (subtrack["tagtrackname"], suffix)) excludes.write("%s.%s\n" % (subtrack["dentrackname"], suffix)) excludes.close() def create_ra(self, hgdb): - logging.info("CREATING RA FOR %s" % hgdb) + logging.info("CREATING RA FOR %s", hgdb) subtracks = self.subtrack_sets[hgdb] foldercheck(os.path.join(self.outdir, hgdb)) @@ -613,15 +612,15 @@ def create_ra(self, hgdb): for subtrack in subtracks: if "wellmapping-no-mito" not in subtrack: - logging.warn( - "%s has no wellmapping-no-mito count" % subtrack["dentrackname"] + logging.warning( + "%s has no wellmapping-no-mito count", subtrack["dentrackname"] ) subtrack["wellmapping-no-mito"] = "N/A" if "wellmapping" not in subtrack: - logging.warn("%s has no wellmapping count" % subtrack["dentrackname"]) + logging.warning("%s has no wellmapping count", subtrack["dentrackname"]) subtrack["wellmapping"] = "N/A" if "SPOT" not in subtrack: - logging.warn("%s has no SPOT score" % subtrack["dentrackname"]) + logging.warning("%s has no SPOT score", subtrack["dentrackname"]) subtrack["SPOT"] = "N/A" # track STAM_FC630D3_110711_IT_TAG_L5_DS18900_36_ @@ -779,8 +778,8 @@ def get_counts_for_alignment(self, alignment): # Check to see if we got all the types we wanted for count in self.count_types: if count not in counts: - logging.warn( - "Could not fetch count %s for alignment: %s" % (count, alignment) + logging.warning( + "Could not fetch count %s for alignment: %s", (count, alignment) ) return counts @@ -788,7 +787,7 @@ def get_counts_for_alignment(self, alignment): def get_rna_metrics_for_alignment(self, alignment): results = self.get("rna_alignment_metrics/?alignment=%s" % alignment) if not results["results"]: - logging.warn("Could not fetch RNA metrics for alignment: %s" % alignment) + logging.warning("Could not fetch RNA metrics for alignment: %s", alignment) return None return results["results"][0] @@ -806,7 +805,7 @@ def get_spot_for_alignment(self, alignment): def get_alignment_data(library, alignment, lims): # This is mainly a shim. 
- logging.debug("Fetching data for library: %s" % library) + logging.debug("Fetching data for library: %s", library) d = dict() d["project"] = library["project"] @@ -828,7 +827,7 @@ def get_alignment_data(library, alignment, lims): d["failed_lane"] = lims_lane["failed"] if d["failed_lane"]: - logging.warn("Lane marked as failed, not using: %s" % library["id"]) + logging.warning("Lane marked as failed, not using: %s", library["id"]) return d if d["aligner"] == "bwa": @@ -920,12 +919,14 @@ def main(args=sys.argv): if not os.path.isfile(browserconfig): logging.error( - "No configuration file '%s' exists, don't know how to load project %s into browser %s" - % (browserconfig, project, browser) + "No configuration file '%s' exists, don't know how to load project %s into browser %s", + browserconfig, + project, + browser, ) sys.exit(1) - logging.info("Reading browser configuration from %s" % browserconfig) + logging.info("Reading browser configuration from %s", browserconfig) outdir = os.path.join(basedir, "browser-load-%s-%s" % (project, browser)) diff --git a/scripts/cluster/monitor_alignments.py b/scripts/cluster/monitor_alignments.py index 78f6c9a3..18a6ea4b 100755 --- a/scripts/cluster/monitor_alignments.py +++ b/scripts/cluster/monitor_alignments.py @@ -92,7 +92,7 @@ def parse_jobnames(self): if match: alignments.add(int(match.group(1))) - log.info("Alignment IDs: %s" % alignments) + log.info("Alignment IDs: %s", alignments) return alignments def get_host_info(self): @@ -111,7 +111,7 @@ def parse_jobnames(self): match = ALIGN_REGEX.search(jobname) if match: alignments.add(int(match.group(1))) - log.info("Alignment IDs: %s" % alignments) + log.info("Alignment IDs: %s", alignments) return alignments def get_host_info(self): @@ -145,7 +145,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = requests.get(url, headers=self.headers) @@ -168,7 +168,7 @@ def run(self): # What alignments are currently running but not marked yet? mark_alignments = currently_running - marked_running if mark_alignments: - log.info("Marking alignments as processing: %s" % str(mark_alignments)) + log.info("Marking alignments as processing: %s", (mark_alignments)) [ self.update_processing_status(align_id, True) for align_id in mark_alignments @@ -177,7 +177,7 @@ def run(self): # What alignments are currently marked but not running? 
        finished_alignments = marked_running - currently_running
        if finished_alignments:
-            log.info("Alignments no longer processing: %s" % str(finished_alignments))
+            log.info("Alignments no longer processing: %s", finished_alignments)
            [
                self.update_processing_status(align_id, False)
                for align_id in finished_alignments
@@ -194,7 +194,7 @@ def update_processing_status(self, align_id, processing=True):
        if not update_result.ok:
            log.critical(
-                "Could not update alignment %d: %s" % (align_id, str(update_result))
+                "Could not update alignment %d: %s", align_id, str(update_result)
            )
            return False
@@ -213,7 +213,7 @@ def lims_currently_processing(self):
        for result in fetch_results:
            lims_process_align_ids.add(result["id"])
        log.info(
-            "Currently marked as processing on LIMS: %s" % str(lims_process_align_ids)
+            "Currently marked as processing on LIMS: %s", str(lims_process_align_ids)
        )
        return lims_process_align_ids
@@ -225,7 +225,7 @@ def update_host_info(self):
            url = "%s/key_value/?key=%s" % (self.api_url, key)
            key_value = self.get_single_result(url)
            if not key_value:
-                log.error("Cannot find '%s' key value" % key)
+                log.error("Cannot find '%s' key value", key)
                return
            update = requests.patch(
@@ -235,7 +235,7 @@ def update_host_info(self):
            if update.ok:
                log.info(update.json())
            else:
-                log.error("Could not update %s usage." % host)
+                log.error("Could not update %s usage.", host)
                log.error(update.text)
    def get_single_result(self, fetch_url, field=None):
@@ -248,19 +248,17 @@ def get_single_result(self, fetch_url, field=None):
        if fetch_results.ok:
            results = fetch_results.json()
            if results["count"] > 1:
-                log.error("More than one matching item for fetch query: %s" % fetch_url)
+                log.error("More than one matching item for fetch query: %s", fetch_url)
            elif results["count"] == 0:
-                log.debug("No matching items for fetch query: %s" % fetch_url)
+                log.debug("No matching items for fetch query: %s", fetch_url)
            else:
                result = results["results"][0]
-                log.debug(
-                    "Single result fetched from %s: %s" % (fetch_url, str(result))
-                )
+                log.debug("Single result fetched from %s: %s", fetch_url, result)
                if field:
                    return result[field]
                return result
        else:
-            log.error("Could not execute api query: %s" % fetch_url)
+            log.error("Could not execute api query: %s", fetch_url)
        return None
@@ -285,10 +283,10 @@ def main(args=sys.argv):
    if not poptions.base_api_url and "LIMS_API_URL" in os.environ:
        api_url = os.environ["LIMS_API_URL"]
-        log.debug("Using LIMS API endpoint: %s from environment" % api_url)
+        log.debug("Using LIMS API endpoint: %s from environment", api_url)
    elif poptions.base_api_url:
        api_url = poptions.base_api_url
-        log.debug("Using LIMS API endpoint: %s from options" % api_url)
+        log.debug("Using LIMS API endpoint: %s from options", api_url)
    else:
        sys.stderr.write("Could not find LIMS API URL.\n")
        sys.exit(1)
diff --git a/scripts/copy_notify.py b/scripts/copy_notify.py
index 4e9c32e5..38303fa5 100755
--- a/scripts/copy_notify.py
+++ b/scripts/copy_notify.py
@@ -107,10 +107,10 @@ def get_folder_reads(sequencer_folder):
        runinfodoc = minidom.parse(runinfo_file)
        return len(runinfodoc.getElementsByTagName("Read"))
    except IOError:
-        logging.info("Could not read %s" % runinfo_file)
+        logging.info("Could not read %s", runinfo_file)
        return None
    except xml.parsers.expat.ExpatError:
-        logging.info("%s is malformatted" % runinfo_file)
+        logging.info("%s is malformatted", runinfo_file)
        return None
@@ -134,27 +134,28 @@ def load_folders():
        if flowcell_reads[sequencer_folder]:
            logging.info(
-                "Initial state of %s: %s"
-                %
(sequencer_folder, str(check_folders[sequencer_folder])) + "Initial state of %s: %s", + sequencer_folder, + check_folders[sequencer_folder], ) else: - logging.info("Initial state of %s: does not have reads" % sequencer_folder) + logging.info("Initial state of %s: does not have reads", sequencer_folder) def check_folder(sequencer_folder): """Check a sequencer folder and notify for changes""" - logging.debug("Checking folder: %s" % sequencer_folder) + logging.debug("Checking folder: %s", sequencer_folder) if sequencer_folder in check_folders: if not check_folders[sequencer_folder] and check_copy(sequencer_folder): - logging.info("Folder finished copying: %s" % sequencer_folder) + logging.info("Folder finished copying: %s", sequencer_folder) notify_copy(sequencer_folder) check_folders[sequencer_folder] = True else: - logging.info("New folder: %s" % sequencer_folder) + logging.info("New folder: %s", sequencer_folder) check_folders[sequencer_folder] = False flowcell_reads[sequencer_folder] = get_folder_reads(sequencer_folder) - logging.debug("Number of reads: %s" % str(flowcell_reads[sequencer_folder])) + logging.debug("Number of reads: %s", (flowcell_reads[sequencer_folder])) notify_new(sequencer_folder) @@ -171,7 +172,7 @@ def run_check(): # delete folders that don't exist anymore from checking for sequencer_folder in folders: if not os.path.exists(sequencer_folder): - logging.info("Deleting folder: %s" % sequencer_folder) + logging.info("Deleting folder: %s", sequencer_folder) del check_folders[sequencer_folder] # check each folder for being new or being copied @@ -237,6 +238,6 @@ def send_email(subject, body, emails): while True: # because somebody running the script already has checked the state of things beforehand, # wait period goes first - logging.info("Waiting %d seconds before next check" % wait) + logging.info("Waiting %d seconds before next check", wait) time.sleep(wait) run_check() diff --git a/scripts/create_processing.py b/scripts/create_processing.py index a76d2e67..bfa56910 100755 --- a/scripts/create_processing.py +++ b/scripts/create_processing.py @@ -205,27 +205,25 @@ def include_lane(self, lane): if self.ignore_failed_lanes and lane["failed"]: logging.debug( - "Skipping %s, failed and we are ignoring failed lanes" - % lane["samplesheet_name"] + "Skipping %s, failed and we are ignoring failed lanes", + lane["samplesheet_name"], ) return False if self.project_filter and lane["project"] not in self.project_filter: logging.debug( - "Skipping %s, not in project filter" % lane["samplesheet_name"] + "Skipping %s, not in project filter", lane["samplesheet_name"] ) return False if self.library_filter and lane["library"] not in self.library_filter: logging.debug( - "Skipping %s, not in library filter" % lane["samplesheet_name"] + "Skipping %s, not in library filter", lane["samplesheet_name"] ) return False if self.sample_filter and lane["sample"] not in self.sample_filter: - logging.debug( - "Skipping %s, not in sample filter" % lane["samplesheet_name"] - ) + logging.debug("Skipping %s, not in sample filter", lane["samplesheet_name"]) return False if ( @@ -234,7 +232,7 @@ def include_lane(self, lane): and lane["alignments"][0]["id"] not in self.alignment_filter ): logging.debug( - "Skipping %s, not in alignment filter" % lane["samplesheet_name"] + "Skipping %s, not in alignment filter", lane["samplesheet_name"] ) return False @@ -289,11 +287,11 @@ def get_script_template(self, lane): alignment = lane["alignments"][0] if not alignment["aligner"]: - logging.info("# FastQC only %s" % 
lane["sample"]) + logging.info("# FastQC only %s", lane["sample"]) base_script = "fastqc" else: base_script = alignment["aligner"] - logging.info("# Aligning %s with %s" % (lane["sample"], base_script)) + logging.info("# Aligning %s with %s", lane["sample"], base_script) if base_script not in script_contents: script_contents[base_script] = open(script_files[base_script], "r").read() @@ -307,7 +305,7 @@ def create_flowcell_script(self, inscript): ) if not os.path.exists(script_directory): - logging.info("Creating directory %s" % script_directory) + logging.info("Creating directory %s", script_directory) os.makedirs(script_directory) script_file = os.path.join(script_directory, os.path.basename(inscript)) @@ -367,7 +365,7 @@ def create_script(self, lane): script_directory = "%s/%s" % (fastq_directory, align_dir) if not os.path.exists(script_directory): - logging.info("Creating directory %s" % script_directory) + logging.info("Creating directory %s", script_directory) os.makedirs(script_directory) script_file = os.path.join( diff --git a/scripts/flowcells/barcode_masks.py b/scripts/flowcells/barcode_masks.py index 2dbaddeb..6076e598 100755 --- a/scripts/flowcells/barcode_masks.py +++ b/scripts/flowcells/barcode_masks.py @@ -108,9 +108,7 @@ def detect_collisions(json_data): for x, barcode in enumerate(barcodes) if barcode == barcodes[x - 1] ] - logging.error( - "Collision on lane {}. Barcode(s): {}\n".format(i + 1, collision) - ) + logging.error("Collision on lane %d. Barcode(s): %s\n", i + 1, collision) sys.exit(1) return True return False diff --git a/scripts/flowcells/demux_fastq.py b/scripts/flowcells/demux_fastq.py index 452a8062..e020e219 100755 --- a/scripts/flowcells/demux_fastq.py +++ b/scripts/flowcells/demux_fastq.py @@ -148,7 +148,7 @@ def parse_processing_file( elif run_type.startswith("Novaseq 6000"): lane_libraries = [lib for lib in data["libraries"] if lib["lane"] == lane] else: - logging.warn("Run type %s not supported; using all libraries" % run_type) + logging.warning("Run type %s not supported; using all libraries", run_type) lane_libraries = data["libraries"] for library in lane_libraries: @@ -162,7 +162,7 @@ def parse_processing_file( ) if ignore_failed_lanes and library["failed"]: - logging.info("Ignoring failed library %s" % label) + logging.info("Ignoring failed library %s", label) continue project_dir = "Project_%s" % library["project"] @@ -195,8 +195,10 @@ def parse_processing_file( # TODO: This can be smarter if barcode in barcodes: logging.error( - "Barcode %s already taken, lower --mismatches! (taken by %s+%s)" - % (barcode, barcode1, barcode2) + "Barcode %s already taken, lower --mismatches! 
(taken by %s+%s)", + barcode, + barcode1, + barcode2, ) sys.exit(1) barcodes[barcode] = label @@ -210,7 +212,7 @@ def parse_processing_file( ) logging.info( - "Mapping %d barcodes to %s libraries" % (len(barcodes), len(lane_libraries)) + "Mapping %d barcodes to %s libraries", len(barcodes), len(lane_libraries) ) logging.debug(barcodes) @@ -233,7 +235,7 @@ def split_file(filename, barcodes, labels): ) tally = 0 - logging.info("Demultiplexing file: %s" % filename) + logging.info("Demultiplexing file: %s", filename) if filename.endswith(".gz"): parsein = subprocess.Popen( @@ -249,9 +251,9 @@ def split_file(filename, barcodes, labels): match = barcode_re.search(record) if not match: - logging.error("Could not match %s" % record) + logging.error("Could not match %s", record) logging.error(str(seq)) - logging.error("Record %d in %s" % (tally, filename)) + logging.error("Record %d in %s", tally, filename) sys.exit(1) matches = match.groups() @@ -301,13 +303,13 @@ def main(argv): global lengths lengths = set([]) - logging.info("File(s): %s" % args.infile) - logging.info("OutDir: %s" % args.outdir) - logging.info("JSON: %s" % args.processing_file) + logging.info("File(s): %s", args.infile) + logging.info("OutDir: %s", args.outdir) + logging.info("JSON: %s", args.processing_file) if args.autosuffix: args.suffix = guess_suffix(args.infile[0]) - logging.info("--autosuffix, guessing suffix as %s" % args.suffix) + logging.info("--autosuffix, guessing suffix as %s", args.suffix) barcodes, labels = parse_processing_file( args.processing_file, diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index 78bccf74..de1d1b68 100755 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -144,7 +144,7 @@ def create_links( rel_path = os.path.relpath(input_file, output_dir) - logging.info("Linking %s => %s" % (rel_path, output_file)) + logging.info("Linking %s => %s", rel_path, output_file) if not dry_run and not os.path.exists(output_file): os.symlink(rel_path, output_file) diff --git a/scripts/laneprocess.py b/scripts/laneprocess.py index 94314a38..63e9ed5a 100755 --- a/scripts/laneprocess.py +++ b/scripts/laneprocess.py @@ -15,7 +15,7 @@ log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -logging.warn("This script is deprecated - consider using apilaneprocess.py instead!") +logging.warning("This script is deprecated - consider using apilaneprocess.py instead!") STAMPIPES = os.getenv("STAMPIPES", "~/stampipes") @@ -144,7 +144,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -156,7 +156,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = self.session.get(url) @@ -181,14 +181,14 @@ def get_lane_process_info(self, lane_id): logging.debug(info.json()) return info.json() else: - logging.error("Could not find processing info for lane %d\n" % lane_id) + logging.error("Could not find processing info for lane %d\n", lane_id) logging.error(info) sys.exit(1) def get_process_template(self, process_template_id): if not process_template_id: logging.critical( - "No process template for alignment %d\n" % self.alignment_id + "No process template for 
alignment %d\n", self.alignment_id ) sys.exit(1) @@ -201,7 +201,7 @@ def get_process_template(self, process_template_id): return info.json() else: logging.error( - "Could not find processing template for ID %d\n" % process_template_id + "Could not find processing template for ID %d\n", process_template_id ) sys.exit(1) @@ -232,7 +232,7 @@ def add_script(self, script_file, lane_id, flowcell_label, sample_name): logging.debug("Writing script to stdout") outfile = sys.stdout else: - logging.debug("Logging script to %s" % self.outfile) + logging.debug("Logging script to %s", self.outfile) outfile = open(self.outfile, "a") outfile.write("cd %s && " % os.path.dirname(script_file)) @@ -255,7 +255,7 @@ def create_script(self, processing_info): lane = processing_info["libraries"][0] if "directory" not in lane: - logging.critical("No directory for lane %d" % lane["id"]) + logging.critical("No directory for lane %d", lane["id"]) return False fastq_directory = lane["directory"] @@ -273,12 +273,12 @@ def create_script(self, processing_info): lane["lane"], ) logging.warning( - "No alignment sample_name for lane, using %s instead" % spreadsheet_name + "No alignment sample_name for lane, using %s instead", spreadsheet_name ) if not os.path.exists(fastq_directory): logging.critical( - "fastq directory %s does not exist, cannot continue" % fastq_directory + "fastq directory %s does not exist, cannot continue", fastq_directory ) return False @@ -287,13 +287,13 @@ def create_script(self, processing_info): ) if self.dry_run: - logging.info("Dry run, would have created: %s" % script_file) + logging.info("Dry run, would have created: %s", script_file) return True try: outfile = open(script_file, "w") except FileNotFoundError: - logging.critical("Could not create script file %s" % script_file) + logging.critical("Could not create script file %s", script_file) return False self.add_script( diff --git a/scripts/lims/aggregation/get_files.py b/scripts/lims/aggregation/get_files.py index 3ee2ea2d..2df0eb07 100755 --- a/scripts/lims/aggregation/get_files.py +++ b/scripts/lims/aggregation/get_files.py @@ -86,7 +86,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -98,7 +98,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = requests.get(url, headers=self.headers) @@ -127,17 +127,17 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): if fetch_results.ok: results = fetch_results.json() if results["count"] > 1: - log.error("More than one matching item for fetch query: %s" % url) + log.error("More than one matching item for fetch query: %s", url) elif results["count"] == 0: - log.debug("No matching items for fetch query: %s" % url) + log.debug("No matching items for fetch query: %s", url) else: result = results["results"][0] - log.debug("Single result fetched from %s: %s" % (url, str(result))) + log.debug("Single result fetched from %s: %s", url, result) if field: return result[field] return result else: - log.error("Could not execute api query: %s" % url) + log.error("Could not execute api query: %s", url) return None @@ -155,7 +155,7 @@ def retrieve_file(self, aggregation_id, file_purpose): 
aggregation = self.api_single_result("aggregation/%d" % aggregation_id) if not aggregation: - logging.critical("Cannot find aggregation %d" % aggregation_id) + logging.critical("Cannot find aggregation %d", aggregation_id) sys.exit(1) logging.debug(aggregation) @@ -171,15 +171,19 @@ def retrieve_file(self, aggregation_id, file_purpose): if len(files) > 1: logging.critical( - "%d %s files found for aggregation %d" - % (len(files), file_purpose["slug"], aggregation_id) + "%d %s files found for aggregation %d", + len(files), + file_purpose["slug"], + aggregation_id, ) sys.exit(1) if not files: logging.critical( - "%d %s files found for aggregation %d" - % (len(files), file_purpose["slug"], aggregation_id) + "%d %s files found for aggregation %d", + len(files), + file_purpose["slug"], + aggregation_id, ) sys.exit(1) @@ -196,7 +200,7 @@ def retrieve_library_file(self, library_number, file_purpose): library = self.api_single_list_result("library/?number=%d" % library_number) if not library: - logging.critical("Could not find library %d" % library_number) + logging.critical("Could not find library %d", library_number) sys.exit(1) logging.debug(library) @@ -207,22 +211,22 @@ def retrieve_library_file(self, library_number, file_purpose): use_aggregation = self.find_single_aggregation(aggregations) if not use_aggregation: logging.critical( - "More than one aggregation for library %d and no default found, must specify aggregation id" - % (library_number) + "More than one aggregation for library %d and no default found, must specify aggregation id", + library_number, ) logging.critical( - "Options: " - + ", ".join([aggregation["id"] for aggregation in aggregations]) + "Options: %s", + ", ".join([aggregation["id"] for aggregation in aggregations]), ) return else: - logging.warn( - "More than one aggregation for library %d, using default" - % (library_number) + logging.warning( + "More than one aggregation for library %d, using default", + library_number, ) elif len(aggregations) == 0: logging.critical( - "Cannot find aggregations for library %d" % (library_number) + "Cannot find aggregations for library %d", (library_number) ) return elif len(aggregations) == 1: @@ -234,7 +238,7 @@ def retrieve(self, aggregation_id, library_number, file_purpose_slug): file_purpose = self.get_file_purpose(file_purpose_slug) if not file_purpose: - logging.critical("Cannot find file purpose %s" % file_purpose_slug) + logging.critical("Cannot find file purpose %s", file_purpose_slug) sys.exit(1) if aggregation_id: @@ -263,10 +267,10 @@ def main(args=sys.argv): if not poptions.base_api_url and "LIMS_API_URL" in os.environ: api_url = os.environ["LIMS_API_URL"] - log.debug("Using LIMS API endpoint: %s from environment" % api_url) + log.debug("Using LIMS API endpoint: %s from environment", api_url) elif poptions.base_api_url: api_url = poptions.base_api_url - log.debug("Using LIMS API endpoint: %s from options" % api_url) + log.debug("Using LIMS API endpoint: %s from options", api_url) else: sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) @@ -284,7 +288,7 @@ def main(args=sys.argv): library_number = int(poptions.library_number.strip(string.ascii_letters)) except ValueError: logging.critical( - "Could not get library number from %s" % poptions.library_number + "Could not get library number from %s", poptions.library_number ) sys.exit() else: diff --git a/scripts/lims/alignment/get_files.py b/scripts/lims/alignment/get_files.py index c96e323b..1be8e9e9 100755 --- a/scripts/lims/alignment/get_files.py +++ 
b/scripts/lims/alignment/get_files.py @@ -95,7 +95,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -107,7 +107,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = requests.get(url, headers=self.headers) @@ -136,17 +136,17 @@ def api_single_list_result(self, url_addition=None, url=None, field=None): if fetch_results.ok: results = fetch_results.json() if results["count"] > 1: - log.error("More than one matching item for fetch query: %s" % url) + log.error("More than one matching item for fetch query: %s", url) elif results["count"] == 0: - log.debug("No matching items for fetch query: %s" % url) + log.debug("No matching items for fetch query: %s", url) else: result = results["results"][0] - log.debug("Single result fetched from %s: %s" % (url, str(result))) + log.debug("Single result fetched from %s: %s", url, result) if field: return result[field] return result else: - log.error("Could not execute api query: %s" % url) + log.error("Could not execute api query: %s", url) return None @@ -164,7 +164,7 @@ def retrieve_file(self, alignment_id, file_purpose): alignment = self.api_single_result("flowcell_lane_alignment/%d/" % alignment_id) if not alignment: - logging.critical("Cannot find alignment %d" % alignment_id) + logging.critical("Cannot find alignment %d", alignment_id) sys.exit(1) logging.debug(alignment) @@ -184,19 +184,23 @@ def retrieve_file(self, alignment_id, file_purpose): if len(files) > 1: logging.critical( - "%d %s files found for alignment %d" - % (len(files), file_purpose["slug"], alignment_id) + "%d %s files found for alignment %d", + len(files), + file_purpose["slug"], + alignment_id, ) sys.exit(1) if len(directories) > 1: logging.critical( - "%d %s directories found for alignment %d" - % (len(directories), file_purpose["slug"], alignment_id) + "%d %s directories found for alignment %d", + len(directories), + file_purpose["slug"], + alignment_id, ) if not files and not directories: logging.critical( - "No files or directories found for alignment %d" % alignment_id + "No files or directories found for alignment %d", alignment_id ) sys.exit(1) @@ -206,7 +210,7 @@ def find_single_alignment(self, lane): ) if len(alignments) > 1: - logging.warn("More than one alignment found, finding default") + logging.warning("More than one alignment found, finding default") for alignment in alignments: if alignment["default_lane_alignment"]: @@ -217,36 +221,36 @@ def find_single_alignment(self, lane): def find_lanes(self, args): query = {} if args.flowcell: - logging.debug("Using flowcell: %s" % args.flowcell) + logging.debug("Using flowcell: %s", args.flowcell) if args.flowcell.startswith("FC"): args.flowcell = args.flowcell[2:] if len(args.flowcell) != 5: - logging.warn( - "Flowcell label %s is not five characters long" % args.flowcell + logging.warning( + "Flowcell label %s is not five characters long", args.flowcell ) query["flowcell__label"] = args.flowcell if args.lane_id: - logging.debug("Using lane id %d" % args.lane_id) + logging.debug("Using lane id %d", args.lane_id) query["id"] = args.lane_id if args.lane: - logging.debug("Using lane %d" % args.lane) + logging.debug("Using lane 
%d", args.lane) query["lane"] = args.lane if args.library: - logging.debug("Using library %s" % args.library) + logging.debug("Using library %s", args.library) library_number = args.library.strip(string.letters) try: library_number = int(library_number) except ValueError: - logging.critical("Could not turn %s into library number" % args.library) + logging.critical("Could not turn %s into library number", args.library) sys.exit(1) query["library__number"] = library_number if args.sample: - logging.debug("Using sample %s" % args.sample) + logging.debug("Using sample %s", args.sample) sample_number = args.sample.lstrip(string.letters) if sample_number[-1] in string.letters: query["library__sub_library"] = sample_number[-1].upper() @@ -254,7 +258,7 @@ def find_lanes(self, args): try: sample_number = int(sample_number) except ValueError: - logging.critical("Could not turn %s into sample number" % args.sample) + logging.critical("Could not turn %s into sample number", args.sample) query["sample__number"] = sample_number return self.api_list_result( @@ -266,7 +270,7 @@ def retrieve(self, args): file_purpose = self.get_file_purpose(args.file_purpose) if not file_purpose: - logging.critical("Cannot find file purpose %s" % args.file_purpose) + logging.critical("Cannot find file purpose %s", args.file_purpose) sys.exit(1) if args.alignment_id: @@ -281,13 +285,13 @@ def retrieve(self, args): if len(lanes) > 1: logging.critical("More than one lane found for arguments given:") for lane in lanes: - logging.error("\t%d [ %s ]" % (lane["id"], lane["view_url"])) + logging.error("\t%d [ %s ]", lane["id"], lane["view_url"]) sys.exit(1) alignment = self.find_single_alignment(lanes[0]) if not alignment: - logging.critical("Couldn't find an alignment for lane %d" % lanes[0]["id"]) + logging.critical("Couldn't find an alignment for lane %d", lanes[0]["id"]) sys.exit(1) self.retrieve_file(alignment["id"], file_purpose) @@ -313,10 +317,10 @@ def main(args=sys.argv): if not poptions.base_api_url and "LIMS_API_URL" in os.environ: api_url = os.environ["LIMS_API_URL"] - log.debug("Using LIMS API endpoint: %s from environment" % api_url) + log.debug("Using LIMS API endpoint: %s from environment", api_url) elif poptions.base_api_url: api_url = poptions.base_api_url - log.debug("Using LIMS API endpoint: %s from options" % api_url) + log.debug("Using LIMS API endpoint: %s from options", api_url) else: sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) diff --git a/scripts/lims/get_processing.py b/scripts/lims/get_processing.py index adf14f02..f4c7d143 100755 --- a/scripts/lims/get_processing.py +++ b/scripts/lims/get_processing.py @@ -92,7 +92,7 @@ def get_processing_info_project(api_url, token, id, outfile): # then get all AGGs # then get all AGG info - logging.info("Setting up project #%s" % id) + logging.info("Setting up project #%s", id) info = requests.get( "%s/aggregation/file_detail/?library__sample__tissue_culture__project=%s&page_size=1000" @@ -111,7 +111,7 @@ def get_processing_info_project(api_url, token, id, outfile): def get_processing_info_experiment(api_url, token, id, outfile): - logging.info("Setting up experiment #%s" % id) + logging.info("Setting up experiment #%s", id) info = requests.get( "%s/experiment/%s/schema" % (api_url, id), @@ -128,20 +128,21 @@ def get_processing_info_experiment(api_url, token, id, outfile): return -def get_processing_info_alignment_group(api_url, token, id, outfile): +def get_processing_info_alignment_group(api_url, token, group_id, outfile): info = requests.get( - 
"%s/flowcell_lane_alignment_group/%d/processing_information/" % (api_url, id), + "%s/flowcell_lane_alignment_group/%d/processing_information/" + % (api_url, group_id), headers={"Authorization": "Token %s" % token}, ) if info.ok: result = info.json() - logging.info("Writing results to %s" % outfile) + logging.info("Writing results to %s", outfile) with open(outfile, "w") as output: json.dump(result, output, sort_keys=True, indent=4, separators=(",", ": ")) else: logging.error( - "Could not find processing info for alignment group %s\n" % str(id) + "Could not find processing info for alignment group %s\n", group_id ) return @@ -181,12 +182,12 @@ def main(args=sys.argv): if poptions.project: logging.info( - "Getting aggregation information for project #%s" % poptions.project + "Getting aggregation information for project #%s", poptions.project ) get_processing_info_project(api_url, token, poptions.project, poptions.outfile) if poptions.flowcell: - logging.info("Getting alignment groups for %s" % poptions.flowcell) + logging.info("Getting alignment groups for %s", poptions.flowcell) alignment_groups = requests.get( "%s/flowcell_lane_alignment_group/?flowcell__label=%s" @@ -202,14 +203,13 @@ def main(args=sys.argv): results = alignment_groups.json() if results["count"] == 0: logging.error( - "Could not find an alignment group for flowcell %s\n" - % poptions.flowcell + "Could not find an alignment group for flowcell %s\n", poptions.flowcell ) sys.exit(1) if results["count"] > 1: logging.error( - "More than one alignment group found: %s" - % ", ".join(["%d" % ag["id"] for ag in results["results"]]) + "More than one alignment group found: %s", + ", ".join(["%d" % ag["id"] for ag in results["results"]]), ) sys.exit(1) @@ -224,7 +224,7 @@ def main(args=sys.argv): if poptions.experiment: logging.info( - "Getting aggregation information for experiment #%s" % poptions.experiment + "Getting aggregation information for experiment #%s", poptions.experiment ) get_processing_info_experiment( api_url, token, poptions.experiment, poptions.outfile diff --git a/scripts/lims/upload_aggregation_stats.py b/scripts/lims/upload_aggregation_stats.py index eae09eb8..9463cd51 100755 --- a/scripts/lims/upload_aggregation_stats.py +++ b/scripts/lims/upload_aggregation_stats.py @@ -60,13 +60,13 @@ def upload_stats(api, aggregation, stats): def upload_file(api, aggregation, counts_file): count_content = open(counts_file, "r") - log.info("uploading {}".format(counts_file)) + log.info("uploading %s", counts_file) stats = {} for line in count_content: values = line.split() if len(values) < 2: - log.warn("skipping {}".format(values)) + log.warning("skipping %s", values) continue stat_type_name = values[0] @@ -75,18 +75,18 @@ def upload_file(api, aggregation, counts_file): try: float(value) except ValueError: - log.warn( - "skipping stat-type '{}' with non-numeric value '{}'".format( - stat_type_name, value - ) + log.warning( + "skipping stat-type '%s' with non-numeric value '%s'", + stat_type_name, + value, ) continue if not stat_type_name: - log.warn("skipping {}".format(stat_type_name)) + log.warning("skipping %s", stat_type_name) continue stats[stat_type_name] = value - log.debug("{} : {}".format(stat_type_name, value)) + log.debug("%s : %s", stat_type_name, value) count_content.close() upload_stats(api, aggregation, stats) diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index d786e05b..3fb857cc 100755 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -250,7 +250,7 @@ def 
split_sample_name(samplename): ) if not m: - log.error("Could not parse sample name: %s" % samplename) + log.error("Could not parse sample name: %s", samplename) return None return { @@ -284,7 +284,7 @@ def get_dup_score(spotdup_file): return percent_duplication except UnboundLocalError as e: - log.error("Unbound Local Error for %s" % spotdup_file) + log.error("Unbound Local Error for %s", spotdup_file) log.error(e) except IndexError as e: log.error(e) @@ -404,23 +404,23 @@ def get_flowcell_url_by_label(self, label): def clear_flowcell_cache(self, flowcell): url = self.get_flowcell_url_by_label(flowcell) if url is None: - log.error("Failure to reset flowcell cache for %s" % flowcell) + log.error("Failure to reset flowcell cache for %s", flowcell) return log.debug(self.api.post_single_result(url=url + "clear_cache/")) def clear_alignment_stats(self, alignment_id): url = "flowcell_lane_alignment/%d/clear_stats/" % alignment_id - log.debug("Clearing stats: %s" % url) + log.debug("Clearing stats: %s", url) results = self.post(url) if results is None: - log.error("Could not clear alignment stats for ALN%s" % alignment_id) + log.error("Could not clear alignment stats for ALN%s", alignment_id) def clear_aggregation_stats(self, aggregation_id): url = "aggregation/%d/clear_stats/" % aggregation_id - log.debug("Clearing stats: %s" % url) + log.debug("Clearing stats: %s", url) results = self.post(url) if results is None: - log.error("Could not clear aggregation stats for AGG%s" % aggregation_id) + log.error("Could not clear aggregation stats for AGG%s", aggregation_id) def start_aggregation(self, aggregation_id): url = "aggregation/%d/" % aggregation_id @@ -429,7 +429,7 @@ def start_aggregation(self, aggregation_id): } results = self.patch(url, data=data) if results is None: - log.error("Could not 'start' AGG%s" % aggregation_id) + log.error("Could not 'start' AGG%s", aggregation_id) def complete_aggregation(self, aggregation_id): url = "aggregation/%d/" % aggregation_id @@ -442,7 +442,7 @@ def complete_aggregation(self, aggregation_id): results = self.patch(url, data=data) if results is None: - log.error("Could not complete AGG%s" % aggregation_id) + log.error("Could not complete AGG%s", aggregation_id) def get_fastqc_tags(self): if not self.fastqc_tags: @@ -479,7 +479,7 @@ def get_contenttype(self, contenttype_name): } ct = self.get_single_result("content_type/", query=query) if not ct: - log.critical("Could not fetch content type %s" % contenttype_name) + log.critical("Could not fetch content type %s", contenttype_name) return ct @@ -498,26 +498,27 @@ def upload_directory_attachment( if not (contenttype_name and object_id): log.error( - "Cannot attach file %s without both content type and object_id" % path + "Cannot attach file %s without both content type and object_id", path ) return False contenttype = self.get_contenttype(contenttype_name) if not contenttype: - log.error("Cannot attach file %s without contenttype result" % path) + log.error("Cannot attach file %s without contenttype result", path) return False purpose = self.get_file_purpose_url(file_purpose) if file_purpose and not purpose: log.error( - "Could not find file purpose %s for uploading directory %s" - % (file_purpose, path) + "Could not find file purpose %s for uploading directory %s", + file_purpose, + path, ) return False elif purpose: - log.debug("File purpose: %s" % purpose) + log.debug("File purpose: %s", purpose) exists = self.get_single_result("directory/", query={"path": path}) @@ -536,14 +537,14 @@ def 
upload_directory_attachment( ) if exists: - log.info("Updating information for directory %s" % path) + log.info("Updating information for directory %s", path) result = self.put(url=data["url"], data=data) else: - log.info("Uploading information for directory %s" % path) + log.info("Uploading information for directory %s", path) result = self.post("directory/", data=data) if not result: - log.error("Could not upload directory %s" % path) + log.error("Could not upload directory %s", path) log.debug(data) else: log.debug(result) @@ -562,42 +563,45 @@ def upload_file_attachment( path = os.path.abspath(path) log.info( - "Attaching file %s to object %d (contenttype %s)" - % (path, object_id, contenttype_name) + "Attaching file %s to object %d (contenttype %s)", + path, + object_id, + contenttype_name, ) if not (contenttype_name and object_id): log.error( - "Cannot attach file %s without both content type and object_id" % path + "Cannot attach file %s without both content type and object_id", path ) return False contenttype = self.get_contenttype(contenttype_name) if not contenttype: - log.error("Cannot attach file %s without contenttype result" % path) + log.error("Cannot attach file %s without contenttype result", path) return False purpose = self.get_file_purpose_url(file_purpose) if file_purpose and not purpose: log.error( - "Could not find file purpose %s for uploading file %s" - % (file_purpose, path) + "Could not find file purpose %s for uploading file %s", + file_purpose, + path, ) return False elif purpose: - log.debug("File Purpose: %s" % purpose) + log.debug("File Purpose: %s", purpose) ftype = self.get_file_type(file_type) if file_type and not ftype: log.error( - "Could not find file type %s for uploading file %s" % (file_type, path) + "Could not find file type %s for uploading file %s", file_type, path ) return False elif purpose: - log.debug("File Type: %s" % ftype) + log.debug("File Type: %s", ftype) exists = self.get_single_result( "file/", @@ -624,14 +628,16 @@ def upload_file_attachment( <= difference <= datetime.timedelta(minutes=1) ): - log.info("File exists and matches recorded size, skipping %s" % path) + log.info("File exists and matches recorded size, skipping %s", path) return md5sum = md5sum_file(path) log.info( - "MD5sum: %s\tFile size: %d\tLast modified: %s" - % (md5sum, file_size, str(last_modified)) + "MD5sum: %s\tFile size: %d\tLast modified: %s", + md5sum, + file_size, + str(last_modified), ) data = { @@ -648,14 +654,14 @@ def upload_file_attachment( log.debug(data) if exists: - log.info("Updating information for file %s" % path) + log.info("Updating information for file %s", path) result = self.put(url=exists["url"], data=data) else: - log.info("Uploading information for file %s" % path) + log.info("Uploading information for file %s", path) result = self.post("file/", data=data) if not result: - log.error("Could not upload file %s" % path) + log.error("Could not upload file %s", path) log.debug(data) else: log.debug(result) @@ -680,7 +686,7 @@ def get_aggregation_contenttype(self): return self.aggregation_contenttype def create_count_type(self, name): - log.info("Creating count type %s" % name) + log.info("Creating count type %s", name) is_mapq = name.startswith("mapq") is_samflag = name.startswith("samflag") @@ -703,18 +709,18 @@ def create_count_type(self, name): self.count_types[name] = result else: self.count_types[name] = None - log.warn("Could not create count type %s (%s)" % (name, str(result))) + log.warning("Could not create count type %s (%s)", name, 
result) return self.count_types[name] # TODO : make sure that no more of one count type exists def get_alignment_counts(self, alignment_id): - log.info("Getting alignment counts for %d" % alignment_id) + log.info("Getting alignment counts for %d", alignment_id) if alignment_id not in self.alignment_counts: counts = self.get_list_result( "flowcell_lane_count/", query={"alignment": alignment_id} ) if counts is None: - log.critical("Could not get counts for ALN%d" % alignment_id) + log.critical("Could not get counts for ALN%d", alignment_id) self.alignment_counts[alignment_id] = dict( [(count["count_type_name"], count) for count in counts] ) @@ -735,7 +741,7 @@ def get_rna_metrics(self, alignment_id): "rna_alignment_metrics/", query={"alignment": alignment_id} ) if not exists: - log.error("Error finding RNA metrics for alignment %d" % alignment_id) + log.error("Error finding RNA metrics for alignment %d", alignment_id) return exists def upload_rna_metrics(self, alignment_id, rna_file): @@ -772,10 +778,10 @@ def upload_rna_metrics(self, alignment_id, rna_file): if exists: # Currently (2014-12-22) this will fail, but that's a TODO on the LIMS side. - log.info("Updating RNA metrics for alignment ID %d" % alignment_id) + log.info("Updating RNA metrics for alignment ID %d", alignment_id) result = self.put(url=data["url"], data=data) else: - log.info("Uploading RNA metrics for alignment ID %d" % alignment_id) + log.info("Uploading RNA metrics for alignment ID %d", alignment_id) result = self.post("rna_alignment_metrics/", data=data) log.debug(result) if not result: @@ -786,7 +792,7 @@ def upload_barcode_report(self, barcode_file): try: jsondata = json.loads(datastring) except ValueError: - log.error("Barcode report %s is not valid JSON" % barcode_file) + log.error("Barcode report %s is not valid JSON", barcode_file) return if jsondata["Sequencer"] == "MiniSeq": @@ -827,9 +833,9 @@ def upload_counts(self, alignment_id, counts_file): response = self.bulk_upload_counts(alignment_id, self.parse_counts(counts_file)) if response is None: log.error( - "Bulk upload failed: Counts file {} for ALN{}".format( - counts_file, alignment_id - ) + "Bulk upload failed: Counts file %s for ALN%s", + counts_file, + alignment_id, ) else: log.info("Upload successful.") @@ -897,7 +903,7 @@ def upload_alignment_records( start_time=False, complete_time=False, ): - log.info("Uploading alignment records for %d" % alignment_id) + log.info("Uploading alignment records for %d", alignment_id) if not (adapter_file or version_file or start_time or complete_time): log.debug("No data to upload.") @@ -920,18 +926,16 @@ def upload_alignment_records( result = self.patch(url=alignment["url"], data=alignment) if result: - log.info("Alignment %d updated" % alignment_id) + log.info("Alignment %d updated", alignment_id) log.debug(result) else: - log.debug( - "No result for uploading %s to %s" % (str(alignment), alignment["url"]) - ) + log.debug("No result for uploading %s to %s", alignment, alignment["url"]) return True def upload_spot(self, alignment_id, spot_file, dup_file): if not spot_file and dup_file: - log.error("Error, do not have both files for alignment %s" % alignment_id) + log.error("Error, do not have both files for alignment %s", alignment_id) spot_stats = get_spot_score(spot_file) percent_dup = get_dup_score(dup_file) @@ -954,7 +958,7 @@ def upload_spot(self, alignment_id, spot_file, dup_file): if len(origspots) > 1: log.error("Could not figure out which SPOT score to upload to!") elif len(origspots) == 0: - 
log.info("Uploading new spot for %d" % alignment_id) + log.info("Uploading new spot for %d", alignment_id) result = self.post("flowcell_lane_spot/", data=data) if not result: log.error("Could not upload SPOT") @@ -966,7 +970,7 @@ def upload_spot(self, alignment_id, spot_file, dup_file): or data["tags_in_hotspots"] != origspot["tags_in_hotspots"] or data["percent_duplication"] != origspot["percent_duplication"] ): - log.info("Updating SPOT score for %d" % alignment_id) + log.info("Updating SPOT score for %d", alignment_id) result = self.patch(url=origspot["url"], data=data) if not result: log.error("Could not upload SPOT") @@ -993,7 +997,7 @@ def upload_fastqc(self, flowcell_lane_id, filename): ) if not m: - log.error("Could not figure out information for %s" % filename) + log.error("Could not figure out information for %s", filename) return False log.info(m.groups()) @@ -1001,7 +1005,7 @@ def upload_fastqc(self, flowcell_lane_id, filename): fastqc_report = self.get_fastqc_contents(filename) if not fastqc_report: - log.error("Could not read fastqc report %s" % filename) + log.error("Could not read fastqc report %s", filename) return False samplename = m.group("samplename") @@ -1041,15 +1045,15 @@ def upload_fastqc(self, flowcell_lane_id, filename): if report: # replace content if "raw_data" not in report or report["raw_data"] != upload["raw_data"]: - log.info("Updating report %s" % upload["label"]) + log.info("Updating report %s", upload["label"]) result = self.patch(url=report["url"], data=upload) if result: log.debug(result) else: - log.error("Could not update FastQC report %s" % report["url"]) + log.error("Could not update FastQC report %s", report["url"]) else: - log.info("Uploading new fastqc report %s" % upload["label"]) + log.info("Uploading new fastqc report %s", upload["label"]) result = self.post("fastqc_report/", data=upload) if result: @@ -1071,7 +1075,7 @@ def upload_fastqc_counts(self, alignment_id): for fastqc_file, fastqc_counts in self.fastqc_counts.items(): if not fastqc_counts: - log.error("Could not get counts from %s for uploading" % fastqc_file) + log.error("Could not get counts from %s for uploading", fastqc_file) return total += fastqc_counts["total"] @@ -1093,14 +1097,14 @@ def upload_picard_metric( try: picard_metric = open(filename, "r").read() except Exception: - log.error("Could not read picard metric file %s" % filename) + log.error("Could not read picard metric file %s", filename) return None - log.debug("Uploading metric contents from: %s" % filename) + log.debug("Uploading metric contents from: %s", filename) log.debug(picard_metric) if metric_name not in self.picard_metrics: - log.error("Could not find metrics type %s" % metric_name) + log.error("Could not find metrics type %s", metric_name) return False metric = self.picard_metrics[metric_name] @@ -1133,7 +1137,7 @@ def upload_picard_metric( if library_info: log.debug(library_info) else: - log.error("Could not fetch %s" % aggregation_info["library"]) + log.error("Could not fetch %s", aggregation_info["library"]) return False label = "AGG%d LN%d %s" % ( aggregation_id, @@ -1172,11 +1176,11 @@ def upload_picard_metric( if existing is not None: result = self.patch(url=existing["url"], json=upload) else: - log.info("Uploading new picard report %s" % upload["label"]) + log.info("Uploading new picard report %s", upload["label"]) result = self.post("picard_report/", json=upload) if not result: - log.error("Could not upload new Picard report %s" % filename) + log.error("Could not upload new Picard report %s", 
filename) else: log.debug(result) @@ -1201,10 +1205,10 @@ def main(args=sys.argv): if not poptions.base_api_url and "LIMS_API_URL" in os.environ: api_url = os.environ["LIMS_API_URL"] - log.debug("Using LIMS API endpoint: %s from environment" % api_url) + log.debug("Using LIMS API endpoint: %s from environment", api_url) elif poptions.base_api_url: api_url = poptions.base_api_url - log.debug("Using LIMS API endpoint: %s from options" % api_url) + log.debug("Using LIMS API endpoint: %s from options", api_url) else: sys.stderr.write("Could not find LIMS API URL.\n") sys.exit(1) diff --git a/scripts/poolprocess.py b/scripts/poolprocess.py index 2a763d5c..d62f25de 100755 --- a/scripts/poolprocess.py +++ b/scripts/poolprocess.py @@ -209,7 +209,7 @@ def api_single_result(self, url_addition=None, url=None): logging.debug(request.json()) return request.json() else: - logging.error("Could not get data from %s" % url) + logging.error("Could not get data from %s", url) logging.error(request) return None @@ -222,7 +222,7 @@ def api_list_result(self, url_addition=None, url=None): url = "%s/%s" % (self.api_url, url_addition) while more: - logging.debug("Fetching more results for query %s" % url) + logging.debug("Fetching more results for query %s", url) request = self.session.get(url) @@ -245,7 +245,7 @@ def get_align_process_info(self, alignment_id): if not process_info: logging.critical( - "Could not find processing info for alignment %d\n" % alignment_id + "Could not find processing info for alignment %d\n", alignment_id ) logging.critical(process_info) sys.exit(1) @@ -254,14 +254,14 @@ def get_process_template(self, align_id, process_template_id): if not process_template_id: - logging.critical("No process template for alignment %d\n" % align_id) + logging.critical("No process template for alignment %d\n", align_id) return None info = self.api_single_result("process_template/%d/" % (process_template_id)) if not info: logging.critical( - "Could not find processing template for ID %d\n" % process_template_id + "Could not find processing template for ID %d\n", process_template_id ) sys.exit(1) @@ -274,10 +274,10 @@ def setup_alignments(self, align_ids, parallel=True): if parallel: for id, error in self.pool.map(self.setup_alignment, align_ids): if error: - logging.error("ALN%d result received, error: %s" % (id, error)) + logging.error("ALN%d result received, error: %s", id, error) all_okay = False else: - logging.debug("ALN%d result received, OK" % id) + logging.debug("ALN%d result received, OK", id) if not all_okay: # logging.critical("Errors during setup, exiting") logging.error( @@ -299,10 +299,10 @@ def setup_alignment(self, align_id): self.create_script(processing_info, alignment["id"]) return (align_id, None) else: - logging.info("Skipping completed alignment %d" % align_id) + logging.info("Skipping completed alignment %d", align_id) return (align_id, None) except Exception as e: - logging.exception("Could not set up alignment %s}: (%s)" % (align_id, e)) + logging.exception("Could not set up alignment %s: (%s)", align_id, e) return (align_id, e) def get_lane_file(self, lane_id, purpose): @@ -325,14 +325,14 @@ def setup_tag(self, tag_slug): self.setup_alignments([align_tag["object_id"] for align_tag in align_tags]) def setup_project(self, project_id): - logging.info("Setting up project #%s" % project_id) + logging.info("Setting up project #%s", project_id) alignments = self.api_list_result( "flowcell_lane_alignment/?lane__sample__project=%s" % 
project_id ) self.setup_alignments([alignment["id"] for alignment in alignments]) def setup_flowcell(self, flowcell_label): - logging.info("Setting up flowcell for %s" % flowcell_label) + logging.info("Setting up flowcell for %s", flowcell_label) align_ids = self.get_alignment_ids(flowcell_label) logging.debug("align ids: %s", align_ids) @@ -479,7 +479,7 @@ def add_script(self, align_id, processing_info, script_file, sample_name): logging.debug("Writing script to stdout") outfile = sys.stdout else: - logging.debug("Logging script to %s" % self.outfile) + logging.debug("Logging script to %s", self.outfile) outfile = open(self.outfile, "a") if self.simple_output: @@ -541,7 +541,7 @@ def create_script(self, processing_info, align_id): alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] if "process_template" not in alignment: - logging.error("Alignment %d has no process template" % align_id) + logging.error("Alignment %d has no process template", align_id) return False process_template = self.get_process_template( @@ -558,8 +558,9 @@ def create_script(self, processing_info, align_id): flowcell_directory = os.path.join(share_dir, "alignments") if not flowcell_directory: logging.error( - "Alignment %d has no flowcell directory for flowcell %s" - % (align_id, processing_info["flowcell"]["label"]) + "Alignment %d has no flowcell directory for flowcell %s", + align_id, + processing_info["flowcell"]["label"], ) return False @@ -605,8 +606,10 @@ def create_script(self, processing_info, align_id): if not r1_fastq: logging.error( - "Missing r1-fastq for lane %d (alignment %d) - check dir %s" - % (lane["id"], alignment["id"], fastq_directory) + "Missing r1-fastq for lane %d (alignment %d) - check dir %s", + lane["id"], + alignment["id"], + fastq_directory, ) return False @@ -614,15 +617,16 @@ def create_script(self, processing_info, align_id): r2_fastq = self.get_lane_file(lane["id"], "r2-fastq") if not r2_fastq: logging.error( - "Missing r2-fastq for lane %d (alignment %d)" - % (lane["id"], alignment["id"]) + "Missing r2-fastq for lane %d (alignment %d)", + lane["id"], + alignment["id"], ) return False script_file = os.path.join( script_directory, "%s-%s" % (alignment["sample_name"], self.qsub_scriptname) ) - logging.info("Will write to %s" % script_file) + logging.info("Will write to %s", script_file) # Set up & add environment variables env_vars = OrderedDict() @@ -663,9 +667,9 @@ def create_script(self, processing_info, align_id): p5_adapter = lane["barcode2"]["adapter5_reverse_complement"] if not p7_adapter or not p5_adapter: - logging.warn( - "Alignment %d missing adapters, some processes might not work" - % alignment["id"] + logging.warning( + "Alignment %d missing adapters, some processes might not work", + alignment["id"], ) env_vars["ADAPTER_P7"] = p7_adapter @@ -692,17 +696,15 @@ def create_script(self, processing_info, align_id): env_vars[var] = value except ValueError: logging.error( - "Could not parse process variables for align %d (template %d): '%s'" - % ( - alignment["id"], - process_template["id"], - process_template["process_variables"], - ) + "Could not parse process variables for align %d (template %d): '%s'", + alignment["id"], + process_template["id"], + process_template["process_variables"], ) return False if self.dry_run: - logging.info("Dry run, would have created: %s" % script_file) + logging.info("Dry run, would have created: %s", script_file) logging.debug(env_vars) self.create_sample_config( processing_info, alignment, script_directory, pool_name @@ -710,7 
+712,7 @@ def create_script(self, processing_info, align_id): return True if not os.path.exists(script_directory): - logging.info("Creating directory %s" % script_directory) + logging.info("Creating directory %s", script_directory) os.makedirs(script_directory) # Append to master script @@ -953,7 +955,9 @@ def info_to_data(well_info): wells.append(well_data) return wells - def reverse_complement(bc: "Optional[str]") -> "Optional[str]": # noqa: F821 + def reverse_complement( + bc: "Optional[str]", # noqa: F821 + ) -> "Optional[str]": # noqa: F821 if bc is None: return None lookup = {"A": "T", "T": "A", "C": "G", "G": "C"} @@ -1015,7 +1019,7 @@ def get_num(tl): if match: return int(match.group(1)) else: - logging.warning("Weird talen: '%s'" % tl) + logging.warning("Weird talen: '%s'", tl) return 0 return sorted(tls, key=get_num) diff --git a/scripts/utility/md5check.py b/scripts/utility/md5check.py index 2c7d01eb..b7ba753e 100755 --- a/scripts/utility/md5check.py +++ b/scripts/utility/md5check.py @@ -15,7 +15,7 @@ def check_md5sum(filename, md5sum): - logging.debug("Checking file %s matches %s" % (filename, md5sum)) + logging.debug("Checking file %s matches %s", filename, md5sum) current_md5sum = subprocess.check_output( ["md5sum", filename], stderr=subprocess.STDOUT, universal_newlines=True ).split()[0] @@ -24,8 +24,10 @@ def check_md5sum(filename, md5sum): if not match: logging.error( - "md5sum for file %s does not match: %s recorded, %s as exists" - % (filename, md5sum, current_md5sum) + "md5sum for file %s does not match: %s recorded, %s as exists", + filename, + md5sum, + current_md5sum, ) return match diff --git a/scripts/utility/movesymlinks.py b/scripts/utility/movesymlinks.py index eaf44466..ef1c3128 100755 --- a/scripts/utility/movesymlinks.py +++ b/scripts/utility/movesymlinks.py @@ -69,12 +69,12 @@ def detect(self, path): path = path.rstrip("/") if not os.path.islink(path): - logging.debug("%s not a symlink" % path) + logging.debug("%s not a symlink", path) return target_path = os.readlink(path) broken = False - logging.debug("checking %s" % path) + logging.debug("checking %s", path) # Resolve relative symlinks if not os.path.isabs(target_path): target_path_absolute = os.path.join(os.path.dirname(path), target_path) @@ -82,7 +82,7 @@ def detect(self, path): target_path_absolute = target_path self.alllinks.append(path) if self.olddir in target_path: - logging.debug("path %s target %s" % (path, target_path_absolute)) + logging.debug("path %s target %s", path, target_path_absolute) self.movedlinks.append(path) if not os.path.exists(target_path_absolute): broken = True @@ -99,24 +99,26 @@ def move_link(self, linkpath): new_target_path = old_target_path.replace(self.olddir, self.newdir) logging.info( - "Moving %s pointer from %s to %s" - % (linkpath, old_target_path, new_target_path) + "Moving %s pointer from %s to %s", + linkpath, + old_target_path, + new_target_path, ) try: if self.domove: os.unlink(linkpath) os.symlink(new_target_path, linkpath) except PermissionError: - logging.error("Couldn't move %s, permission denied" % linkpath) + logging.error("Couldn't move %s, permission denied", linkpath) def walk(self, directory): for root, dirs, files in os.walk(directory): if root.startswith("./.git"): # Ignore the .git directory. 
continue - logging.debug("walking through directories for %s" % root) + logging.debug("walking through directories for %s", root) [self.detect(os.path.join(root, dirname)) for dirname in dirs] - logging.debug("walking through files for %s" % root) + logging.debug("walking through files for %s", root) [self.detect(os.path.join(root, filename)) for filename in files] def run(self, report=None): @@ -127,13 +129,13 @@ def run(self, report=None): logging.info("Detecting symlinks") self.walk(self.fromdir) - logging.info("%d symlinks found in total" % len(self.alllinks)) + logging.info("%d symlinks found in total", len(self.alllinks)) if self.brokenlinks: logging.info("broken symlink(s) found:") for link in self.brokenlinks: - logging.info("\t%s" % link) + logging.info("\t%s", link) if self.movedlinks: - logging.info("%s symlinks to move" % len(self.movedlinks)) + logging.info("%s symlinks to move", len(self.movedlinks)) logging.info("Symlink moves...") [self.move_link(link) for link in self.movedlinks] From 9eeb4b159d3e8f152b4895a264d851f818baf693 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Thu, 12 Sep 2024 11:40:47 -0700 Subject: [PATCH 156/172] First draft of fastq pipeline container --- containers/fastq/fastq.def | 47 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 containers/fastq/fastq.def diff --git a/containers/fastq/fastq.def b/containers/fastq/fastq.def new file mode 100644 index 00000000..6bd8ad8b --- /dev/null +++ b/containers/fastq/fastq.def @@ -0,0 +1,47 @@ +Bootstrap: docker +From: rockylinux:8.6 + +%arguments + FASTQC_VERSION=v0.11.5 + +%help + This container contains the necessary software to run the first stage of + the pipeline, beginning from retrieving metadata from LIMS, running + bcl2fastq and demultiplexing, collating fastq files, running FastQC, and + registering output with LIMS.
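For orientation, building and smoke-testing this image might look roughly like the following; the bind paths are just the ones used by the surrounding pipeline scripts, and `apptainer build` may need `--fakeroot` or root privileges depending on how apptainer is configured on the build host:

# Build the image from this definition file (run from the repository root).
apptainer build fastq.sif containers/fastq/fastq.def

# Smoke-test one of the bundled tools, binding the data directories the
# pipeline scripts expect (example paths only).
apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells \
    fastq.sif fastqc --version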
+ +%files + ./bcl2fastq /usr/local/bin/bcl2fastq + ../../scripts/lims/stamlims_api /opt/ + +%post + set -e + dnf install -y python39 + ln -s /usr/bin/python3 /usr/bin/python + ls /opt/stamlims_api/ + + python -m pip install /opt/stamlims_api/ + dnf install -y rsync + + # install fastqc + # https://github.com/s-andrews/FastQC/ + ( + dnf install -y unzip perl java + cd /opt + curl https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_{{ FASTQC_VERSION }}.zip --output fastqc.zip + unzip fastqc.zip + rm fastqc.zip + chmod +x FastQC/fastqc + ln -s /opt/FastQC/fastqc /usr/local/bin/fastqc + ) + + +%test + set -e + bcl2fastq --version + rsync --version + fastqc --version + gzip --version + python3 --version + +# vim: noexpandtab ts=4 sts=4 sw=4 From 41ce2051a4b0741ad2abcebdd114adcfd6ded8c6 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 17 Sep 2024 08:06:24 -0700 Subject: [PATCH 157/172] Add biopython to fastq container --- containers/fastq/fastq.def | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/containers/fastq/fastq.def b/containers/fastq/fastq.def index 6bd8ad8b..3d6a1d11 100644 --- a/containers/fastq/fastq.def +++ b/containers/fastq/fastq.def @@ -18,9 +18,8 @@ From: rockylinux:8.6 set -e dnf install -y python39 ln -s /usr/bin/python3 /usr/bin/python - ls /opt/stamlims_api/ - python -m pip install /opt/stamlims_api/ + python -m pip install Biopython dnf install -y rsync # install fastqc From 21ee6a2c23bc3fe51a00238bd156f7cfdeaac375 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 18 Sep 2024 07:58:31 -0700 Subject: [PATCH 158/172] Update script to use apptainer when able --- scripts/flowcells/setup.sh | 118 ++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index d85fc299..faff7e3a 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -2,15 +2,27 @@ # shellcheck disable=SC1090 # shellcheck disable=SC2162 +DEFAULT_QUEUE="${DEFAULT_QUEUE:-hpcz-test}" # hpcz-2 on old cluster +SLOW_QUEUE="${SLOW_QUEUE:-hpcz-test}" # used to be queue0 + set -o errexit set -o pipefail # Dependencies -source "$MODULELOAD" -source "$PYTHON3_ACTIVATE" +[[ -s "$MODULELOAD" ]] && source "$MODULELOAD" +[[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source "$STAMPIPES/scripts/sentry/sentry-lib.bash" +# Run in apptainer if necessary +if command -v apptainer ; then + echo "Using apptainer" + export apx="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" +else + echo "Not using apptainer" + export apx= +fi + ######### # Options ######### @@ -194,7 +206,7 @@ source "$STAMPIPES/scripts/lims/api_functions.sh" ) # Get and read the processing script -python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" +$apx python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" run_type=$( jq -r '.flowcell.run_type' "$json" ) analysis_dir=$( jq -r '.alignment_group.directory' "$json" ) mask=$( jq -r '.alignment_group.bases_mask' "$json" ) @@ -228,7 +240,7 @@ if [[ "$read1length" = "0" ]] ; then #!/bin/bash sbatch --cpus 1 \ --mem '4G' \ - --partition hpcz-2 \ + --partition "$DEFAULT_QUEUE" \ --job-name "altseq-$flowcell-supervisor" < SampleSheet.csv bcl_tasks=1 unaligned_command=$regular_bcl_command @@ -473,11 +485,11 @@ case $run_type in "HiSeq 4000") echo "Hiseq 4000 run detected" parallel_env="-pe threads 6" - link_command="python3 
$STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--hiseq4k" - queue="queue0" + queue="$SLOW_QUEUE" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1-8 unaligned_command=$regular_bcl_command @@ -486,11 +498,11 @@ case $run_type in # Identical to nextseq processing echo "High-output MiniSeq run detected for DNase" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue0" + queue="$SLOW_QUEUE" make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 unaligned_command=$regular_bcl_command @@ -499,11 +511,11 @@ case $run_type in # Identical to nextseq processing echo "Mid-output MiniSeq run detected for GUIDEseq" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue0" + queue="$SLOW_QUEUE" minidemux="True" # placeholder cp /home/dchee7/projects/guide-seq/data/samplesheets/SampleSheet.csv SampleSheet.csv @@ -521,11 +533,11 @@ _U_ # Identical to nextseq processing echo "Mid-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" - queue="queue0" + queue="$SLOW_QUEUE" minidemux="True" make_miniseq_samplesheet > SampleSheet.csv bcl_tasks=1 @@ -542,11 +554,11 @@ _U_ # Identical to nextseq processing echo "High-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--miniseq" - queue="queue0" + queue="$SLOW_QUEUE" minidemux="True" # placeholder cat /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv > SampleSheet.csv @@ -623,13 +635,13 @@ cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -source $MODULELOAD +[[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load bcl2fastq2/2.17.1.14 -source $PYTHON3_ACTIVATE +[[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source $STAMPIPES/scripts/lims/api_functions.sh # Register the file directory -python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +$apx python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -692,13 +704,13 @@ fi cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash -source $MODULELOAD +[[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load bcl2fastq2/2.20.0.422 -source $PYTHON3_ACTIVATE +[[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source $STAMPIPES/scripts/lims/api_functions.sh # Register the file directory -python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +$apx python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -715,13 +727,13 @@ lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flo lims_patch "flowcell_run/$flowcell_id/" "folder_name=${PWD##*/}" # Submit a barcode job for each mask -for bcmask in $(python $STAMPIPES/scripts/flowcells/barcode_masks.py | xargs) ; do +for bcmask in $($apx python $STAMPIPES/scripts/flowcells/barcode_masks.py | xargs) ; do export bcmask bcjobid=\$(sbatch --export=ALL -J "bc-$flowcell" -o "bc-$flowcell.o%A" -e "bc-$flowcell.e%A" --partition=$queue --cpus-per-task=10 --ntasks=1 --mem-per-cpu=6400 --parsable --oversubscribe --mail-type=FAIL --mail-user=sequencing@altius.org <<'__BARCODES__' #!/bin/bash bcl_barcode_count --mask=\$bcmask $bc_flag > barcodes.\$bcmask.json -python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json -bctest=\$(python $STAMPIPES/scripts/flowcells/barcode_check.py --barcodes barcodes.\$bcmask.json --processing processing.json --bcmask \$bcmask) +$apx python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json +bctest=\$($apx python $STAMPIPES/scripts/flowcells/barcode_check.py --barcodes barcodes.\$bcmask.json --processing processing.json --bcmask \$bcmask) if [ \$bctest = "FALSE" ]; then exit 1 @@ -744,8 +756,8 @@ __BCL2FASTQ__ cat > run_bcl2fastq_2.sh <<__BCL2FASTQ2__ # !/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -source "$MODULELOAD" -source "$PYTHON3_ACTIVATE" +[[ -s "$MODULELOAD" ]] && source "$MODULELOAD" +[[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source "$STAMPIPES/scripts/lims/api_functions.sh" if [[ -n "$demux" ]] ; then @@ -768,7 +780,7 @@ for i in "\${inputfiles[@]}" ; do jobid=\$(sbatch --export=ALL -J dmx\$(basename "\$i") -o .dmx\$(basename "\$i").o%A -e .dmx\$(basename "\$i").e%A --partition $queue --ntasks=1 --cpus-per-task=1 --mem-per-cpu=4000 --parsable --oversubscribe <<__DEMUX__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" - python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ + $apx python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ \$suffix \ --processing "$json" \ --outdir "$copy_from_dir" \ @@ 
-846,7 +858,7 @@ cd "$analysis_dir" rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash # Create fastqc scripts -python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +$apx python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template "$STAMPIPES/processes/fastq/fastqc.bash" \ --qsub-prefix .fq \ --queue $queue \ @@ -855,7 +867,7 @@ python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --outfile fastqc.bash # Create collation scripts -python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +$apx python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template "$STAMPIPES/processes/fastq/collate_fastq.bash" \ --qsub-prefix .collatefq \ --queue $queue \ @@ -863,7 +875,7 @@ python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --flowcell_label "$flowcell" \ --outfile collate.bash -bash collate.bash +$apx bash collate.bash # Wait for collation jobs to finish while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do @@ -871,16 +883,16 @@ while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do done # Run fastQC -bash fastqc.bash +$apx bash fastqc.bash # Set up of flowcell alignments -python3 "$STAMPIPES/scripts/alignprocess.py" \ +$apx python3 "$STAMPIPES/scripts/alignprocess.py" \ --flowcell "$flowcell" \ --auto_aggregate \ - --qsub-queue queue0 \ + --qsub-queue $SLOW_QUEUE \ --outfile run_alignments.bash -python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash +$apx python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash # Set up of flowcell aggregations curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" From 8cb09012f74936e4ba54a7bcf5731d7e1f7ea51e Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 22 Sep 2024 10:12:03 -0700 Subject: [PATCH 159/172] collation/fastq updates for container --- processes/fastq/collate_fastq.bash | 14 +++++- processes/fastq/fastqc.bash | 20 ++++++-- scripts/flowcells/setup.sh | 73 ++++++++++++++++-------------- 3 files changed, 65 insertions(+), 42 deletions(-) diff --git a/processes/fastq/collate_fastq.bash b/processes/fastq/collate_fastq.bash index 8cdcc46a..990e2fc9 100644 --- a/processes/fastq/collate_fastq.bash +++ b/processes/fastq/collate_fastq.bash @@ -4,6 +4,16 @@ # Ensure that script failures have the script quit before it deletes files set -e +CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') +if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then + module load apptainer/1.3.3 + echo "# Using apptainer" + export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" +else + echo "# Not using apptainer" + export APX= +fi + cd $FASTQ_DIR FASTQ_NAME=${FLOWCELL}_${SAMPLE_NAME} @@ -28,10 +38,10 @@ R2_FILE=${FASTQ_NAME}_R2.fastq.gz function upload { if [[ "$SAMPLE_NAME" == LP* ]] ; then # Altcode sample, use dedicated script - python3 "$STAMPIPES/scripts/altcode/upload_fastq.py" --lane "$FLOWCELL_LANE_ID" --r1 "$R1_FILE" --r2 "$R2_FILE" + $APX python3 "$STAMPIPES/scripts/altcode/upload_fastq.py" --lane "$FLOWCELL_LANE_ID" --r1 "$R1_FILE" --r2 "$R2_FILE" else # Regular sample, upload old-style - UPLOAD_SCRIPT="python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane --attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" + UPLOAD_SCRIPT="$APX python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane 
--attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" $UPLOAD_SCRIPT --attach_file_purpose r1-fastq --attach_file "${R1_FILE}" if [ -e "$R2_FILE" ]; then diff --git a/processes/fastq/fastqc.bash b/processes/fastq/fastqc.bash index b9259217..538184c5 100644 --- a/processes/fastq/fastqc.bash +++ b/processes/fastq/fastqc.bash @@ -1,5 +1,5 @@ # Dependencies -source $MODULELOAD +[[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load jdk/1.8.0_92 module load picard/2.8.1 module load fastqc/0.11.5 @@ -13,6 +13,16 @@ export TOP_UMIS=${SAMPLE_NAME}.topumis.txt.gz cd $FASTQ_DIR +CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') +if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then + module load apptainer/1.3.3 + echo "# Using apptainer" + export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" +else + echo "# Not using apptainer" + export APX= +fi + if [ ! -e "$R1_FASTQC" -o ! -e "$R2_FASTQC" ]; then set -x -e -o pipefail @@ -24,9 +34,9 @@ if [ ! -e "$R1_FASTQC" -o ! -e "$R2_FASTQC" ]; then date cd $FASTQ_DIR - make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R1_FASTQ FASTQC_FILE=$R1_FASTQC + $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R1_FASTQ FASTQC_FILE=$R1_FASTQC if [ "$PAIRED" = "True" ]; then - make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R2_FASTQ FASTQC_FILE=$R2_FASTQC + $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R2_FASTQ FASTQC_FILE=$R2_FASTQC fi if [ "$UMI" = "True" ]; then @@ -35,10 +45,10 @@ if [ ! -e "$R1_FASTQC" -o ! -e "$R2_FASTQC" ]; then fi if [ "$PAIRED" = "True" ]; then - python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ + $APX python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ --fastqcfile $R1_FASTQC --fastqcfile $R2_FASTQC else - python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ + $APX python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ --fastqcfile $R1_FASTQC fi diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index faff7e3a..6bbe02d8 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -14,15 +14,18 @@ set -o pipefail source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -# Run in apptainer if necessary -if command -v apptainer ; then - echo "Using apptainer" - export apx="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" +CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') +if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then + module load apptainer/1.3.3 + echo "# Using apptainer" + # Warning: if STAMPIPES contains spaces or glob chars this will likely break + export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" else - echo "Not using apptainer" - export apx= + echo "# Not using apptainer" + export APX= fi + ######### # Options ######### @@ -206,7 +209,7 @@ source "$STAMPIPES/scripts/lims/api_functions.sh" ) # Get and read the processing script -$apx python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" +$APX python3 "$STAMPIPES/scripts/lims/get_processing.py" -f "$flowcell" -o "$json" run_type=$( jq -r '.flowcell.run_type' "$json" ) analysis_dir=$( jq -r 
'.alignment_group.directory' "$json" ) mask=$( jq -r '.alignment_group.bases_mask' "$json" ) @@ -254,7 +257,7 @@ fi if [ -z "$demux" ] ; then bcl_mask=$mask - mismatches=$($apx python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions) + mismatches=$($APX python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes --allow_collisions) if [ "$has_umi" == "true" ] ; then echo "---WARNING---" echo "Flowcell contains UMI samples, but -d param was not specified" @@ -265,7 +268,7 @@ if [ -z "$demux" ] ; then else # Set some options for manual demultiplexing bcl_mask=$(tr Nn Ii <<< $mask) mismatches="0,0" - dmx_mismatches=$($apx python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes | cut -c1 ) + dmx_mismatches=$($APX python3 $STAMPIPES/scripts/flowcells/max_mismatch.py --ignore_failed_lanes | cut -c1 ) fi # Long command definitions @@ -350,7 +353,7 @@ _NOVA_SUBMIT_CMD_ read -d '' novaseq_link_command <<'_NOVA_LINK_CMD_' for fq_dir in fastq-withmask-* ; do [[ -d $fq_dir ]] || continue - $apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_dir" -o Demultiplexed -p processing.json + $APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i "$fq_dir" -o Demultiplexed -p processing.json done _NOVA_LINK_CMD_ set -e @@ -373,7 +376,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -388,7 +391,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -403,7 +406,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -418,7 +421,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -433,7 +436,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -448,7 +451,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 #unaligned_command=$novaseq_bcl_command submit_bcl2fastq_cmd=$novaseq_submit_command @@ -464,7 +467,7 @@ case $run_type in fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--novaseq" queue="$DEFAULT_QUEUE" - $apx python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json + $APX python "$STAMPIPES/scripts/flowcells/make_samplesheets.py" --reverse_barcode1 -p processing.json bcl_tasks=1 unaligned_command=$novaseq_bcl_command @@ -473,7 +476,7 @@ case $run_type in echo "Regular NextSeq 500 run detected" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--nextseq" @@ -485,7 +488,7 @@ case $run_type in "HiSeq 4000") echo "Hiseq 4000 run detected" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--hiseq4k" @@ -498,7 +501,7 @@ case $run_type in # Identical to nextseq processing echo "High-output MiniSeq run detected for DNase" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -511,7 +514,7 @@ case $run_type in # Identical to nextseq processing echo "Mid-output MiniSeq run detected for GUIDEseq" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -533,7 +536,7 @@ _U_ # Identical to nextseq processing echo "Mid-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . 
--merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -554,7 +557,7 @@ _U_ # Identical to nextseq processing echo "High-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="$apx python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -641,7 +644,7 @@ module load bcl2fastq2/2.17.1.14 source $STAMPIPES/scripts/lims/api_functions.sh # Register the file directory -$apx python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -710,7 +713,7 @@ module load bcl2fastq2/2.20.0.422 source $STAMPIPES/scripts/lims/api_functions.sh # Register the file directory -$apx python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -727,13 +730,13 @@ lims_patch "flowcell_run/$flowcell_id/" "status=https://lims.stamlab.org/api/flo lims_patch "flowcell_run/$flowcell_id/" "folder_name=${PWD##*/}" # Submit a barcode job for each mask -for bcmask in $($apx python $STAMPIPES/scripts/flowcells/barcode_masks.py | xargs) ; do +for bcmask in $($APX python $STAMPIPES/scripts/flowcells/barcode_masks.py | xargs) ; do export bcmask bcjobid=\$(sbatch --export=ALL -J "bc-$flowcell" -o "bc-$flowcell.o%A" -e "bc-$flowcell.e%A" --partition=$queue --cpus-per-task=10 --ntasks=1 --mem-per-cpu=6400 --parsable --oversubscribe --mail-type=FAIL --mail-user=sequencing@altius.org <<'__BARCODES__' #!/bin/bash bcl_barcode_count --mask=\$bcmask $bc_flag > barcodes.\$bcmask.json -$apx python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json -bctest=\$($apx python $STAMPIPES/scripts/flowcells/barcode_check.py --barcodes barcodes.\$bcmask.json --processing processing.json --bcmask \$bcmask) +$APX python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json +bctest=\$($APX python $STAMPIPES/scripts/flowcells/barcode_check.py --barcodes barcodes.\$bcmask.json --processing processing.json --bcmask \$bcmask) if [ \$bctest = "FALSE" ]; then exit 1 @@ -780,7 +783,7 @@ for i in "\${inputfiles[@]}" ; do jobid=\$(sbatch --export=ALL -J dmx\$(basename "\$i") -o .dmx\$(basename "\$i").o%A -e .dmx\$(basename "\$i").e%A --partition $queue --ntasks=1 --cpus-per-task=1 --mem-per-cpu=4000 --parsable --oversubscribe <<__DEMUX__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" - $apx python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ + $APX python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ \$suffix \ --processing "$json" \ --outdir "$copy_from_dir" \ @@ -858,7 +861,7 @@ cd "$analysis_dir" rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash # Create fastqc scripts -$apx python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template "$STAMPIPES/processes/fastq/fastqc.bash" \ --qsub-prefix .fq \ --queue $queue \ @@ -867,7 +870,7 @@ $apx python3 
"$STAMPIPES/scripts/apilaneprocess.py" \ --outfile fastqc.bash # Create collation scripts -$apx python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template "$STAMPIPES/processes/fastq/collate_fastq.bash" \ --qsub-prefix .collatefq \ --queue $queue \ @@ -875,7 +878,7 @@ $apx python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --flowcell_label "$flowcell" \ --outfile collate.bash -$apx bash collate.bash +bash collate.bash # Wait for collation jobs to finish while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do @@ -883,16 +886,16 @@ while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do done # Run fastQC -$apx bash fastqc.bash +bash fastqc.bash # Set up of flowcell alignments -$apx python3 "$STAMPIPES/scripts/alignprocess.py" \ +$APX python3 "$STAMPIPES/scripts/alignprocess.py" \ --flowcell "$flowcell" \ --auto_aggregate \ --qsub-queue $SLOW_QUEUE \ --outfile run_alignments.bash -$apx python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash +$APX python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash # Set up of flowcell aggregations curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" From de4d6879e59e92fe9101711ad249d4a5b7a29a07 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Oct 2024 10:53:43 -0700 Subject: [PATCH 160/172] rework for run-time cluster loc detection previously the code generated would depend solely on where setup.sh was run, so if the execution didn't match, you would get surprising errors --- scripts/flowcells/setup.sh | 108 ++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 32 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 6bbe02d8..a3e90add 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -5,6 +5,9 @@ DEFAULT_QUEUE="${DEFAULT_QUEUE:-hpcz-test}" # hpcz-2 on old cluster SLOW_QUEUE="${SLOW_QUEUE:-hpcz-test}" # used to be queue0 +ALIGN_NODE="${ALIGN_NODE:-dev2.altiusinstitute.org}" # Which node to run alignments on, curerntly should be on "old cluster" +OLD_SLOW_QUEUE=${OLD_SLOW_QUEUE:-queue0} + set -o errexit set -o pipefail @@ -14,16 +17,32 @@ set -o pipefail source "$STAMPIPES/scripts/sentry/sentry-lib.bash" -CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') -if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then - module load apptainer/1.3.3 - echo "# Using apptainer" - # Warning: if STAMPIPES contains spaces or glob chars this will likely break - export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" -else - echo "# Not using apptainer" - export APX= -fi +# Define code for checking if we are running on the new or old cluster +# These are defined as functions so that we can copy them to our other scripts with `$(declare -f name_of_func)` +on_new_cluster () { + local clustername + clustername=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') + # TODO: Can we extract 'altius-gene' to a variable at the top of setup.sh? 
+ [[ "$clustername" == "altius-gene" ]] +} + +set_cluster_vars () { + if on_new_cluster ; then + echo "# Using apptainer" + module load apptainer/1.3.3 + export ON_NEW_CLUSTER=1 + # Warning: if STAMPIPES contains spaces or glob chars this will likely break + export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" + export LOAD_APPTAINER="module load apptainer/1.3.3" + else + echo "# Not using apptainer" + unset ON_NEW_CLUSTER + export APX= + export LOAD_APPTAINER= + fi +} +# Immediately export our variables +set_cluster_vars ######### @@ -476,7 +495,7 @@ case $run_type in echo "Regular NextSeq 500 run detected" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--nextseq" @@ -488,7 +507,7 @@ case $run_type in "HiSeq 4000") echo "Hiseq 4000 run detected" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o ." samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--hiseq4k" @@ -501,7 +520,7 @@ case $run_type in # Identical to nextseq processing echo "High-output MiniSeq run detected for DNase" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -514,7 +533,7 @@ case $run_type in # Identical to nextseq processing echo "Mid-output MiniSeq run detected for GUIDEseq" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -536,7 +555,7 @@ _U_ # Identical to nextseq processing echo "Mid-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! bc_flag="--miniseq" @@ -557,7 +576,7 @@ _U_ # Identical to nextseq processing echo "High-output MiniSeq run detected" parallel_env="-pe threads 6" - link_command="$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" + link_command="\$APX python3 $STAMPIPES/scripts/flowcells/link_nextseq.py -i fastq -o . --merge-across-lanes" samplesheet="SampleSheet.csv" fastq_dir="$illumina_dir/fastq" # Lack of trailing slash is important for rsync! 
bc_flag="--miniseq" @@ -638,13 +657,18 @@ cat > run_bcl2fastq.sh <<__BCL2FASTQ__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" +$(declare -f on_new_cluster) +$(declare -f set_cluster_vars) +set_cluster_vars + [[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load bcl2fastq2/2.17.1.14 +\$LOAD_APPTAINER [[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source $STAMPIPES/scripts/lims/api_functions.sh # Register the file directory -$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +\$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -709,11 +733,16 @@ cat > run_bcl2fastq.sh <<__BCL2FASTQ__ [[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load bcl2fastq2/2.20.0.422 +\$LOAD_APPTAINER [[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source $STAMPIPES/scripts/lims/api_functions.sh +$(declare -f on_new_cluster) +$(declare -f set_cluster_vars) +set_cluster_vars + # Register the file directory -$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ +\$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ --attach_file_contenttype SequencingData.flowcellrun \ --attach_file_purpose flowcell-directory \ @@ -735,7 +764,7 @@ for bcmask in $($APX python $STAMPIPES/scripts/flowcells/barcode_masks.py | xarg bcjobid=\$(sbatch --export=ALL -J "bc-$flowcell" -o "bc-$flowcell.o%A" -e "bc-$flowcell.e%A" --partition=$queue --cpus-per-task=10 --ntasks=1 --mem-per-cpu=6400 --parsable --oversubscribe --mail-type=FAIL --mail-user=sequencing@altius.org <<'__BARCODES__' #!/bin/bash bcl_barcode_count --mask=\$bcmask $bc_flag > barcodes.\$bcmask.json -$APX python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json +\$APX python3 $STAMPIPES/scripts/lims/upload_data.py --barcode_report barcodes.\$bcmask.json bctest=\$($APX python $STAMPIPES/scripts/flowcells/barcode_check.py --barcodes barcodes.\$bcmask.json --processing processing.json --bcmask \$bcmask) if [ \$bctest = "FALSE" ]; then @@ -763,6 +792,10 @@ source "$STAMPIPES/scripts/sentry/sentry-lib.bash" [[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source "$STAMPIPES/scripts/lims/api_functions.sh" +$(declare -f on_new_cluster) +$(declare -f set_cluster_vars) +set_cluster_vars + if [[ -n "$demux" ]] ; then # demultiplex if [ -d "$fastq_dir.L001" ] ; then @@ -783,7 +816,8 @@ for i in "\${inputfiles[@]}" ; do jobid=\$(sbatch --export=ALL -J dmx\$(basename "\$i") -o .dmx\$(basename "\$i").o%A -e .dmx\$(basename "\$i").e%A --partition $queue --ntasks=1 --cpus-per-task=1 --mem-per-cpu=4000 --parsable --oversubscribe <<__DEMUX__ #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" - $APX python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ + \$LOAD_APPTAINER + \$APX python3 $STAMPIPES/scripts/flowcells/demux_fastq.py \ \$suffix \ --processing "$json" \ --outdir "$copy_from_dir" \ @@ -856,24 +890,26 @@ sbatch --export=ALL -J "collate-$flowcell" \$copy_dependency -o "collate-$flowce #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" +\$LOAD_APPTAINER + cd "$analysis_dir" # Remove existing scripts if they exist (to avoid appending) rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash # Create fastqc scripts -$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +\$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template 
"$STAMPIPES/processes/fastq/fastqc.bash" \ --qsub-prefix .fq \ - --queue $queue \ + --queue "$queue" \ --sample-script-basename fastqc.bash \ --flowcell_label "$flowcell" \ --outfile fastqc.bash # Create collation scripts -$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ +\$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ --script_template "$STAMPIPES/processes/fastq/collate_fastq.bash" \ --qsub-prefix .collatefq \ - --queue $queue \ + --queue "$queue" \ --sample-script-basename "collate.bash" \ --flowcell_label "$flowcell" \ --outfile collate.bash @@ -889,20 +925,28 @@ done bash fastqc.bash # Set up of flowcell alignments -$APX python3 "$STAMPIPES/scripts/alignprocess.py" \ - --flowcell "$flowcell" \ - --auto_aggregate \ - --qsub-queue $SLOW_QUEUE \ +\$APX python3 "$STAMPIPES/scripts/alignprocess.py" \ + --flowcell "$flowcell" \ + --auto_aggregate \ + --qsub-queue "$OLD_SLOW_QUEUE" \ --outfile run_alignments.bash -$APX python3 "$STAMPIPES/scripts/poolprocess.py" --flowcell "$flowcell" --outfile run_pools.bash +\$APX python3 "$STAMPIPES/scripts/poolprocess.py" \ + --flowcell "$flowcell" \ + --qsub-queue "$OLD_SLOW_QUEUE" \ + --outfile run_pools.bash # Set up of flowcell aggregations curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" -# Run alignments -bash run_alignments.bash -bash run_pools.bash +if on_new_cluster ; then + ssh "$ALIGN_NODE" bash -c "cd \$PWD && bash run_alignments.bash" + ssh "$ALIGN_NODE" bash -c "cd \$PWD && bash run_pools.bash" +else + # Run alignments + bash run_alignments.bash + bash run_pools.bash +fi __COLLATE__ From 6255a8e19996c5047bd2d141fda329680af6f989 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Oct 2024 13:05:14 -0700 Subject: [PATCH 161/172] bcl2fastq actually runs in apptainer now --- scripts/flowcells/setup.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index a3e90add..6919b8f9 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -298,7 +298,7 @@ fi set +e read -d '' regular_bcl_command << _REG_BCL_CMD_ PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --use-bases-mask "$bcl_mask" \\\\ --output-dir "$fastq_dir" \\\\ @@ -313,7 +313,7 @@ read -d '' novaseq_bcl_command << _NOVA_BCL_CMD_ for samplesheet in SampleSheet.withmask*csv ; do bcl_mask=\$(sed 's/.*withmask\\.//;s/\\.csv//' <<< \$samplesheet) fastq_dir=\$(sed 's/,/-/g' <<< "fastq-withmask-\$bcl_mask") - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "${illumina_dir}/\$fastq_dir" \\\\ --use-bases-mask "\$bcl_mask" \\\\ @@ -348,7 +348,7 @@ for samplesheet in SampleSheet.withmask*csv ; do set -x -e -o pipefail cd "${illumina_dir}" PATH=/home/nelsonjs/src/bcl2fastq2/bin/:\$PATH - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "${illumina_dir}/\\\$fastq_dir" \\\\ --use-bases-mask "\\\$bcl_mask" \\\\ @@ -544,7 +544,7 @@ case $run_type in bcl_tasks=1 set +e read -d '' unaligned_command << _U_ - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "$fastq_dir" \\\\ --create-fastq-for-index-reads @@ -565,7 +565,7 @@ _U_ bcl_tasks=1 set +e read -d '' unaligned_command << _U_ - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir 
"${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "$fastq_dir" \\\\ --no-lane-splitting @@ -588,7 +588,7 @@ _U_ bcl_tasks=1 set +e read -d '' unaligned_command << _U_ - bcl2fastq \\\\ + \$APX bcl2fastq \\\\ --input-dir "${illumina_dir}/Data/Intensities/BaseCalls" \\\\ --output-dir "$fastq_dir" \\\\ --no-lane-splitting @@ -733,7 +733,6 @@ cat > run_bcl2fastq.sh <<__BCL2FASTQ__ [[ -s "$MODULELOAD" ]] && source "$MODULELOAD" module load bcl2fastq2/2.20.0.422 -\$LOAD_APPTAINER [[ -s "$PYTHON3_ACTIVATE" ]] && source "$PYTHON3_ACTIVATE" source $STAMPIPES/scripts/lims/api_functions.sh @@ -741,6 +740,8 @@ $(declare -f on_new_cluster) $(declare -f set_cluster_vars) set_cluster_vars +\$LOAD_APPTAINER + # Register the file directory \$APX python3 "$STAMPIPES/scripts/lims/upload_data.py" \ --attach_directory "$analysis_dir" \ From d8ef59d8dd2a95676f87ddc7bfddbf904038b11c Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 15 Oct 2024 12:32:34 -0700 Subject: [PATCH 162/172] Final tweaks Fix occasional oom error, cleanup run_pools.sh properly, and make sure run_pool.sh and run_alignments.sh are always properly submitted. --- scripts/flowcells/setup.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 6919b8f9..e36f06d9 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -887,15 +887,19 @@ if [[ -n \$copy_jobid ]]; then fi # Collate -sbatch --export=ALL -J "collate-$flowcell" \$copy_dependency -o "collate-$flowcell.o%A" -e "collate-$flowcell.e%A" --partition=$queue --cpus-per-task=1 --ntasks=1 --mem-per-cpu=1000 --parsable --oversubscribe <<'__COLLATE__' +sbatch --export=ALL -J "collate-$flowcell" \$copy_dependency -o "collate-$flowcell.o%A" -e "collate-$flowcell.e%A" --partition=$queue --cpus-per-task=1 --ntasks=1 --mem-per-cpu=4000 --parsable --oversubscribe <<'__COLLATE__' #!/bin/bash source "$STAMPIPES/scripts/sentry/sentry-lib.bash" +$(declare -f on_new_cluster) +$(declare -f set_cluster_vars) +set_cluster_vars + \$LOAD_APPTAINER cd "$analysis_dir" # Remove existing scripts if they exist (to avoid appending) -rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash +rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash run_pools.sh # Create fastqc scripts \$APX python3 "$STAMPIPES/scripts/apilaneprocess.py" \ @@ -941,8 +945,8 @@ bash fastqc.bash curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" if on_new_cluster ; then - ssh "$ALIGN_NODE" bash -c "cd \$PWD && bash run_alignments.bash" - ssh "$ALIGN_NODE" bash -c "cd \$PWD && bash run_pools.bash" + ssh "$ALIGN_NODE" bash --login "\$PWD/run_alignments.bash" + ssh "$ALIGN_NODE" bash --login "\$PWD/run_pools.bash" else # Run alignments bash run_alignments.bash From 718d54e12327c99fc4bd9bc9052b246598b8c506 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 15 Oct 2024 12:45:08 -0700 Subject: [PATCH 163/172] Fix fastqc script to run on new cluster --- processes/fastq/fastqc.bash | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/processes/fastq/fastqc.bash b/processes/fastq/fastqc.bash index 538184c5..ee47b5b1 100644 --- a/processes/fastq/fastqc.bash +++ b/processes/fastq/fastqc.bash @@ -1,8 +1,10 @@ # Dependencies [[ -s "$MODULELOAD" ]] && source "$MODULELOAD" +{ module load jdk/1.8.0_92 module load picard/2.8.1 module load fastqc/0.11.5 +} || true # ignore module load failures export 
FASTQ_NAME=${FLOWCELL}_${SAMPLE_NAME} export R1_FASTQ=${FASTQ_NAME}_R1.fastq.gz @@ -52,7 +54,7 @@ if [ ! -e "$R1_FASTQC" -o ! -e "$R2_FASTQC" ]; then --fastqcfile $R1_FASTQC fi - bash $STAMPIPES/scripts/fastq/attachfiles.bash + $APX bash $STAMPIPES/scripts/fastq/attachfiles.bash echo "FINISH: " date From 2d0344790643cfec12531b90a87749563503037d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 20 Oct 2024 11:09:18 -0700 Subject: [PATCH 164/172] Fix: fastqc/alignments/pools wait for collation Basically this was bash glob syntax vs. grep regex confusion. This regex was incorrectly looking for `collatefq*`, which is `collatef` followed by any number of `q`s. Adjusted to propery look for `collatefq` followed by (any number of anything) before the flowcell string. --- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index e36f06d9..dec39fbb 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -922,7 +922,7 @@ rm -f fastqc.bash collate.bash run_alignments.bash run_aggregations.bash run_poo bash collate.bash # Wait for collation jobs to finish -while ( squeue -o "%j" | grep -q '^.collatefq*$flowcell') ; do +while ( squeue -o "%j" | grep -q '^.collatefq.*$flowcell') ; do sleep 60 done From 2638c0da295bcf85ed299d50db5d8f43d863fb01 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 21 Oct 2024 08:56:08 -0700 Subject: [PATCH 165/172] Alignprocess.py skips library pools This was the cause of those pesky "Project_Lab/Sample_LP.../" directories that were causing us to duplicate work. --- scripts/alignprocess.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/alignprocess.py b/scripts/alignprocess.py index 4083bbcd..ceb175c2 100755 --- a/scripts/alignprocess.py +++ b/scripts/alignprocess.py @@ -390,6 +390,12 @@ def create_script(self, processing_info, align_id): lane = processing_info["libraries"][0] alignment = [a for a in lane["alignments"] if a["id"] == align_id][0] + # Skip processing if the lane is in a library pool + # These lanes are handled by poolprocess.py + if lane.get("library_pool"): + logging.info("Alignment %d belongs to a library pool, skipping", align_id) + return + if "process_template" not in alignment: logging.error("Alignment %d has no process template", align_id) return False From 857cdc853e6e13bfa5edb924c5540781155d9caf Mon Sep 17 00:00:00 2001 From: solexa Date: Wed, 20 Nov 2024 14:03:49 -0800 Subject: [PATCH 166/172] aggregateprocess.py: fix typo-induced bug --- scripts/aggregateprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/aggregateprocess.py b/scripts/aggregateprocess.py index dbb28065..cc00936a 100755 --- a/scripts/aggregateprocess.py +++ b/scripts/aggregateprocess.py @@ -454,7 +454,7 @@ def get_example_flowcell(self, aggregation_id, aggregation_lanes): ) lane = None else: - lane = self.api.single_result(url=included["lane"]) + lane = self.api_single_result(url=included["lane"]) if not lane: logging.critical( From 30e533ba0b4c5220da405bce029971123fc9ca16 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Wed, 5 Feb 2025 10:42:51 -0800 Subject: [PATCH 167/172] fix: handle missing `project_share_directory` If this is missing, use the default analysis dir. 
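Note (illustration only, with made-up JSON): `jq -r` prints a missing or
null key as the literal string "null", not as an empty string, so the
existing `-z` test never catches this case. For example:

    echo '{"libraries":[{"sample":123}]}' |
        jq -r '.libraries[] | select(.sample == 123) | .project_share_directory'

prints `null`. Hence the extra string comparison against "null" added below.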
--- scripts/flowcells/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index dec39fbb..978d77de 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -858,7 +858,7 @@ rsync -avP "$illumina_dir"/SampleSheet*.csv "$analysis_dir/" samp_number=\$(sed 's/.*DS\([0-9]*\).*/\1/' <<< "\$dir") [[ -n "\$samp_number" ]] destination=\$(jq -c -r ".libraries[] | select(.sample == \$samp_number) | .project_share_directory" ../processing.json) - if [[ -z "\$destination" ]] ; then + if [[ -z "\$destination" ]] || [[ "null" == "\$destination" ]] ; then destination=$analysis_dir elif [[ ! -d "\$destination" ]] ; then echo "Destination \$destination does not exist! Please create it." >&2 From 8fec6d1e635c1c1e52d2b3b810b934d2a08a7636 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 23 Mar 2025 12:09:29 -0700 Subject: [PATCH 168/172] Fix setup.sh for miniseq on new cluster --- .../flowcells/miniseq/example_SampleSheet.csv | 127 ++++++++++++++++++ scripts/flowcells/setup.sh | 4 +- 2 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 data/flowcells/miniseq/example_SampleSheet.csv diff --git a/data/flowcells/miniseq/example_SampleSheet.csv b/data/flowcells/miniseq/example_SampleSheet.csv new file mode 100644 index 00000000..cd637748 --- /dev/null +++ b/data/flowcells/miniseq/example_SampleSheet.csv @@ -0,0 +1,127 @@ +[Header],,,,,,,,,, +IEMFileVersion,4,,,,,,,,, +Investigator Name,Janghee,,,,,,,,, +Project Name,2015_09_30,,,,,,,,, +Experiment Name,,,,,,,,,, +Date,9/30/15,,,,,,,,, +Workflow,Resequencing,,,,,,,,, +Application,TruSeq DNA/RNA,,,,,,,,, +Assay,TruSeq DNA/RNA,,,,,,,,, +Description,,,,,,,,,, +Chemistry,Amplicon,,,,,,,,, +,,,,,,,,,, +[Reads],,,,,,,,,, +150,,,,,,,,,, +150,,,,,,,,,, +[Settings],,,,,,,,,, +OnlyGenerateFASTQ,1,,,,,,,,, +[Data],,,,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description,GenomeFolder +A1-i7-1-N501,,,,i7_1,AACGTGAT,N501,GCGATCTA,,, +B1-i7-1-N502,,,,i7_1,AACGTGAT,N502,ATAGAGAG,,, +C1-i7-1-N503,,,,i7_1,AACGTGAT,N503,AGAGGATA,,, +D1-i7-1-N504,,,,i7_1,AACGTGAT,N504,TCTACTCT,,, +E1-i7-1-N505,,,,i7_1,AACGTGAT,N505,CTCCTTAC,,, +F1-i7-1-N506,,,,i7_1,AACGTGAT,N506,TATGCAGT,,, +G1-i7-1-N507,,,,i7_1,AACGTGAT,N507,TACTCCTT,,, +H1-i7-1-N508,,,,i7_1,AACGTGAT,N508,AGGCTTAG,,, +X1-i7-1-N517,,,,i7_1,AACGTGAT,N517,TCTTACGC,,, +A2-i7-2-N501,,,,i7_2,AAACATCG,N501,GCGATCTA,,, +B2-i7-2-N502,,,,i7_2,AAACATCG,N502,ATAGAGAG,,, +C2-i7-2-N503,,,,i7_2,AAACATCG,N503,AGAGGATA,,, +D2-i7-2-N504,,,,i7_2,AAACATCG,N504,TCTACTCT,,, +E2-i7-2-N505,,,,i7_2,AAACATCG,N505,CTCCTTAC,,, +F2-i7-2-N506,,,,i7_2,AAACATCG,N506,TATGCAGT,,, +G2-i7-2-N507,,,,i7_2,AAACATCG,N507,TACTCCTT,,, +H2-i7-2-N508,,,,i7_2,AAACATCG,N508,AGGCTTAG,,, +X2-i7-2-N517,,,,i7_2,AAACATCG,N517,TCTTACGC,,, +A3-i7-3-N501,,,,i7_3,ATGCCTAA,N501,GCGATCTA,,, +B3-i7-3-N502,,,,i7_3,ATGCCTAA,N502,ATAGAGAG,,, +C3-i7-3-N503,,,,i7_3,ATGCCTAA,N503,AGAGGATA,,, +D3-i7-3-N504,,,,i7_3,ATGCCTAA,N504,TCTACTCT,,, +E3-i7-3-N505,,,,i7_3,ATGCCTAA,N505,CTCCTTAC,,, +F3-i7-3-N506,,,,i7_3,ATGCCTAA,N506,TATGCAGT,,, +G3-i7-3-N507,,,,i7_3,ATGCCTAA,N507,TACTCCTT,,, +H3-i7-3-N508,,,,i7_3,ATGCCTAA,N508,AGGCTTAG,,, +X3-i7-3-N517,,,,i7_3,ATGCCTAA,N517,TCTTACGC,,, +A4-i7-4-N501,,,,i7_4,AGTGGTCA,N501,GCGATCTA,,, +B4-i7-4-N502,,,,i7_4,AGTGGTCA,N502,ATAGAGAG,,, +C4-i7-4-N503,,,,i7_4,AGTGGTCA,N503,AGAGGATA,,, +D4-i7-4-N504,,,,i7_4,AGTGGTCA,N504,TCTACTCT,,, +E4-i7-4-N505,,,,i7_4,AGTGGTCA,N505,CTCCTTAC,,, 
+F4-i7-4-N506,,,,i7_4,AGTGGTCA,N506,TATGCAGT,,, +G4-i7-4-N507,,,,i7_4,AGTGGTCA,N507,TACTCCTT,,, +H4-i7-4-N508,,,,i7_4,AGTGGTCA,N508,AGGCTTAG,,, +X4-i7-4-N517,,,,i7_4,AGTGGTCA,N517,TCTTACGC,,, +A5-i7-5-N501,,,,i7_5,ACCACTGT,N501,GCGATCTA,,, +B5-i7-5-N502,,,,i7_5,ACCACTGT,N502,ATAGAGAG,,, +C5-i7-5-N503,,,,i7_5,ACCACTGT,N503,AGAGGATA,,, +D5-i7-5-N504,,,,i7_5,ACCACTGT,N504,TCTACTCT,,, +E5-i7-5-N505,,,,i7_5,ACCACTGT,N505,CTCCTTAC,,, +F5-i7-5-N506,,,,i7_5,ACCACTGT,N506,TATGCAGT,,, +G5-i7-5-N507,,,,i7_5,ACCACTGT,N507,TACTCCTT,,, +H5-i7-5-N508,,,,i7_5,ACCACTGT,N508,AGGCTTAG,,, +X5-i7-5-N517,,,,i7_5,ACCACTGT,N517,TCTTACGC,,, +A6-i7-6-N501,,,,i7_6,ACATTGGC,N501,GCGATCTA,,, +B6-i7-6-N502,,,,i7_6,ACATTGGC,N502,ATAGAGAG,,, +C6-i7-6-N503,,,,i7_6,ACATTGGC,N503,AGAGGATA,,, +D6-i7-6-N504,,,,i7_6,ACATTGGC,N504,TCTACTCT,,, +E6-i7-6-N505,,,,i7_6,ACATTGGC,N505,CTCCTTAC,,, +F6-i7-6-N506,,,,i7_6,ACATTGGC,N506,TATGCAGT,,, +G6-i7-6-N507,,,,i7_6,ACATTGGC,N507,TACTCCTT,,, +H6-i7-6-N508,,,,i7_6,ACATTGGC,N508,AGGCTTAG,,, +X6-i7-6-N517,,,,i7_6,ACATTGGC,N517,TCTTACGC,,, +A7-i7-7-N501,,,,i7_7,CAGATCTG,N501,GCGATCTA,,, +B7-i7-7-N502,,,,i7_7,CAGATCTG,N502,ATAGAGAG,,, +C7-i7-7-N503,,,,i7_7,CAGATCTG,N503,AGAGGATA,,, +D7-i7-7-N504,,,,i7_7,CAGATCTG,N504,TCTACTCT,,, +E7-i7-7-N505,,,,i7_7,CAGATCTG,N505,CTCCTTAC,,, +F7-i7-7-N506,,,,i7_7,CAGATCTG,N506,TATGCAGT,,, +G7-i7-7-N507,,,,i7_7,CAGATCTG,N507,TACTCCTT,,, +H7-i7-7-N508,,,,i7_7,CAGATCTG,N508,AGGCTTAG,,, +X7-i7-7-N517,,,,i7_7,CAGATCTG,N517,TCTTACGC,,, +A8-i7-8-N501,,,,i7_8,CATCAAGT,N501,GCGATCTA,,, +B8-i7-8-N502,,,,i7_8,CATCAAGT,N502,ATAGAGAG,,, +C8-i7-8-N503,,,,i7_8,CATCAAGT,N503,AGAGGATA,,, +D8-i7-8-N504,,,,i7_8,CATCAAGT,N504,TCTACTCT,,, +E8-i7-8-N505,,,,i7_8,CATCAAGT,N505,CTCCTTAC,,, +F8-i7-8-N506,,,,i7_8,CATCAAGT,N506,TATGCAGT,,, +G8-i7-8-N507,,,,i7_8,CATCAAGT,N507,TACTCCTT,,, +H8-i7-8-N508,,,,i7_8,CATCAAGT,N508,AGGCTTAG,,, +X8-i7-8-N517,,,,i7_8,CATCAAGT,N517,TCTTACGC,,, +A9-i7-9-N501,,,,i7_9,CGCTGATC,N501,GCGATCTA,,, +B9-i7-9-N502,,,,i7_9,CGCTGATC,N502,ATAGAGAG,,, +C9-i7-9-N503,,,,i7_9,CGCTGATC,N503,AGAGGATA,,, +D9-i7-9-N504,,,,i7_9,CGCTGATC,N504,TCTACTCT,,, +E9-i7-9-N505,,,,i7_9,CGCTGATC,N505,CTCCTTAC,,, +F9-i7-9-N506,,,,i7_9,CGCTGATC,N506,TATGCAGT,,, +G9-i7-9-N507,,,,i7_9,CGCTGATC,N507,TACTCCTT,,, +H9-i7-9-N508,,,,i7_9,CGCTGATC,N508,AGGCTTAG,,, +X9-i7-9-N517,,,,i7_9,CGCTGATC,N517,TCTTACGC,,, +A10-i7-10-N501,,,,i7_10,ACAAGCTA,N501,GCGATCTA,,, +B10-i7-10-N502,,,,i7_10,ACAAGCTA,N502,ATAGAGAG,,, +C10-i7-10-N503,,,,i7_10,ACAAGCTA,N503,AGAGGATA,,, +D10-i7-10-N504,,,,i7_10,ACAAGCTA,N504,TCTACTCT,,, +E10-i7-10-N505,,,,i7_10,ACAAGCTA,N505,CTCCTTAC,,, +F10-i7-10-N506,,,,i7_10,ACAAGCTA,N506,TATGCAGT,,, +G10-i7-10-N507,,,,i7_10,ACAAGCTA,N507,TACTCCTT,,, +H10-i7-10-N508,,,,i7_10,ACAAGCTA,N508,AGGCTTAG,,, +X10-i7-10-N517,,,,i7_10,ACAAGCTA,N517,TCTTACGC,,, +A11-i7-11-N501,,,,i7_11,CTGTAGCC,N501,GCGATCTA,,, +B11-i7-11-N502,,,,i7_11,CTGTAGCC,N502,ATAGAGAG,,, +C11-i7-11-N503,,,,i7_11,CTGTAGCC,N503,AGAGGATA,,, +D11-i7-11-N504,,,,i7_11,CTGTAGCC,N504,TCTACTCT,,, +E11-i7-11-N505,,,,i7_11,CTGTAGCC,N505,CTCCTTAC,,, +F11-i7-11-N506,,,,i7_11,CTGTAGCC,N506,TATGCAGT,,, +G11-i7-11-N507,,,,i7_11,CTGTAGCC,N507,TACTCCTT,,, +H11-i7-11-N508,,,,i7_11,CTGTAGCC,N508,AGGCTTAG,,, +X11-i7-11-N517,,,,i7_11,CTGTAGCC,N517,TCTTACGC,,, +A12-i7-12-N501,,,,i7_12,AGTACAAG,N501,GCGATCTA,,, +B12-i7-12-N502,,,,i7_12,AGTACAAG,N502,ATAGAGAG,,, +C12-i7-12-N503,,,,i7_12,AGTACAAG,N503,AGAGGATA,,, +D12-i7-12-N504,,,,i7_12,AGTACAAG,N504,TCTACTCT,,, +E12-i7-12-N505,,,,i7_12,AGTACAAG,N505,CTCCTTAC,,, 
+F12-i7-12-N506,,,,i7_12,AGTACAAG,N506,TATGCAGT,,, +G12-i7-12-N507,,,,i7_12,AGTACAAG,N507,TACTCCTT,,, +H12-i7-12-N508,,,,i7_12,AGTACAAG,N508,AGGCTTAG,,, +X12-i7-12-N517,,,,i7_12,AGTACAAG,N517,TCTTACGC,,, diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 978d77de..389a926b 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -540,7 +540,7 @@ case $run_type in queue="$SLOW_QUEUE" minidemux="True" # placeholder - cp /home/dchee7/projects/guide-seq/data/samplesheets/SampleSheet.csv SampleSheet.csv + cp "$STAMPIPES/data/flowcells/miniseq/example_SampleSheet.csv" SampleSheet.csv bcl_tasks=1 set +e read -d '' unaligned_command << _U_ @@ -583,7 +583,7 @@ _U_ queue="$SLOW_QUEUE" minidemux="True" # placeholder - cat /net/fileserv0/projects/vol2/dchee7/datastore/talens/sample_sheets/SampleSheet.csv > SampleSheet.csv + cp "$STAMPIPES/data/flowcells/miniseq/example_SampleSheet.csv" SampleSheet.csv #make_nextseq_samplesheet > SampleSheet.csv bcl_tasks=1 set +e From f44ddc0a5dac0287c8afa2386b597cf244743d4e Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Jul 2025 12:00:24 -0700 Subject: [PATCH 169/172] link_nextseq.py supports R3 & R4 fastq files --- scripts/flowcells/link_nextseq.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/scripts/flowcells/link_nextseq.py b/scripts/flowcells/link_nextseq.py index de1d1b68..586f7cb8 100755 --- a/scripts/flowcells/link_nextseq.py +++ b/scripts/flowcells/link_nextseq.py @@ -169,22 +169,15 @@ def main(): data = json.loads(open(poptions.processing_file, "r").read()) for lane in data["libraries"]: - create_links( - lane, - "R1", - input_dir, - poptions.output_dir, - poptions.dry_run, - merge_across_lanes=poptions.merge_across_lanes, - ) - create_links( - lane, - "R2", - input_dir, - poptions.output_dir, - poptions.dry_run, - merge_across_lanes=poptions.merge_across_lanes, - ) + for read in ["R1", "R2", "R3", "R4"]: + create_links( + lane, + read, + input_dir, + poptions.output_dir, + poptions.dry_run, + merge_across_lanes=poptions.merge_across_lanes, + ) undet_lane = { "alignments": [{"sample_name": "lane1_Undetermined_L001"}], From 3245526beaf1666eee872f2785c674456be08f18 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Sun, 6 Jul 2025 13:05:50 -0700 Subject: [PATCH 170/172] Fix collate/fastq/upload for up to 4 reads --- processes/fastq/collate_fastq.bash | 164 ++++++++++++++++++++--------- processes/fastq/fastqc.bash | 96 +++++++++++------ scripts/lims/upload_data.py | 2 +- 3 files changed, 183 insertions(+), 79 deletions(-) diff --git a/processes/fastq/collate_fastq.bash b/processes/fastq/collate_fastq.bash index 990e2fc9..0905dea8 100644 --- a/processes/fastq/collate_fastq.bash +++ b/processes/fastq/collate_fastq.bash @@ -5,7 +5,7 @@ set -e CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') -if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then +if [[ "$CLUSTER_NAME" == "altius-gene" ]]; then module load apptainer/1.3.3 echo "# Using apptainer" export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" @@ -16,37 +16,44 @@ fi cd $FASTQ_DIR +INPUT_PREFIX=$SAMPLE_NAME + FASTQ_NAME=${FLOWCELL}_${SAMPLE_NAME} echo "Collating $FASTQ_DIR/$FASTQ_NAME" -R1_NUM_FILES=$(find . -maxdepth 1 -name "${SAMPLE_NAME}_R1_???.fastq.gz" | wc -l) - -if [[ "$PAIRED" == "True" ]]; then - - R2_NUM_FILES=$(find . 
-maxdepth 1 -name "${SAMPLE_NAME}_R2_???.fastq.gz" | wc -l) - - if [[ "$R1_NUM_FILES" -ne "$R2_NUM_FILES" ]]; then - echo "UNEQUAL NUMBER OF FILES FOR $SAMPLE_NAME IN $FASTQ_DIR" - exit 1 - fi -fi +R1_NUM_FILES=$(find . -maxdepth 1 -name "${INPUT_PREFIX}_R1_*.fastq.gz" | wc -l) +R2_NUM_FILES=$(find . -maxdepth 1 -name "${INPUT_PREFIX}_R2_*.fastq.gz" | wc -l) +R3_NUM_FILES=$(find . -maxdepth 1 -name "${INPUT_PREFIX}_R3_*.fastq.gz" | wc -l) +R4_NUM_FILES=$(find . -maxdepth 1 -name "${INPUT_PREFIX}_R4_*.fastq.gz" | wc -l) R1_FILE=${FASTQ_NAME}_R1.fastq.gz R2_FILE=${FASTQ_NAME}_R2.fastq.gz +R3_FILE=${FASTQ_NAME}_R3.fastq.gz +R4_FILE=${FASTQ_NAME}_R4.fastq.gz function upload { - if [[ "$SAMPLE_NAME" == LP* ]] ; then + if [[ "$SAMPLE_NAME" == LP* ]]; then # Altcode sample, use dedicated script $APX python3 "$STAMPIPES/scripts/altcode/upload_fastq.py" --lane "$FLOWCELL_LANE_ID" --r1 "$R1_FILE" --r2 "$R2_FILE" else # Regular sample, upload old-style - UPLOAD_SCRIPT="$APX python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane --attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" - $UPLOAD_SCRIPT --attach_file_purpose r1-fastq --attach_file "${R1_FILE}" + UPLOAD_SCRIPT="$APX python3 $STAMPIPES/scripts/lims/upload_data.py --attach_file_contenttype SequencingData.flowcelllane --attach_file_objectid ${FLOWCELL_LANE_ID} --attach_file_type=gzipped-fastq" + if [ -e "$R1_FILE" ]; then + $UPLOAD_SCRIPT --attach_file_purpose r1-fastq --attach_file "${R1_FILE}" + fi + + if [ -e "$R2_FILE" ]; then + $UPLOAD_SCRIPT --attach_file_purpose r2-fastq --attach_file "${R2_FILE}" + fi + + if [ -e "$R3_FILE" ]; then + $UPLOAD_SCRIPT --attach_file_purpose r3-fastq --attach_file "${R3_FILE}" + fi - if [ -e "$R2_FILE" ]; then - $UPLOAD_SCRIPT --attach_file_purpose r2-fastq --attach_file "${R2_FILE}" - fi + if [ -e "$R4_FILE" ]; then + $UPLOAD_SCRIPT --attach_file_purpose r4-fastq --attach_file "${R4_FILE}" + fi fi } @@ -70,13 +77,21 @@ fi # If only one file, just mv or cp as appropriate if [ "$R1_NUM_FILES" -eq "1" ]; then cmd="mv" - if [ -n "$RETAIN_ORIGINALS" ] ; then + if [ -n "$RETAIN_ORIGINALS" ]; then cmd="cp" fi - "$cmd" "${SAMPLE_NAME}_R1_001.fastq.gz" "$R1_FILE" - if [ "$PAIRED" == "True" ] ; then - "$cmd" "${SAMPLE_NAME}_R2_001.fastq.gz" "$R2_FILE" + if [ "$R1_NUM_FILES" -gt 0 ]; then + "$cmd" "${INPUT_PREFIX}"_R1*.fastq.gz "$R1_FILE" + fi + if [ "$R2_NUM_FILES" -gt 0 ]; then + "$cmd" "${INPUT_PREFIX}"_R2*.fastq.gz "$R2_FILE" + fi + if [ "$R3_NUM_FILES" -gt 0 ]; then + "$cmd" "${INPUT_PREFIX}"_R3*.fastq.gz "$R3_FILE" + fi + if [ "$R4_NUM_FILES" -gt 0 ]; then + "$cmd" "${INPUT_PREFIX}"_R4*.fastq.gz "$R4_FILE" fi upload @@ -85,6 +100,8 @@ else R1_TMP_FILE=$(mktemp) R2_TMP_FILE=$(mktemp) + R3_TMP_FILE=$(mktemp) + R4_TMP_FILE=$(mktemp) if [ -e "$R1_FILE" ]; then rm "$R1_FILE" @@ -94,61 +111,114 @@ else rm "$R2_FILE" fi + if [ -e "$R3_FILE" ]; then + rm "$R3_FILE" + fi + + if [ -e "$R4_FILE" ]; then + rm "$R4_FILE" + fi + echo "R1: $R1_FILE" echo "R2: $R2_FILE" + echo "R3: $R3_FILE" + echo "R4: $R4_FILE" - for filenum in $(seq -f "%03g" 1 $R1_NUM_FILES) - do + for filenum in $(seq -f "%03g" 1 $R1_NUM_FILES); do echo "Adding ${filenum} to collated files" - cat ${SAMPLE_NAME}_R1_${filenum}.fastq.gz >> $R1_TMP_FILE - - if [[ "$PAIRED" == "True" ]]; then - R2_FILE=${FASTQ_NAME}_R2.fastq.gz - cat ${SAMPLE_NAME}_R2_${filenum}.fastq.gz >> $R2_TMP_FILE + if [ -e "${SAMPLE_NAME}_R1_${filenum}.fastq.gz" ]; then + cat ${SAMPLE_NAME}_R1_${filenum}.fastq.gz 
>>$R1_TMP_FILE + fi + if [ -e "${SAMPLE_NAME}_R2_${filenum}.fastq.gz" ]; then + cat ${SAMPLE_NAME}_R2_${filenum}.fastq.gz >>$R2_TMP_FILE + fi + if [ -e "${SAMPLE_NAME}_R3_${filenum}.fastq.gz" ]; then + cat ${SAMPLE_NAME}_R3_${filenum}.fastq.gz >>$R3_TMP_FILE + fi + if [ -e "${SAMPLE_NAME}_R4_${filenum}.fastq.gz" ]; then + cat ${SAMPLE_NAME}_R4_${filenum}.fastq.gz >>$R4_TMP_FILE fi done # Ensure we have valid gzipped files - gzip -t $R1_TMP_FILE - - if [ ! -s $R1_TMP_FILE ]; then + if [ -s $R1_TMP_FILE ]; then + gzip -t $R1_TMP_FILE + else echo "ERROR: $R1_TMP_FILE is 0 size" exit 1 fi - if [ "$PAIRED" == "True" ]; then + if [ -s $R2_TMP_FILE ]; then gzip -t $R2_TMP_FILE - if [ ! -s $R2_TMP_FILE ]; then - echo "ERROR: $R2_TMP_FILE is 0 size" - exit 1 - fi + elif [ "$R2_NUM_FILES" -gt 0 ]; then + echo "ERROR: $R2_TMP_FILE is 0 size" + exit 1 + fi + + if [ -s $R3_TMP_FILE ]; then + gzip -t $R3_TMP_FILE + elif [ "$R3_NUM_FILES" -gt 0 ]; then + echo "ERROR: $R3_TMP_FILE is 0 size" + exit 1 + fi + + if [ -s $R4_TMP_FILE ]; then + gzip -t $R4_TMP_FILE + elif [ "$R4_NUM_FILES" -gt 0 ]; then + echo "ERROR: $R4_TMP_FILE is 0 size" + exit 1 fi - rsync "$R1_TMP_FILE" "$R1_FILE" - # Files created in temp directories do not have appropriate - # permissions; make sure our collated files can be read by - # anybody - chmod 644 $R1_FILE + # Move temp files to final locations with proper permissions + if [ -s $R1_TMP_FILE ]; then + rsync "$R1_TMP_FILE" "$R1_FILE" + chmod 644 $R1_FILE + fi rm $R1_TMP_FILE - if [[ "$PAIRED" == "True" ]] ; then + + if [ -s $R2_TMP_FILE ]; then rsync "$R2_TMP_FILE" "$R2_FILE" chmod 644 $R2_FILE - rm $R2_TMP_FILE fi + rm $R2_TMP_FILE + + if [ -s $R3_TMP_FILE ]; then + rsync "$R3_TMP_FILE" "$R3_FILE" + chmod 644 $R3_FILE + fi + rm $R3_TMP_FILE + + if [ -s $R4_TMP_FILE ]; then + rsync "$R4_TMP_FILE" "$R4_FILE" + chmod 644 $R4_FILE + fi + rm $R4_TMP_FILE fi upload # Remove existing pre-collation files if [ ! 
-n "$RETAIN_ORIGINALS" ]; then - echo "Removing R1 originals" - rm ${SAMPLE_NAME}_R1_???.fastq.gz + if [ "$R1_NUM_FILES" -gt 0 ]; then + echo "Removing R1 originals" + rm ${SAMPLE_NAME}_R1_???.fastq.gz + fi - if [[ "$PAIRED" == "True" ]]; then + if [ "$R2_NUM_FILES" -gt 0 ]; then echo "Removing R2 originals" rm ${SAMPLE_NAME}_R2_???.fastq.gz fi + + if [ "$R3_NUM_FILES" -gt 0 ]; then + echo "Removing R3 originals" + rm ${SAMPLE_NAME}_R3_???.fastq.gz + fi + + if [ "$R4_NUM_FILES" -gt 0 ]; then + echo "Removing R4 originals" + rm ${SAMPLE_NAME}_R4_???.fastq.gz + fi fi diff --git a/processes/fastq/fastqc.bash b/processes/fastq/fastqc.bash index ee47b5b1..e55ec545 100644 --- a/processes/fastq/fastqc.bash +++ b/processes/fastq/fastqc.bash @@ -1,22 +1,29 @@ # Dependencies [[ -s "$MODULELOAD" ]] && source "$MODULELOAD" { -module load jdk/1.8.0_92 -module load picard/2.8.1 -module load fastqc/0.11.5 + module load jdk/1.8.0_92 + module load picard/2.8.1 + module load fastqc/0.11.5 } || true # ignore module load failures export FASTQ_NAME=${FLOWCELL}_${SAMPLE_NAME} + export R1_FASTQ=${FASTQ_NAME}_R1.fastq.gz export R2_FASTQ=${FASTQ_NAME}_R2.fastq.gz +export R3_FASTQ=${FASTQ_NAME}_R3.fastq.gz +export R4_FASTQ=${FASTQ_NAME}_R4.fastq.gz + export R1_FASTQC=${FASTQ_NAME}_R1_fastqc.zip export R2_FASTQC=${FASTQ_NAME}_R2_fastqc.zip +export R3_FASTQC=${FASTQ_NAME}_R3_fastqc.zip +export R4_FASTQC=${FASTQ_NAME}_R4_fastqc.zip + export TOP_UMIS=${SAMPLE_NAME}.topumis.txt.gz cd $FASTQ_DIR CLUSTER_NAME=$(scontrol show config | awk '$1 == "ClusterName" {print $3}') -if [[ "$CLUSTER_NAME" == "altius-gene" ]] ; then +if [[ "$CLUSTER_NAME" == "altius-gene" ]]; then module load apptainer/1.3.3 echo "# Using apptainer" export APX="apptainer exec --bind /net/seq/data2/sequencers,/net/seq/data2/flowcells,$STAMPIPES $STAMPIPES/containers/fastq/fastq.sif" @@ -25,39 +32,66 @@ else export APX= fi -if [ ! -e "$R1_FASTQC" -o ! 
-e "$R2_FASTQC" ]; then +set -x -e -o pipefail - set -x -e -o pipefail +echo "Hostname: " +hostname - echo "Hostname: " - hostname +echo "START: " +date - echo "START: " - date - - cd $FASTQ_DIR +cd $FASTQ_DIR +if [ -e "$R1_FASTQ" ]; then $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R1_FASTQ FASTQC_FILE=$R1_FASTQC - if [ "$PAIRED" = "True" ]; then - $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R2_FASTQ FASTQC_FILE=$R2_FASTQC - fi - - if [ "$UMI" = "True" ]; then - echo "Tallying up top UMI tags seen in R1" - zcat ${R1_FASTQ} | grep "^@" | cut -f 2 -d "+" | sort | uniq -c | sort -n -r | gzip -c > ${TOP_UMIS} - fi - - if [ "$PAIRED" = "True" ]; then - $APX python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ - --fastqcfile $R1_FASTQC --fastqcfile $R2_FASTQC - else - $APX python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID} \ - --fastqcfile $R1_FASTQC - fi +fi +if [ -e "$R2_FASTQ" ]; then + $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R2_FASTQ FASTQC_FILE=$R2_FASTQC +fi +if [ -e "$R3_FASTQ" ]; then + $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R3_FASTQ FASTQC_FILE=$R3_FASTQC +fi +if [ -e "$R4_FASTQ" ]; then + $APX make -f $STAMPIPES/makefiles/fastqc.mk FASTQ_FILE=$R4_FASTQ FASTQC_FILE=$R4_FASTQC +fi - $APX bash $STAMPIPES/scripts/fastq/attachfiles.bash +if [ "$UMI" = "True" ]; then + echo "Tallying up top UMI tags seen in R1" + zcat ${R1_FASTQ} | grep "^@" | cut -f 2 -d "+" | sort | uniq -c | sort -n -r | gzip -c >${TOP_UMIS} +fi - echo "FINISH: " - date +# Build upload command dynamically based on which FastQC files exist +UPLOAD_CMD="$APX python3 ${STAMPIPES}/scripts/lims/upload_data_withr3.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID}" +if [ -e "$R1_FASTQC" ]; then + UPLOAD_CMD="$UPLOAD_CMD --fastqcfile $R1_FASTQC" +fi +if [ -e "$R2_FASTQC" ]; then + UPLOAD_CMD="$UPLOAD_CMD --fastqcfile $R2_FASTQC" +fi +if [ -e "$R3_FASTQC" ]; then + UPLOAD_CMD="$UPLOAD_CMD --fastqcfile $R3_FASTQC" +fi +if [ -e "$R4_FASTQC" ]; then + UPLOAD_CMD="$UPLOAD_CMD --fastqcfile $R4_FASTQC" +fi +eval $UPLOAD_CMD +#$APX bash $STAMPIPES/scripts/fastq/attachfiles.bash +# Inline contents of the script so I can add R3 and R4: +UPLOAD_SCRIPT=$STAMPIPES/scripts/lims/upload_data_withr3.py +ATTACH_LANE="python3 $UPLOAD_SCRIPT --attach_file_contenttype SequencingData.flowcelllane --attach_file_object ${FLOWCELL_LANE_ID}" +$ATTACH_LANE --attach_directory ${FASTQ_DIR} --attach_file_purpose fastq-directory +if [ -e "$R1_FASTQC" ]; then + $ATTACH_LANE --attach_file ${R1_FASTQC} --attach_file_type zip --attach_file_purpose fastqc-results-zip +fi +if [ -e "$R2_FASTQC" ]; then + $ATTACH_LANE --attach_file ${R2_FASTQC} --attach_file_type zip --attach_file_purpose fastqc-results-zip +fi +if [ -e "$R3_FASTQC" ]; then + $ATTACH_LANE --attach_file ${R3_FASTQC} --attach_file_type zip --attach_file_purpose fastqc-results-zip +fi +if [ -e "$R4_FASTQC" ]; then + $ATTACH_LANE --attach_file ${R4_FASTQC} --attach_file_type zip --attach_file_purpose fastqc-results-zip fi +echo "FINISH: " +date diff --git a/scripts/lims/upload_data.py b/scripts/lims/upload_data.py index 3fb857cc..7cba6c13 100755 --- a/scripts/lims/upload_data.py +++ b/scripts/lims/upload_data.py @@ -992,7 +992,7 @@ def upload_fastqc(self, flowcell_lane_id, filename): self.flowcelllane_contenttype = self.get_flowcelllane_contenttype() m = re.search( - r"(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])_(?PR[12])", + 
r"(?P[^/]+)_(?P[AGTC-]+|NoIndex)_L00(?P[0-9])_(?PR\d)", filename, ) From d28784a10baab0ac47119262b12b2ea5d8d30425 Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Mon, 7 Jul 2025 13:09:23 -0700 Subject: [PATCH 171/172] Fixup for fastqc.bash --- processes/fastq/fastqc.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/processes/fastq/fastqc.bash b/processes/fastq/fastqc.bash index e55ec545..b73f1c10 100644 --- a/processes/fastq/fastqc.bash +++ b/processes/fastq/fastqc.bash @@ -60,7 +60,7 @@ if [ "$UMI" = "True" ]; then fi # Build upload command dynamically based on which FastQC files exist -UPLOAD_CMD="$APX python3 ${STAMPIPES}/scripts/lims/upload_data_withr3.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID}" +UPLOAD_CMD="$APX python3 ${STAMPIPES}/scripts/lims/upload_data.py -f ${FLOWCELL} --flowcell_lane_id=${FLOWCELL_LANE_ID}" if [ -e "$R1_FASTQC" ]; then UPLOAD_CMD="$UPLOAD_CMD --fastqcfile $R1_FASTQC" fi @@ -77,8 +77,8 @@ eval $UPLOAD_CMD #$APX bash $STAMPIPES/scripts/fastq/attachfiles.bash # Inline contents of the script so I can add R3 and R4: -UPLOAD_SCRIPT=$STAMPIPES/scripts/lims/upload_data_withr3.py -ATTACH_LANE="python3 $UPLOAD_SCRIPT --attach_file_contenttype SequencingData.flowcelllane --attach_file_object ${FLOWCELL_LANE_ID}" +UPLOAD_SCRIPT=$STAMPIPES/scripts/lims/upload_data.py +ATTACH_LANE="$APX python3 $UPLOAD_SCRIPT --attach_file_contenttype SequencingData.flowcelllane --attach_file_object ${FLOWCELL_LANE_ID}" $ATTACH_LANE --attach_directory ${FASTQ_DIR} --attach_file_purpose fastq-directory if [ -e "$R1_FASTQC" ]; then $ATTACH_LANE --attach_file ${R1_FASTQC} --attach_file_type zip --attach_file_purpose fastqc-results-zip From d6a371e8690dacb7318815673386d97c3a59f97d Mon Sep 17 00:00:00 2001 From: Jemma Nelson Date: Tue, 26 Aug 2025 13:02:10 -0700 Subject: [PATCH 172/172] Disable pool processing We don't use the output from this anymore, preferring to run the megamap pipeline or other analyses as appropriate. --- scripts/flowcells/setup.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/flowcells/setup.sh b/scripts/flowcells/setup.sh index 389a926b..96346b86 100644 --- a/scripts/flowcells/setup.sh +++ b/scripts/flowcells/setup.sh @@ -936,21 +936,22 @@ bash fastqc.bash --qsub-queue "$OLD_SLOW_QUEUE" \ --outfile run_alignments.bash -\$APX python3 "$STAMPIPES/scripts/poolprocess.py" \ - --flowcell "$flowcell" \ - --qsub-queue "$OLD_SLOW_QUEUE" \ - --outfile run_pools.bash +# Pool processing is disabled - subsumed by megamap pipeline +# \$APX python3 "$STAMPIPES/scripts/poolprocess.py" \ +# --flowcell "$flowcell" \ +# --qsub-queue "$OLD_SLOW_QUEUE" \ +# --outfile run_pools.bash # Set up of flowcell aggregations curl -X POST "$LIMS_API_URL/flowcell_run/$flowcell_id/autoaggregate/" -H "Authorization: Token \$LIMS_API_TOKEN" if on_new_cluster ; then ssh "$ALIGN_NODE" bash --login "\$PWD/run_alignments.bash" - ssh "$ALIGN_NODE" bash --login "\$PWD/run_pools.bash" + # ssh "$ALIGN_NODE" bash --login "\$PWD/run_pools.bash" else # Run alignments bash run_alignments.bash - bash run_pools.bash + # bash run_pools.bash fi __COLLATE__