diff --git a/config/base.config b/config/base.config index 619cd28..df760b1 100644 --- a/config/base.config +++ b/config/base.config @@ -19,6 +19,7 @@ params { uk_aligned_fasta = "test/matched3.fa" // null param so exists uk_mutations = "test/matched2.variants" // null param so exists uk_constellations = "resources/empty_constellations.csv" // null so exists + uk_pag = "test/uk_pag.tsv" //null param // if carrying forward from previous previous_metadata = "" diff --git a/environment.yml b/environment.yml index 12ed3b9..77f736d 100644 --- a/environment.yml +++ b/environment.yml @@ -17,15 +17,16 @@ dependencies: - nextflow - s3cmd - smart_open + - datafunk + - fastafunk - pip: - ftfy - geopandas - git+https://github.com/cov-lineages/pangolin.git - git+https://github.com/cov-lineages/pangoLEARN.git - - git+https://github.com/cov-ert/datafunk.git - - git+https://github.com/cov-ert/fastafunk.git - git+https://github.com/cov-lineages/constellations.git - git+https://github.com/cov-lineages/scorpio.git - git+https://github.com/cov-lineages/pango-designation.git + - git+https://github.com/cov-lineages/pangolin-assignment.git diff --git a/modules/align_and_variant_call.nf b/modules/align_and_variant_call.nf index c8ee755..d8fe713 100644 --- a/modules/align_and_variant_call.nf +++ b/modules/align_and_variant_call.nf @@ -25,7 +25,7 @@ process minimap2_to_reference { script: """ - minimap2 -t ${task.cpus} -a --secondary=no -x asm20 --score-N=0 ${reference_fasta} ${fasta} > alignment.sam + minimap2 -t ${task.cpus} -a --secondary=no --score-N=0 -x asm20 ${reference_fasta} ${fasta} > alignment.sam """ } diff --git a/modules/clean_geography.nf b/modules/clean_geography.nf index 4b79cf1..41d7939 100644 --- a/modules/clean_geography.nf +++ b/modules/clean_geography.nf @@ -203,6 +203,26 @@ process make_delta_by_utla_summary { """ } + +process drop_anon_id { + /** + * Drops the anonymous_sample_id column from master metadata csv + * @input metadata + * @output metadata + */ + + input: + path metadata + 
output: + path "${metadata.baseName}_anon.csv" + + script: + """ + fastafunk drop_columns --in-metadata ${metadata} --columns anonymous_sample_id --out-metadata ${metadata.baseName}_anon.csv + """ +} + process publish_master_metadata { /** * Publishes master metadata csv for this category @@ -238,7 +258,8 @@ workflow clean_geography_cog_uk { uk_geography(uk_fasta, uk_metadata) add_uk_geography_to_metadata(uk_metadata,uk_geography.out.geography) make_delta_by_utla_summary(add_uk_geography_to_metadata.out.metadata) - publish_master_metadata(add_uk_geography_to_metadata.out.metadata, "cog") + drop_anon_id(add_uk_geography_to_metadata.out.metadata) + publish_master_metadata(drop_anon_id.out, "cog") emit: metadata = add_uk_geography_to_metadata.out.metadata } diff --git a/modules/pangolin.nf b/modules/pangolin.nf index dba9b06..a1a814c 100644 --- a/modules/pangolin.nf +++ b/modules/pangolin.nf @@ -45,6 +45,7 @@ process extract_sequences_for_pangolin { * @output pangolin_fasta, metadata_with_previous * @params previous_metadata, update_all_lineage_assignments */ + memory {task.attempt * 6.GB} input: path fasta @@ -91,13 +92,15 @@ process run_pangolin { * @input fasta * @output pangolin_fasta */ + cpus 4 + memory { task.attempt * 8.GB } input: path fasta output: path "pangolin/lineage_report.csv", emit: report - path "pangolin/sequences.aln.fasta", emit: alignment + // NOTE(review): alignment output disabled although --alignment is still passed to pangolin in both script branches; confirm no caller uses .out.alignment. Disabled line: path "pangolin/sequences.aln.fasta", emit: alignment script: if (params.skip_designation_hash) @@ -106,14 +109,18 @@ process run_pangolin { --outdir pangolin \ --tempdir pangolin_tmp \ --alignment \ - --skip-designation-hash + --analysis-mode fast \ + --skip-designation-hash \ + -t ${task.cpus} """ else """ pangolin "${fasta}" \ --outdir pangolin \ --tempdir pangolin_tmp \ - --alignment + --alignment \ + --analysis-mode fast \ + -t ${task.cpus} """ } @@ -124,7 +131,7 @@ process run_pangolin_usher { * @output pangolin_fasta */ - cpus 4 + cpus 16 input: path fasta @@ -149,8 +156,7 @@ process run_pangolin_usher 
{ --outdir pangolin \ --tempdir pangolin_tmp \ --outfile usher_lineage_report.csv \ - --usher \ - -t ${task.cpus} + --usher -t ${task.cpus} """ } diff --git a/modules/preprocess_cog_uk.nf b/modules/preprocess_cog_uk.nf index a57a433..9ba6caa 100644 --- a/modules/preprocess_cog_uk.nf +++ b/modules/preprocess_cog_uk.nf @@ -39,6 +39,70 @@ process uk_strip_header_digits_and_unalign { """ } +process uk_add_published_date_to_metadata { + /** + * Takes the MAJORA TSV of metadata and adds the published_date column from + * majora.pag_lookup.tsv, joining on central_sample_id + * @input uk_updated_metadata, uk_metadata_pag + * @output uk_metadata_updated_date + */ + + input: + path uk_updated_metadata + path uk_metadata_pag + + output: + path "${uk_updated_metadata.baseName}.pag.csv" + + script: + """ + fastafunk add_columns \ + --in-metadata ${uk_updated_metadata} \ + --in-data ${uk_metadata_pag} \ + --index-column central_sample_id \ + --join-on central_sample_id \ + --force-overwrite \ + --new-columns published_date \ + --out-metadata "${uk_updated_metadata.baseName}.pag.csv" + """ +} + +process uk_anonymise_ids { + /** + If published_date is on or after 30th June 2023, replace central_sample_id + with anonymous_sample_id, if the latter is non-empty. 
+ @input uk_metadata (read as comma-delimited; published_date is parsed as YYYY-MM-DD for every row, so an empty or malformed value raises) + @output uk_metadata_anon (written tab-delimited with the same columns) + */ + + input: + path uk_metadata + + output: + path "${uk_metadata.baseName}.anon.tsv" + + script: + """ + #!/usr/bin/env python3 + import datetime + import csv + + anon_samp_id_date = datetime.datetime(2023, 6, 30).date() + + with open("${uk_metadata}", 'r', newline = '') as csv_in, open("${uk_metadata.baseName}.anon.tsv", 'w', newline = '') as csv_out: + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix", delimiter="\t") + writer.writeheader() + + for row in reader: + if datetime.datetime.strptime(row["published_date"], "%Y-%m-%d").date() >= anon_samp_id_date: + if row["anonymous_sample_id"]: + row["central_sample_id"] = row["anonymous_sample_id"] + writer.writerow(row) + """ +} + + process uk_add_columns_to_metadata { /** * Takes the MAJORA TSV of metadata and adds/updates columns for sample_date, pillar_2, @@ -66,6 +130,7 @@ process uk_add_columns_to_metadata { """ } + process uk_filter_omitted_sequences { /** * Takes a FASTA and METADATA and excludes samples specified in an exclusion file @@ -204,7 +269,7 @@ process add_previous_uk_lineage_to_metadata { * @output metadata */ - memory { 1.GB * task.attempt + metadata.size() * 2.B } + memory { 2.GB * task.attempt + metadata.size() * 2.B } input: path metadata @@ -280,9 +345,12 @@ workflow preprocess_cog_uk { uk_fasta uk_metadata uk_accessions + uk_pag main: uk_strip_header_digits_and_unalign(uk_fasta) - uk_add_columns_to_metadata(uk_metadata, uk_accessions, uk_updated_dates) + uk_add_published_date_to_metadata(uk_metadata, uk_pag) + uk_anonymise_ids(uk_add_published_date_to_metadata.out) + uk_add_columns_to_metadata(uk_anonymise_ids.out, uk_accessions, uk_updated_dates) uk_filter_omitted_sequences(uk_strip_header_digits_and_unalign.out, uk_add_columns_to_metadata.out, uk_omissions) 
uk_filter_on_sample_date(uk_filter_omitted_sequences.out.fasta, uk_filter_omitted_sequences.out.metadata) add_previous_uk_lineage_to_metadata(uk_filter_omitted_sequences.out.metadata) diff --git a/nextflow.config b/nextflow.config index d505954..f9b0d9b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,6 +20,6 @@ process { withLabel: retry_increasing_mem { errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } memory = {4.GB * task.attempt} - maxRetries = 2 + maxRetries = 5 } -} \ No newline at end of file +} diff --git a/resources/publish_cog_global_recipes.json b/resources/publish_cog_global_recipes.json index 1a6add4..a5a63d2 100644 --- a/resources/publish_cog_global_recipes.json +++ b/resources/publish_cog_global_recipes.json @@ -1,105 +1,105 @@ -{ - "alignments": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned", - "metadata_fields": ["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_t
ime","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", "lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - }, - { - "suffix": "all", - "data": "cog", - "fasta": "aligned" - }, - { - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - } - ], - "metadata": [ - { - "suffix": "public", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" - }, - { - "suffix": "consortium", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", 
"published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", "is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", - "mutations": true - }, - { - "suffix": "geography", - "data": "cog_global", - "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], - "where": "cog_id=central_sample_id" - }, - { - "suffix": "mutations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version"], - "mutations": true - }, - { - "suffix": "constellations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version"], - "constellations": true - }, - { - "suffix": "unlinked", - "data": "cog_global", - "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","is_surveillance", "collection_pillar", "is_pillar_2"], - "mutations": true, - "uk_only": true, - "shuffle": true, - "drop_index": "sequence_name" - }, - { - "data": "cog_global", - "suffix": "epidemiology", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", 
"biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", "lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" - } - ], - "public": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned" - }, - { - "data": "cog", - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 adm1=adm1_raw" - }, - { - "suffix": "unmasked", - "data": "cog", - "fasta": "aligned" - } - ], - "civet3": [ - { - "suffix": "private", - "fasta": "cog_global", - "metadata_fields": ["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","scorpio_call"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" - }, - { - "suffix": "mutations", - "data": "cog_global", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} +{ + "alignments": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned", + "metadata_fields": 
["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_time","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + }, + { + "suffix": "all", + "data": "cog", + "fasta": "aligned" + }, + { + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", 
"lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + } + ], + "metadata": [ + { + "suffix": "public", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" + }, + { + "suffix": "consortium", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", "published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", "is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", + "mutations": true + }, + { + "suffix": "geography", + "data": "cog_global", + "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], + "where": "cog_id=central_sample_id" + 
}, + { + "suffix": "mutations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "mutations": true + }, + { + "suffix": "constellations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "constellations": true + }, + { + "suffix": "unlinked", + "data": "cog_global", + "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","usher_lineage", "usher_lineages_version", "is_surveillance", "collection_pillar", "is_pillar_2"], + "mutations": true, + "uk_only": true, + "shuffle": true, + "drop_index": "sequence_name" + }, + { + "data": "cog_global", + "suffix": "epidemiology", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", "biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" + } + ], + "public": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned" + }, + { + "data": "cog", + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 
adm1=adm1_raw" + }, + { + "suffix": "unmasked", + "data": "cog", + "fasta": "aligned" + } + ], + "civet3": [ + { + "suffix": "private", + "fasta": "cog_global", + "metadata_fields": ["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" + }, + { + "suffix": "mutations", + "data": "cog_global", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/resources/publish_gisaid_recipes.json b/resources/publish_gisaid_recipes.json index 2dfebfe..69c88da 100644 --- a/resources/publish_gisaid_recipes.json +++ b/resources/publish_gisaid_recipes.json @@ -1,56 +1,56 @@ -{ - "gisaid": [ - { - "suffix": "all", - "fasta": "gisaid", - "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" - }, - { - "suffix": "global", - "fasta": "gisaid", - "metadata_fields": 
["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "exclude_cog": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" - }, - { - "suffix": "global_mutations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "mutations": true, - "exclude_cog": true - }, - { - "suffix": "global_constellations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "constellations": true, - "exclude_cog": true - }, - { - "suffix": "global_updown", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "updown": true, - "exclude_cog": true - } - ], - "civet3": [ - { - "suffix": "private", - "fasta": "gisaid", - "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","scorpio_call"], - "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" - }, - { - "suffix": "mutations", - "data": "gisaid", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} +{ + "gisaid": [ + { + "suffix": "all", + "fasta": "gisaid", + "metadata_fields": 
["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" + }, + { + "suffix": "global", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "exclude_cog": true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" + }, + { + "suffix": "global_mutations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "mutations": true, + "exclude_cog": true + }, + { + "suffix": "global_constellations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "constellations": true, + "exclude_cog": true + }, + { + "suffix": "global_updown", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "updown": true, + "exclude_cog": true + } + ], + "civet3": [ + { + "suffix": "private", + 
"fasta": "gisaid", + "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" + }, + { + "suffix": "mutations", + "data": "gisaid", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/workflows/process_cog_uk.nf b/workflows/process_cog_uk.nf index 67f231a..8763fb2 100644 --- a/workflows/process_cog_uk.nf +++ b/workflows/process_cog_uk.nf @@ -19,14 +19,16 @@ workflow process_cog_uk { uk_metadata uk_accessions pangolin_updated + uk_pag main: - preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions) + preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions, uk_pag) pangolin(preprocess_cog_uk.out.fasta, preprocess_cog_uk.out.metadata, pangolin_updated) deduplicate_cog_uk(preprocess_cog_uk.out.fasta, pangolin.out.metadata) align_and_variant_call(deduplicate_cog_uk.out.fasta, deduplicate_cog_uk.out.metadata, "cog") filter_and_trim_cog_uk(align_and_variant_call.out.fasta, align_and_variant_call.out.metadata) clean_geography_cog_uk(filter_and_trim_cog_uk.out.fasta, filter_and_trim_cog_uk.out.metadata) emit: + preprocess_cog_uk.out.metadata unaligned_fasta = deduplicate_cog_uk.out.fasta aligned_fasta = align_and_variant_call.out.fasta trimmed_fasta = filter_and_trim_cog_uk.out.fasta @@ -42,12 +44,14 @@ workflow { ch_uk_fasta = Channel.fromPath(params.uk_fasta) ch_uk_metadata = Channel.fromPath(params.uk_metadata) ch_uk_accessions = Channel.fromPath(params.uk_accessions) + ch_uk_pag = Channel.fromPath(params.uk_pag) check_for_pangolin_update() process_cog_uk(ch_uk_fasta, ch_uk_metadata, ch_uk_accessions, - check_for_pangolin_update.out) + check_for_pangolin_update.out, + ch_uk_pag) 
ch_gisaid_fasta = Channel.fromPath(params.gisaid_fasta) ch_gisaid_metadata = Channel.fromPath(params.gisaid_metadata)