From 38324396bb0a8f99cf05d73db66c8d136cc57ffe Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Mon, 7 Mar 2022 14:43:31 +0000 Subject: [PATCH 1/4] WIP: updates to handle pangolin4 --- modules/pangolin.nf | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/pangolin.nf b/modules/pangolin.nf index dba9b06..3d120d1 100644 --- a/modules/pangolin.nf +++ b/modules/pangolin.nf @@ -92,12 +92,14 @@ process run_pangolin { * @output pangolin_fasta */ + cpus 4 + input: path fasta output: path "pangolin/lineage_report.csv", emit: report - path "pangolin/sequences.aln.fasta", emit: alignment + path "pangolin/alignment.fasta", emit: alignment script: if (params.skip_designation_hash) @@ -106,14 +108,18 @@ process run_pangolin { --outdir pangolin \ --tempdir pangolin_tmp \ --alignment \ - --skip-designation-hash + --skip-designation-hashi \ + -t ${task.cpus} \ + --analysis-mode fast """ else """ pangolin "${fasta}" \ --outdir pangolin \ --tempdir pangolin_tmp \ - --alignment + --alignment \ + -t ${task.cpus} \ + --analysis-mode fast """ } @@ -139,7 +145,7 @@ process run_pangolin_usher { --outdir pangolin \ --tempdir pangolin_tmp \ --outfile usher_lineage_report.csv \ - --usher \ + --analysis-mode usher \ -t ${task.cpus} \ --skip-designation-hash """ @@ -149,7 +155,7 @@ process run_pangolin_usher { --outdir pangolin \ --tempdir pangolin_tmp \ --outfile usher_lineage_report.csv \ - --usher \ + --analysis-mode usher \ -t ${task.cpus} """ } From 3faafad40452c72543f69a02d59c16e7e55408d6 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Mon, 6 Jan 2025 15:12:24 +0000 Subject: [PATCH 2/4] anon climb id --- config/base.config | 1 + environment.yml | 5 +- modules/align_and_variant_call.nf | 2 +- modules/clean_geography.nf | 23 +- modules/pangolin.nf | 22 +- modules/preprocess_cog_uk.nf | 72 +- nextflow.config | 11 +- resources/publish_cog_global_recipes.json | 210 ++--- resources/publish_gisaid_recipes.json | 112 +-- workflows/LICENSE.txt | 674 
+++++++++++++++ workflows/README.md | 63 ++ workflows/bin/add_to_uk_metadata.py | 225 +++++ ...otate_with_unmapped_genome_completeness.py | 53 ++ workflows/bin/cache_pangolin_report.py | 60 ++ workflows/bin/geography_cleaning | 1 + workflows/bin/prepare_for_pangolin.py | 131 +++ workflows/bin/publish_from_config.py | 244 ++++++ workflows/bin/remove_duplicates_by_date.py | 85 ++ workflows/bin/summarise_genomes_by_utla.py | 90 ++ workflows/bin/type_aas_and_dels.py | 118 +++ .../uk_label_sourceid_duplicates_to_omit.py | 72 ++ ..._remove_duplicates_COGID_by_proportionN.py | 73 ++ workflows/config/base.config | 64 ++ workflows/environment.yml | 32 + workflows/environment.yml.old | 32 + workflows/future_improvements | 21 + workflows/modules/align_and_variant_call.nf | 560 ++++++++++++ workflows/modules/clean_geography.nf | 288 +++++++ workflows/modules/deduplicate.nf | 208 +++++ workflows/modules/filter_and_trim.nf | 242 ++++++ workflows/modules/pangolin.nf | 342 ++++++++ workflows/modules/preprocess_cog_uk.nf | 372 ++++++++ workflows/modules/preprocess_gisaid.nf | 96 +++ workflows/modules/publish_all.nf | 427 ++++++++++ workflows/modules/start.nf | 33 + workflows/nextflow.config | 32 + workflows/process_cog_uk.nf | 8 +- workflows/resources/AAs.csv | 10 + workflows/resources/MN908947.fa | 429 ++++++++++ workflows/resources/MN908947.gb | 798 ++++++++++++++++++ workflows/resources/README | 30 + workflows/resources/WH04.fa | 2 + workflows/resources/date_corrections.csv | 11 + workflows/resources/dels.csv | 2 + workflows/resources/empty_constellations.csv | 1 + workflows/resources/empty_mutations.csv | 1 + workflows/resources/empty_updown.csv | 1 + workflows/resources/gisaid_omissions.txt | 364 ++++++++ .../resources/publish_cog_global_recipes.json | 105 +++ .../resources/publish_gisaid_recipes.json | 56 ++ workflows/resources/publish_readme.txt | 36 + .../resources/resequencing_omissions.txt | 15 + 52 files changed, 6783 insertions(+), 182 deletions(-) create mode 100644 
workflows/LICENSE.txt create mode 100644 workflows/README.md create mode 100755 workflows/bin/add_to_uk_metadata.py create mode 100755 workflows/bin/annotate_with_unmapped_genome_completeness.py create mode 100755 workflows/bin/cache_pangolin_report.py create mode 160000 workflows/bin/geography_cleaning create mode 100755 workflows/bin/prepare_for_pangolin.py create mode 100755 workflows/bin/publish_from_config.py create mode 100755 workflows/bin/remove_duplicates_by_date.py create mode 100755 workflows/bin/summarise_genomes_by_utla.py create mode 100755 workflows/bin/type_aas_and_dels.py create mode 100755 workflows/bin/uk_label_sourceid_duplicates_to_omit.py create mode 100755 workflows/bin/uk_remove_duplicates_COGID_by_proportionN.py create mode 100644 workflows/config/base.config create mode 100644 workflows/environment.yml create mode 100644 workflows/environment.yml.old create mode 100644 workflows/future_improvements create mode 100644 workflows/modules/align_and_variant_call.nf create mode 100644 workflows/modules/clean_geography.nf create mode 100644 workflows/modules/deduplicate.nf create mode 100644 workflows/modules/filter_and_trim.nf create mode 100644 workflows/modules/pangolin.nf create mode 100644 workflows/modules/preprocess_cog_uk.nf create mode 100644 workflows/modules/preprocess_gisaid.nf create mode 100644 workflows/modules/publish_all.nf create mode 100644 workflows/modules/start.nf create mode 100644 workflows/nextflow.config create mode 100644 workflows/resources/AAs.csv create mode 100644 workflows/resources/MN908947.fa create mode 100644 workflows/resources/MN908947.gb create mode 100644 workflows/resources/README create mode 100644 workflows/resources/WH04.fa create mode 100644 workflows/resources/date_corrections.csv create mode 100644 workflows/resources/dels.csv create mode 100644 workflows/resources/empty_constellations.csv create mode 100644 workflows/resources/empty_mutations.csv create mode 100644 
workflows/resources/empty_updown.csv create mode 100644 workflows/resources/gisaid_omissions.txt create mode 100644 workflows/resources/publish_cog_global_recipes.json create mode 100644 workflows/resources/publish_gisaid_recipes.json create mode 100644 workflows/resources/publish_readme.txt create mode 100644 workflows/resources/resequencing_omissions.txt diff --git a/config/base.config b/config/base.config index 619cd28..df760b1 100644 --- a/config/base.config +++ b/config/base.config @@ -19,6 +19,7 @@ params { uk_aligned_fasta = "test/matched3.fa" // null param so exists uk_mutations = "test/matched2.variants" // null param so exists uk_constellations = "resources/empty_constellations.csv" // null so exists + uk_pag = "test/uk_pag.tsv" //null param // if carrying forward from previous previous_metadata = "" diff --git a/environment.yml b/environment.yml index 12ed3b9..77f736d 100644 --- a/environment.yml +++ b/environment.yml @@ -17,15 +17,16 @@ dependencies: - nextflow - s3cmd - smart_open + - datafunk + - fastafunk - pip: - ftfy - geopandas - git+https://github.com/cov-lineages/pangolin.git - git+https://github.com/cov-lineages/pangoLEARN.git - - git+https://github.com/cov-ert/datafunk.git - - git+https://github.com/cov-ert/fastafunk.git - git+https://github.com/cov-lineages/constellations.git - git+https://github.com/cov-lineages/scorpio.git - git+https://github.com/cov-lineages/pango-designation.git + - git+https://github.com/cov-lineages/pangolin-assigment.git diff --git a/modules/align_and_variant_call.nf b/modules/align_and_variant_call.nf index c8ee755..d8fe713 100644 --- a/modules/align_and_variant_call.nf +++ b/modules/align_and_variant_call.nf @@ -25,7 +25,7 @@ process minimap2_to_reference { script: """ - minimap2 -t ${task.cpus} -a --secondary=no -x asm20 --score-N=0 ${reference_fasta} ${fasta} > alignment.sam + minimap2 -t ${task.cpus} -a --secondary=no --score-N=0 -x asm20 ${reference_fasta} ${fasta} > alignment.sam """ } diff --git 
a/modules/clean_geography.nf b/modules/clean_geography.nf index 4b79cf1..41d7939 100644 --- a/modules/clean_geography.nf +++ b/modules/clean_geography.nf @@ -203,6 +203,26 @@ process make_delta_by_utla_summary { """ } + +process drop_anon_id { + /** + * Drops anonymous ID from master metadata csv + * @input metadta + * @output metadata + */ + + input: + path metadata + + output: + path "${metadata.baseName}_anon.csv" + + script: + """ + fastafunk drop_columns --in-metadata ${metadata} --columns anonymous_sample_id --out-metadata ${metadata.baseName}_anon.csv + """ +} + process publish_master_metadata { /** * Publishes master metadata csv for this category @@ -238,7 +258,8 @@ workflow clean_geography_cog_uk { uk_geography(uk_fasta, uk_metadata) add_uk_geography_to_metadata(uk_metadata,uk_geography.out.geography) make_delta_by_utla_summary(add_uk_geography_to_metadata.out.metadata) - publish_master_metadata(add_uk_geography_to_metadata.out.metadata, "cog") + drop_anon_id(add_uk_geography_to_metadata.out.metadata) + publish_master_metadata(drop_anon_id.out, "cog") emit: metadata = add_uk_geography_to_metadata.out.metadata } diff --git a/modules/pangolin.nf b/modules/pangolin.nf index 3d120d1..a1a814c 100644 --- a/modules/pangolin.nf +++ b/modules/pangolin.nf @@ -45,6 +45,7 @@ process extract_sequences_for_pangolin { * @output pangolin_fasta, metadata_with_previous * @params previous_metadata, update_all_lineage_assignments */ + memory {task.attempt * 6.GB} input: path fasta @@ -91,15 +92,15 @@ process run_pangolin { * @input fasta * @output pangolin_fasta */ - cpus 4 + memory { task.attempt * 8.GB } input: path fasta output: path "pangolin/lineage_report.csv", emit: report - path "pangolin/alignment.fasta", emit: alignment + //path "pangolin/sequences.aln.fasta", emit: alignment script: if (params.skip_designation_hash) @@ -108,9 +109,9 @@ process run_pangolin { --outdir pangolin \ --tempdir pangolin_tmp \ --alignment \ - --skip-designation-hashi \ - -t ${task.cpus} \ 
- --analysis-mode fast + --analysis-mode fast \ + --skip-designation-hash \ + -t ${task.cpus} """ else """ @@ -118,8 +119,8 @@ process run_pangolin { --outdir pangolin \ --tempdir pangolin_tmp \ --alignment \ - -t ${task.cpus} \ - --analysis-mode fast + --analysis-mode fast \ + -t ${task.cpus} """ } @@ -130,7 +131,7 @@ process run_pangolin_usher { * @output pangolin_fasta */ - cpus 4 + cpus 16 input: path fasta @@ -145,7 +146,7 @@ process run_pangolin_usher { --outdir pangolin \ --tempdir pangolin_tmp \ --outfile usher_lineage_report.csv \ - --analysis-mode usher \ + --usher \ -t ${task.cpus} \ --skip-designation-hash """ @@ -155,8 +156,7 @@ process run_pangolin_usher { --outdir pangolin \ --tempdir pangolin_tmp \ --outfile usher_lineage_report.csv \ - --analysis-mode usher \ - -t ${task.cpus} + --usher -t ${task.cpus} """ } diff --git a/modules/preprocess_cog_uk.nf b/modules/preprocess_cog_uk.nf index a57a433..9ba6caa 100644 --- a/modules/preprocess_cog_uk.nf +++ b/modules/preprocess_cog_uk.nf @@ -39,6 +39,70 @@ process uk_strip_header_digits_and_unalign { """ } +process uk_add_published_date_to_metadata { + /** + * Takes the MAJORA TSV of metadata and adds the published_data parameter from + * majora.pag_lookup.tsv + * @input uk_metadata, uk_pag_metadata + * @output uk_metadata_updated_date + */ + + input: + path uk_updated_metadata + path uk_metadata_pag + + output: + path "${uk_updated_metadata.baseName}.pag.csv" + + script: + """ + fastafunk add_columns \ + --in-metadata ${uk_updated_metadata} \ + --in-data ${uk_metadata_pag} \ + --index-column central_sample_id \ + --join-on central_sample_id \ + --force-overwrite \ + --new-columns published_date \ + --out-metadata "${uk_updated_metadata.baseName}.pag.csv" + """ +} + +process uk_anonymise_ids { + /** + If on or after 30th June 2023, replace central ID + for anonymous ID, if they are present. 
+ @input uk_metadata + @output uk_metadata_anon + */ + + input: + path uk_metadata + + output: + path "${uk_metadata.baseName}.anon.tsv" + + script: + """ + #!/usr/bin/env python3 + import datetime + import csv + + anon_samp_id_date = datetime.datetime(2023, 6, 30).date() + + with open("${uk_metadata}", 'r', newline = '') as csv_in, open("${uk_metadata.baseName}.anon.tsv", 'w', newline = '') as csv_out: + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix", delimiter="\t") + writer.writeheader() + + for row in reader: + if datetime.datetime.strptime(row["published_date"], "%Y-%m-%d").date() >= anon_samp_id_date: + if row["anonymous_sample_id"]: + row["central_sample_id"] = row["anonymous_sample_id"] + writer.writerow(row) + """ +} + + process uk_add_columns_to_metadata { /** * Takes the MAJORA TSV of metadata and adds/updates columns for sample_date, pillar_2, @@ -66,6 +130,7 @@ process uk_add_columns_to_metadata { """ } + process uk_filter_omitted_sequences { /** * Takes a FASTA and METADATA and excludes samples specified in an exclusion file @@ -204,7 +269,7 @@ process add_previous_uk_lineage_to_metadata { * @output metadata */ - memory { 1.GB * task.attempt + metadata.size() * 2.B } + memory { 2.GB * task.attempt + metadata.size() * 2.B } input: path metadata @@ -280,9 +345,12 @@ workflow preprocess_cog_uk { uk_fasta uk_metadata uk_accessions + uk_pag main: uk_strip_header_digits_and_unalign(uk_fasta) - uk_add_columns_to_metadata(uk_metadata, uk_accessions, uk_updated_dates) + uk_add_published_date_to_metadata(uk_metadata, uk_pag) + uk_anonymise_ids(uk_add_published_date_to_metadata.out) + uk_add_columns_to_metadata(uk_anonymise_ids.out, uk_accessions, uk_updated_dates) uk_filter_omitted_sequences(uk_strip_header_digits_and_unalign.out, uk_add_columns_to_metadata.out, uk_omissions) 
uk_filter_on_sample_date(uk_filter_omitted_sequences.out.fasta, uk_filter_omitted_sequences.out.metadata) add_previous_uk_lineage_to_metadata(uk_filter_omitted_sequences.out.metadata) diff --git a/nextflow.config b/nextflow.config index d505954..0909731 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,6 +20,13 @@ process { withLabel: retry_increasing_mem { errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } memory = {4.GB * task.attempt} - maxRetries = 2 + maxRetries = 5 } -} \ No newline at end of file +} + +profiles { + slurm { + process.executor = 'slurm' + process.clusterOptions='--account=lomannj-covid-19-realtime-epidemiology --qos=lomannj --time 600:0 --nodes 1' + } +} diff --git a/resources/publish_cog_global_recipes.json b/resources/publish_cog_global_recipes.json index 1a6add4..a5a63d2 100644 --- a/resources/publish_cog_global_recipes.json +++ b/resources/publish_cog_global_recipes.json @@ -1,105 +1,105 @@ -{ - "alignments": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned", - "metadata_fields": 
["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_time","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", "lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - }, - { - "suffix": "all", - "data": "cog", - "fasta": "aligned" - }, - { - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": 
"epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - } - ], - "metadata": [ - { - "suffix": "public", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" - }, - { - "suffix": "consortium", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", "published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", "is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", - "mutations": true - }, - { - "suffix": "geography", - "data": "cog_global", - "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], - "where": "cog_id=central_sample_id" - }, - { - "suffix": "mutations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version"], - "mutations": true - }, - { - "suffix": 
"constellations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version"], - "constellations": true - }, - { - "suffix": "unlinked", - "data": "cog_global", - "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","is_surveillance", "collection_pillar", "is_pillar_2"], - "mutations": true, - "uk_only": true, - "shuffle": true, - "drop_index": "sequence_name" - }, - { - "data": "cog_global", - "suffix": "epidemiology", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", "biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", "lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" - } - ], - "public": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned" - }, - { - "data": "cog", - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 adm1=adm1_raw" - }, - { - "suffix": "unmasked", - "data": "cog", - "fasta": "aligned" - } - ], - "civet3": [ - { - "suffix": "private", - "fasta": "cog_global", - "metadata_fields": 
["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","scorpio_call"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" - }, - { - "suffix": "mutations", - "data": "cog_global", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} +{ + "alignments": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned", + "metadata_fields": ["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_time","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", 
"lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + }, + { + "suffix": "all", + "data": "cog", + "fasta": "aligned" + }, + { + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + } + ], + "metadata": [ + { + "suffix": "public", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" + }, + { + "suffix": "consortium", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", "published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", 
"is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", + "mutations": true + }, + { + "suffix": "geography", + "data": "cog_global", + "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], + "where": "cog_id=central_sample_id" + }, + { + "suffix": "mutations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "mutations": true + }, + { + "suffix": "constellations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "constellations": true + }, + { + "suffix": "unlinked", + "data": "cog_global", + "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","usher_lineage", "usher_lineages_version", "is_surveillance", "collection_pillar", "is_pillar_2"], + "mutations": true, + "uk_only": true, + "shuffle": true, + "drop_index": "sequence_name" + }, + { + "data": "cog_global", + "suffix": "epidemiology", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", "biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", 
"lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" + } + ], + "public": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned" + }, + { + "data": "cog", + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 adm1=adm1_raw" + }, + { + "suffix": "unmasked", + "data": "cog", + "fasta": "aligned" + } + ], + "civet3": [ + { + "suffix": "private", + "fasta": "cog_global", + "metadata_fields": ["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" + }, + { + "suffix": "mutations", + "data": "cog_global", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/resources/publish_gisaid_recipes.json b/resources/publish_gisaid_recipes.json index 2dfebfe..69c88da 100644 --- a/resources/publish_gisaid_recipes.json +++ b/resources/publish_gisaid_recipes.json @@ -1,56 +1,56 @@ -{ - "gisaid": [ - { - "suffix": "all", - "fasta": "gisaid", - "metadata_fields": 
["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" - }, - { - "suffix": "global", - "fasta": "gisaid", - "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "exclude_cog": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" - }, - { - "suffix": "global_mutations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "mutations": true, - "exclude_cog": true - }, - { - "suffix": "global_constellations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "constellations": true, - "exclude_cog": true - }, - { - "suffix": "global_updown", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "updown": true, - "exclude_cog": true - } - ], - "civet3": [ - { - "suffix": "private", - 
"fasta": "gisaid", - "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","scorpio_call"], - "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" - }, - { - "suffix": "mutations", - "data": "gisaid", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} +{ + "gisaid": [ + { + "suffix": "all", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" + }, + { + "suffix": "global", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "exclude_cog": 
true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" + }, + { + "suffix": "global_mutations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "mutations": true, + "exclude_cog": true + }, + { + "suffix": "global_constellations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "constellations": true, + "exclude_cog": true + }, + { + "suffix": "global_updown", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "updown": true, + "exclude_cog": true + } + ], + "civet3": [ + { + "suffix": "private", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" + }, + { + "suffix": "mutations", + "data": "gisaid", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/workflows/LICENSE.txt b/workflows/LICENSE.txt new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/workflows/LICENSE.txt @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/workflows/README.md b/workflows/README.md new file mode 100644 index 0000000..e40ef34 --- /dev/null +++ b/workflows/README.md @@ -0,0 +1,63 @@ +# Datapipe + +Pipeline to process SARS-CoV-2 sequences and metadata, clean up irregularities, align and variant call, then publish matched subsets of FASTA sequences and metadata for groups with different access to sensitive data. + +Runs weekly on global sequences downloaded from GISAID. + +Runs daily on COG-UK sequences, and combines with non-UK GISAID sequences. + +### Install and run + git clone --recurse-submodules https://github.com/COG-UK/grapevine_nextflow.git + cd grapevine_nextflow + conda env create -f environment.yml + conda activate grapevine_nextflow + + NXF_VER=20.10.0 nextflow run workflows/process_cog_uk.nf + +### Pipeline Overview + +#### GISAID processing + +1. Parse GISAID dump (`export.json`) and extract FASTA of sequences and associated metadata. + + - Excludes known problematic sequences listed in `gisaid_omissions.txt` + + - Excludes sequences where `covv_host.lower() != 'human'` + - Excludes sequences with a malformed (not `YYYY-MM-DD`) or impossible (earlier than `2019-11-30` or later than today) date in `covv_collection_date` + - Reformats FASTA header + - Adds `epi-week` and `epi-day` columns to metadata + +2. Run `pangolin` (https://github.com/cov-lineages/pangolin) on all new sequences. If there is a new release of `pangolin`, run it on all sequences. +3. Calculate the `unmapped_genome_completeness` as the proportion of sequence length which is unambiguous (not `N`) +4. Deduplicate by date, keeping the earliest example +5. Align to the reference (`Wuhan/WH04/2020`) with `minimap2` +6. Variant call using `gofasta` and type specific mutations of interest listed in `AAs.csv` and `dels.csv` +7. Filter out low quality sequences with mapped completeness < 93%, and trim and pad alignment outside of reference coordinates `265:29674` +8.
Calculate distance to reference and exclude sequences with a distance of more than 4.0 epi-week std devs. + +#### COG-UK processing + +1. Parse matched FASTA and metadata TSV output by Elan/Majora + + - Reformats header and unaligns sequences which have already been aligned to the reference + + - Manual date correction for samples listed in `date_corrections.csv` + - Excludes early sequences which have been resequenced as listed in `resequencing_omissions.txt` + - Adds GISAID accession if recently submitted + + - Excludes sequences with a malformed (not `YYYY-MM-DD`) or impossible (earlier than `2019-11-30` or later than today) date in `covv_collection_date` + - Adds `epi-week`, `epi-day`, `source_id` and `pillar_2` columns to metadata + +2. Run `pangolin` (https://github.com/cov-lineages/pangolin) on all new sequences. If there is a new release of `pangolin`, run it on all sequences. +3. Calculate the `unmapped_genome_completeness` as the proportion of sequence length which is unambiguous (not `N`) +4. Deduplicate COG-ID by completeness and label samples with duplicate `source_id` +5. Align to the reference (`Wuhan/WH04/2020`) with `minimap2` +6. Variant call using `gofasta` and type specific mutations of interest listed in `AAs.csv` and `dels.csv` +7. Filter out low quality sequences with mapped completeness < 93%, and trim and pad alignment outside of reference coordinates `265:29674` +8. Clean up geographical metadata (https://github.com/COG-UK/geography_cleaning) +9. Combine COG-UK sequences and metadata with non-UK GISAID sequences and metadata +10. Publish subsets of the data as described in `publish_cog_global_recipes.json` + +### What is grapevine? + +`grapevine` (https://github.com/COG-UK/grapevine) was the name of the original pipeline which did all of the above, made phylogenetic trees and more. As the number of sequences has grown, the tree-building steps take increasingly long to complete.
As the majority of users only interact with the alignments and cleaned metadata, it was decided that a robust implementation of the alignment and metadata processing steps run daily would be more useful and that is what is provided here. diff --git a/workflows/bin/add_to_uk_metadata.py b/workflows/bin/add_to_uk_metadata.py new file mode 100755 index 0000000..e98f7a4 --- /dev/null +++ b/workflows/bin/add_to_uk_metadata.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import csv +from itertools import chain +from epiweeks import Week,Year +from datetime import datetime + +adm1a_to_country = {"UK-SCT": "Scotland", + "UK-WLS": "Wales", + "UK-ENG": "England", + "UK-NIR": "Northern_Ireland", + "FK": "Falkland_Islands", + "GI": "Gibraltar", + "JE": "Jersey", + "IM": "Isle_of_Man", + "GG": "Guernsey" + } + +def parse_args(): + parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') + parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') + parser.add_argument('--accession-file', dest = 'accession_file', required=False, help='TSV of accession') + parser.add_argument('--updated-date-file', dest = 'updated_date_file', required=False, help='CSV of date corrections') + parser.add_argument('--log-file', dest = 'log_file', required=False, help='Log file') + + args = parser.parse_args() + + return args + +def load_updated_dates(updated_date_file): + date_dict = {} + if updated_date_file: + with open(updated_date_file, 'r', newline = '') as dates_in: + reader = csv.DictReader(dates_in, delimiter=",", quotechar='\"', dialect = "unix") + for row in reader: + date_dict[row["central_sample_id"]] = row["sample_date"] + return date_dict + +def add_sample_date(row, date_dict): + if row["central_sample_id"] in date_dict: 
+ row["sample_date"] = date_dict[row["central_sample_id"]] + return + try: + date = datetime.strptime(row["collection_date"], '%Y-%m-%d').date() + row["sample_date"] = row["collection_date"] + except: + try: + date = datetime.strptime(row["received_date"], '%Y-%m-%d').date() + row["sample_date"] = row["received_date"] + except: + row["sample_date"] = "" + +def add_source_id(row): + row["source_id"] = row["biosample_source_id"] + if row["root_biosample_source_id"] not in [None,""]: + row["source_id"] = row["root_biosample_source_id"] + if len(row["source_id"]) < 3: + row["source_id"] = None + +def add_pillar_2(row): + if row['collection_pillar'] in [2,"2"] or row['central_sample_id'][0:4] in ["ALDP", "CAMC", "MILK", "QEUH","RAND"]: + row["is_pillar_2"] = "Y" + else: + row["is_pillar_2"] = "N" + +def add_sequence_name(row): + country = adm1a_to_country[row['adm1']] + id = row['central_sample_id'] + year = str(row['sample_date']).split("-")[0] + name = country + "/" + id + "/" + year + + row["sequence_name"] = name + +def load_accession(accession_file, log_handle): + if not accession_file: + return {} + + accession_dict = {} + + with open(str(accession_file), 'r', newline = '') as acc_in: + reader = csv.DictReader(acc_in, delimiter="\t", quotechar='\"', dialect = "unix") + for row in reader: + central_sample_id = row["central_sample_id"] + run_name = row["run_name"] + gisaid_accession = row["gisaid.accession"] + + if central_sample_id in accession_dict: + if run_name in accession_dict[central_sample_id]: + log_handle.write(f'duplicate central_sample_id * run_name in accession list: {central_sample_id} {run_name}\n') + continue + accession_dict[central_sample_id][run_name] = gisaid_accession + else: + accession_dict[central_sample_id] = {run_name: gisaid_accession} + return accession_dict + +def add_covv_accession_id(row, accession_dict): + acc = "" + if row["central_sample_id"] in accession_dict: + if row["run_name"] in accession_dict[row["central_sample_id"]]: + acc 
= accession_dict[row["central_sample_id"]][row["run_name"]] + + row["covv_accession_id"] = acc + +def date_string_to_epi_week(date_string): + """ + parse a date string in YYYY-MM-DD format and return + cumulative epi week which is cumulative total epidemiological + weeks since 2019-12-22. Week beginning 2019-12-22 is week 0 + """ + try: + date = datetime.strptime(date_string, '%Y-%m-%d').date() + except: + return "" + # this is epi-week: + week = Week.fromdate(date) + if week.year < 2019 or (week.year == 2019 and week.week < 52): + return "" + elif week.year == 2019: + return("0") + else: + cum_epi_week = week.week + len(list(chain(*[[x for x in Year(y).iterweeks()] for y in range(2020, week.year)]))) + return str(cum_epi_week) + +def date_string_to_epi_day(date_string): + """ + parse a date string in YYYY-MM-DD format and return + cumulative epi day which is cumulative total days since 2019-12-22 + """ + try: + date = datetime.strptime(date_string, '%Y-%m-%d').date() + except: + return "" + # this is epi-week week: + week = Week.fromdate(date) + # this is day 1 of epi-week 0: + day_one = datetime.strptime("2019-12-22", '%Y-%m-%d').date() + if week.year < 2019 or (week.year == 2019 and week.week < 52): + return "" + else: + cum_epi_day = (date - day_one).days + 1 + return str(cum_epi_day) + +def date_string_to_safe_date_string(date_string): + """ + parse a date string in YYYY-MM-DD format and return + date corresponding to the start of the epi-week in which it falls. 
+ Week beginning 2019-12-22 is week 0 + """ + try: + date = datetime.strptime(date_string, '%Y-%m-%d').date() + except: + return "" + # this is epi-week: + week = Week.fromdate(date) + + if week.year < 2019 or (week.year == 2019 and week.week < 52): + return "" + else: + return week.startdate().strftime('%Y-%m-%d') + +def add_epi_week_and_day(row): + date_str = row["sample_date"] + epi_week = date_string_to_epi_week(date_str) + epi_day = date_string_to_epi_day(date_str) + safe_date = date_string_to_safe_date_string(date_str) + + row["edin_epi_week"] = epi_week + row["edin_epi_day"] = epi_day + row["safe_sample_date"] = safe_date + +def United_Kingdom_to_UK(row): + row["adm0"] = row["adm0"].replace("United Kingdom", "UK") + +def add_uk_columns(row): + row["is_cog_uk"] = "Y" + country = adm1a_to_country[row['adm1']] + if country in ['England', 'Scotland', 'Wales', 'Northern_Ireland']: + row["is_uk"] = "Y" + else: + row["is_uk"] = "N" + +def main(): + args = parse_args() + if args.log_file: + log_handle = open(args.log_file, 'w') + else: + log_handle = sys.stdout + + date_dict = load_updated_dates(args.updated_date_file) + accession_dict = load_accession(args.accession_file, log_handle) + new_columns = ["sample_date", "source_id", "is_pillar_2", "sequence_name", "covv_accession_id", "edin_epi_week", "edin_epi_day", "safe_sample_date", "is_uk", "is_cog_uk", "why_excluded"] + + with open(args.in_metadata, 'r', newline = '') as csv_in, \ + open(args.out_metadata, 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter="\t", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + new_columns, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + try: + add_sample_date(row, date_dict) + add_source_id(row) + add_pillar_2(row) + add_sequence_name(row) + add_covv_accession_id(row, accession_dict) + add_epi_week_and_day(row) + 
United_Kingdom_to_UK(row) + row["why_excluded"] = "" + add_uk_columns(row) + writer.writerow(row) + except: + log_handle.write(f"Error updating metadata for row") + log_handle.write(str(row)) + sys.exit("Could not update metadata for row, check metadata fields") + + + log_handle.close() + +if __name__ == '__main__': + main() diff --git a/workflows/bin/annotate_with_unmapped_genome_completeness.py b/workflows/bin/annotate_with_unmapped_genome_completeness.py new file mode 100755 index 0000000..3ebeac8 --- /dev/null +++ b/workflows/bin/annotate_with_unmapped_genome_completeness.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +import sys +import argparse +from Bio import SeqIO +import csv + +def parse_args(): + parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') + parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') + parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='FASTA') + + args = parser.parse_args() + + return args + +def run(in_fasta, in_metadata, out_metadata): + alignment = SeqIO.index(in_fasta, "fasta") + + with open(in_metadata, 'r', newline = '') as csv_in, \ + open(out_metadata, 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ["unmapped_genome_completeness"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + id_key = "fasta_header" + if "edin_header" in reader.fieldnames: + id_key = "edin_header" + + for row in reader: + id = row[id_key] + if id in alignment: + seq = str(alignment[id].seq) + if len(seq) == 0: + print(id) + row["unmapped_genome_completeness"] = 0.0 + else: + completeness = 
float(len(seq.replace("N", "")) / len(seq)) + row["unmapped_genome_completeness"] = completeness + writer.writerow(row) + else: + row["unmapped_genome_completeness"] = 0.0 + writer.writerow(row) + +def main(): + args = parse_args() + run(args.in_fasta, args.in_metadata, args.out_metadata) + +if __name__ == '__main__': + main() diff --git a/workflows/bin/cache_pangolin_report.py b/workflows/bin/cache_pangolin_report.py new file mode 100755 index 0000000..a356314 --- /dev/null +++ b/workflows/bin/cache_pangolin_report.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import argparse +import csv +from Bio import SeqIO +import hashlib + + +def parse_args(): + parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='Lineage report from pangolin') + parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='Unaligned fasta') + parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='Hashed lineage report from pangolin') + + args = parser.parse_args() + + return args + +def get_hash_string(record): + seq = str(record.seq).upper().encode() + hash_object = hashlib.md5(seq) + hash_string = hash_object.hexdigest() + return hash_string + +def cache_report(in_fasta, in_metadata, out_metadata): + hashed_seqs = set() + records = SeqIO.index(in_fasta, "fasta") + index_column = "taxon" + + with open(in_metadata, 'r', newline = '') as csv_in, \ + open(out_metadata, 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + fieldnames = reader.fieldnames[:] + fieldnames.remove(index_column) + print(fieldnames) + writer = csv.DictWriter(csv_out, fieldnames = ["hash"] + fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + print(row) 
+ if row[index_column] not in records: + continue + record = records[row[index_column]] + hash = get_hash_string(record) + if hash not in hashed_seqs: + del row[index_column] + row["hash"] = hash + hashed_seqs.add(hash) + writer.writerow(row) + + +def main(): + args = parse_args() + cache_report(args.in_fasta, args.in_metadata, args.out_metadata) + + +if __name__ == '__main__': + main() diff --git a/workflows/bin/geography_cleaning b/workflows/bin/geography_cleaning new file mode 160000 index 0000000..416df2f --- /dev/null +++ b/workflows/bin/geography_cleaning @@ -0,0 +1 @@ +Subproject commit 416df2f4cb1561de7f16483d17bd2990ef148ec0 diff --git a/workflows/bin/prepare_for_pangolin.py b/workflows/bin/prepare_for_pangolin.py new file mode 100755 index 0000000..2ffec46 --- /dev/null +++ b/workflows/bin/prepare_for_pangolin.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +from Bio import SeqIO +import csv +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description="""Split in fasta and metadata into lineageless for pangolin and those with a lineage""", + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--in-fasta', dest = 'in_fasta', required=False, default=None, help='Aligned FASTA') + parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='CSV of metadata') + parser.add_argument('--previous-metadata', dest = 'previous_metadata', required=True, help='CSV of from previous run') + parser.add_argument('--out-fasta', dest = 'out_fasta', required=False, default=None, help='FASTA to write out') + parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV of metadata') + + args = parser.parse_args() + return args + +def prepare_for_pangolin(in_fasta, in_metadata, previous_metadata, out_fasta, out_metadata): + print(in_fasta, in_metadata, previous_metadata, out_fasta, out_metadata) + if in_fasta: + alignment = SeqIO.index(in_fasta, "fasta") + else: + alignment = None + + taxon = 
"taxon" + keys = {"lineage": "lineage", + "lineages_version": "version", + "lineage_conflict": "conflict", + "lineage_ambiguity_score": "ambiguity_score", + "pangolin_version": "pangolin_version", + "pangoLEARN_version": "pangoLEARN_version", + "scorpio_call":"scorpio_call", + "scorpio_support":"scorpio_support", + "scorpio_conflict":"scorpio_conflict", + "usher_lineage":"usher_lineage", + "usher_lineages_version": "usher_lineages_version"} + lineage_dict = {} + + with open(previous_metadata, 'r', newline = '') as lineages_in: + reader = csv.DictReader(lineages_in, delimiter=",", quotechar='\"', dialect = "unix") + + if "fasta_header" in reader.fieldnames: + taxon = "fasta_header" + elif "edin_header" in reader.fieldnames: + taxon = "edin_header" + elif "sequence_name" in reader.fieldnames: + taxon = "sequence_name" + + if "lineages_version" in reader.fieldnames: + keys["lineages_version"] = "lineages_version" + elif "version" in reader.fieldnames: + keys["lineages_version"] = "version" + elif "pangoLEARN_version" in reader.fieldnames: + keys["lineages_version"] = "pangoLEARN_version" + + if "lineage_conflict" in reader.fieldnames: + keys["lineage_conflict"] = "lineage_conflict" + if "lineage_ambiguity_score" in reader.fieldnames: + keys["lineage_ambiguity_score"] = "lineage_ambiguity_score" + + for row in reader: + if row[taxon] in lineage_dict: + print("%s occurs more than once in lineages input file" % row[taxon]) + continue + lineage_dict[row[taxon]] = {} + for key in keys: + value = keys[key] + if value in row: + lineage_dict[row[taxon]][key] = row[value] + + + if out_fasta: + fasta_out = open(out_fasta, 'w') + + with open(in_metadata, 'r', newline = '') as csv_in, \ + open(out_metadata, 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + fieldnames = reader.fieldnames + print(fieldnames, len(fieldnames)) + if len(fieldnames) <= 1: + csv_in.close() + csv_in = open(in_metadata, 'r', newline = '') 
+ reader = csv.DictReader(csv_in, delimiter="\t", quotechar='\"', dialect = "unix") + fieldnames = reader.fieldnames + fieldnames.extend([key for key in keys if key not in fieldnames]) + writer = csv.DictWriter(csv_out, fieldnames = fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + taxon = "taxon" + if "fasta_header" in reader.fieldnames: + taxon = "fasta_header" + elif "edin_header" in reader.fieldnames: + taxon = "edin_header" + elif "sequence_name" in reader.fieldnames: + taxon = "sequence_name" + print(taxon) + print(reader.fieldnames) + + missing_lineage = 0 + + for row in reader: + for key in keys: + if key not in row: + row[key] = None + + fasta_header = row[taxon] + + if fasta_header in lineage_dict: + row.update(lineage_dict[fasta_header]) + elif alignment and fasta_out and fasta_header in alignment: + seqrec = alignment[fasta_header] + fasta_out.write(">" + seqrec.id + "\n") + fasta_out.write(str(seqrec.seq) + "\n") + if not row["lineage"]: + missing_lineage += 1 + writer.writerow(row) + + if out_fasta: + fasta_out.close() + + with open("pango.log", "w") as f: + f.write("Number of sequences missing lineage assignments: %i" %missing_lineage) + +def main(): + args = parse_args() + print(args) + prepare_for_pangolin(args.in_fasta, args.in_metadata, args.previous_metadata, args.out_fasta, args.out_metadata) + +if __name__ == '__main__': + main() diff --git a/workflows/bin/publish_from_config.py b/workflows/bin/publish_from_config.py new file mode 100755 index 0000000..957fdc4 --- /dev/null +++ b/workflows/bin/publish_from_config.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 + +import argparse +import json +import subprocess +import os +import sys +import glob + +class Error (Exception): pass + +def parse_args(): + parser = argparse.ArgumentParser(description="""Create published files from config file""", + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--unaligned_fasta', 
dest = 'unaligned_fasta', required=False, help='Raw FASTA') + parser.add_argument('--aligned_fasta', dest = 'aligned_fasta', required=False, help='Aligned, masked, untrimmed FASTA') + parser.add_argument('--trimmed_fasta', dest = 'trimmed_fasta', required=False, help='Aligned, masked, trimmed and filtered FASTA') + parser.add_argument('--gisaid_fasta', dest = 'global_fasta', required=False, help='GISAID aligned FASTA') + parser.add_argument('--cog_global_fasta', dest = 'cog_global_fasta', required=False, help='COG GISAID aligned FASTA') + + parser.add_argument('--cog_metadata', dest = 'cog_metadata', required=False, help='MASSIVE CSV') + parser.add_argument('--gisaid_metadata', dest = 'global_metadata', required=False, help='MASSIVE CSV') + parser.add_argument('--cog_global_metadata', dest = 'cog_global_metadata', required=False, help='MASSIVE CSV') + + parser.add_argument('--mutations', dest = 'mutations', required=False, help='Mutations CSV') + parser.add_argument('--constellations', dest = 'constellations', required=False, help='Constellations CSV') + parser.add_argument('--updown', dest = 'updown', required=False, help='Updown output CSV') + + parser.add_argument('--recipes', dest = 'recipes', required=True, help='JSON of recipes') + parser.add_argument('--date', dest = 'date', required=True, help='Datestamp for published files') + + args = parser.parse_args() + return args + +#"data": "cog", "gisaid" or "cog_global" +#"fasta": "unaligned", "aligned", "trimmed", "cog_global" or "gisaid" +#"metadata_fields": [] +#"mutations": True or False to add columns from mutations +#"constellations": True or False to add columns from constellations +#"updown": True or False to add columns from updown +#"shuffle": True to shuffle rows of metadata +#"where": free text to be passed to fastafunk fetch --where-column +#"suffix": something to append to file names +#"exclude_uk": True or False to exclude samples from UK +#"uk_only": True or False to include only samples from UK 
def get_info_from_config(config_dict, outdir, date, fasta_dict, csv_dict, mutations_file, constellations_file, updown_file):
    """Expand one recipe from the recipes JSON into a fully resolved job description.

    Args:
        config_dict: One recipe; its keys override the defaults below.
        outdir: Directory prefix for the published file names.
        date: Datestamp embedded in the published file names.
        fasta_dict: Maps "unaligned"/"aligned"/"trimmed"/"cog_global"/"gisaid"
            to the FASTA path given on the command line (or None).
        csv_dict: Maps "cog"/"cog_global"/"gisaid" to the metadata CSV path
            given on the command line (or None).
        mutations_file: Mutations CSV path, or None.
        constellations_file: Constellations CSV path, or None.
        updown_file: Updown CSV path, or None.

    Returns:
        dict holding the recipe options plus the resolved "in_*" inputs and
        "out_fa"/"out_csv" outputs.

    Exits the interpreter (sys.exit) when the recipe names neither a known
    fasta nor a known data set, or when an input the recipe needs is missing.
    """
    info_dict = {
        "suffix": None, "data": None, "fasta": None, "metadata_fields": None,
        "where": None, "mutations": False, "constellations": False, "updown": False,
        "shuffle": False, "drop_index": None,
        "exclude_uk": False, "uk_only": False, "exclude_cog": False, "cog_only": False,
        "date": date,
        "in_fa": None, "in_csv": None, "in_muts": None, "in_con": None, "in_up": None,
        "out_fa": "tmp.fa", "intermediate_csv": "tmp.csv", "out_csv": "tmp.csv",
    }
    info_dict.update(config_dict)

    fasta = info_dict["fasta"]
    data = info_dict["data"]

    # An explicit "fasta" wins; otherwise fall back to the data set's default FASTA.
    if fasta in fasta_dict:
        info_dict["in_fa"] = fasta_dict[fasta]
    elif data in ("cog_global", "gisaid"):
        info_dict["in_fa"] = fasta_dict[data]
    elif data == "cog":
        info_dict["in_fa"] = fasta_dict["trimmed"]
    else:
        sys.exit("Config entries need to specify either fasta in "
                 "['unaligned', 'aligned', 'trimmed', 'cog_global', 'gisaid'] or data "
                 "in ['cog', 'cog_global', 'gisaid']")

    # Infer the data set from the fasta kind when the recipe did not name one.
    if info_dict["data"] is None:
        info_dict["data"] = fasta if fasta in ("cog_global", "gisaid") else "cog"

    if info_dict["data"] in ("cog_global", "cog", "gisaid"):
        info_dict["in_csv"] = csv_dict[info_dict["data"]]

    info_dict["in_muts"] = mutations_file
    info_dict["in_con"] = constellations_file
    info_dict["in_up"] = updown_file

    start = "%s/%s_%s" % (outdir, info_dict["data"], info_dict["date"])
    if info_dict["suffix"]:
        start += "_%s" % info_dict["suffix"]
    csv_end = ".csv"

    # Only recipes that name a fasta publish one; others keep the "tmp.fa" placeholder.
    if fasta:
        csv_end = "_metadata.csv"
        if fasta == "aligned" or (info_dict["metadata_fields"] and fasta != "unaligned"):
            info_dict["out_fa"] = "%s_alignment.fa" % start
        else:
            info_dict["out_fa"] = "%s.fa" % start

    info_dict["out_csv"] = "%s%s" % (start, csv_end)

    if info_dict["out_fa"] != "tmp.fa" and info_dict["in_fa"] is None:
        sys.exit("Please provide the appropriate FASTA file")
    if info_dict["metadata_fields"] is not None and info_dict["in_csv"] is None:
        sys.exit("Please provide the appropriate CSV file")
    # These flags default to False, so test truthiness: an `is not None` test
    # is always true and rejects every run that omits an unused optional file.
    if info_dict["mutations"] and info_dict["in_muts"] is None:
        sys.exit("Please provide the appropriate mutations file")
    if info_dict["constellations"] and info_dict["in_con"] is None:
        sys.exit("Please provide the appropriate constellations file")
    if info_dict["updown"] and info_dict["in_up"] is None:
        sys.exit("Please provide the appropriate updown file")

    print(info_dict)
    return info_dict


def syscall(cmd_list, allow_fail=False):
    """Join cmd_list with spaces, run it through the shell and return the CompletedProcess.

    Raises Error when cmd_list contains None, or when the command exits
    non-zero and allow_fail is false. stdout/stderr are captured as text and
    echoed (stdout always, both streams to stderr on failure).
    """
    if None in cmd_list:
        print('None in list', cmd_list, file=sys.stderr)
        raise Error('Error in command. Cannot continue')
    command = ' '.join(cmd_list)
    print(command)
    # shell=True is required: callers pass multi-token fragments such as
    # "fastafunk fetch --in-fasta" as single list elements.
    completed_process = subprocess.run(command, shell=True, stderr=subprocess.PIPE,
                                       stdout=subprocess.PIPE, universal_newlines=True)
    if (not allow_fail) and completed_process.returncode != 0:
        print('Error running this command:', command, file=sys.stderr)
        print('Return code:', completed_process.returncode, file=sys.stderr)
        print('\nOutput from stdout:', completed_process.stdout, sep='\n', file=sys.stderr)
        print('\nOutput from stderr:', completed_process.stderr, sep='\n', file=sys.stderr)
        raise Error('Error in system call. Cannot continue')
    print(completed_process.stdout)
    return completed_process


def publish_file(outdir, info_dict):
    """Run the fastafunk pipeline described by info_dict and write the published files.

    With no "metadata_fields" the input FASTA is simply copied to "out_fa".
    Otherwise the metadata is optionally row-filtered and shuffled, the
    FASTA/metadata pair is fetched, optional mutation / constellation / updown
    columns are joined on, the index column is optionally dropped, and the
    result is moved to "out_csv". info_dict["in_csv"] and
    info_dict["intermediate_csv"] are updated in place as steps run.
    `outdir` is accepted for interface compatibility and not used here.
    """
    if info_dict["metadata_fields"] is None:
        syscall(["cp", info_dict["in_fa"], info_dict["out_fa"]])
        return

    # (recipe option, temp output, column, flag) for each optional row filter.
    # NOTE(review): exclude_* pass --is_true and *_only pass --is_false;
    # presumably filter_column drops the rows matching the flag - confirm
    # against fastafunk.
    row_filters = (
        ("exclude_uk", "tmp.no_uk.csv", "is_uk", "--is_true"),
        ("exclude_cog", "tmp.no_cog.csv", "is_cog_uk", "--is_true"),
        ("uk_only", "tmp.uk_only.csv", "is_uk", "--is_false"),
        ("cog_only", "tmp.cog_only.csv", "is_cog_uk", "--is_false"),
    )
    for option, filtered_csv, column, flag in row_filters:
        if info_dict[option]:
            syscall(["fastafunk filter_column --in-metadata", info_dict["in_csv"],
                     "--out-metadata %s --column %s %s" % (filtered_csv, column, flag)])
            info_dict["in_csv"] = filtered_csv

    if info_dict["shuffle"]:
        syscall(["fastafunk shuffle --in-metadata", info_dict["in_csv"],
                 "--out-metadata", "tmp.shuffled.csv"])
        info_dict["in_csv"] = "tmp.shuffled.csv"

    cmd_list = ["fastafunk fetch --in-fasta", info_dict["in_fa"], "--in-metadata", info_dict["in_csv"],
                "--index-column sequence_name --out-fasta", info_dict["out_fa"],
                "--out-metadata", info_dict["intermediate_csv"], "--restrict --low-memory"]

    if info_dict["metadata_fields"]:
        if "why_excluded" in info_dict["metadata_fields"]:
            cmd_list.append("--keep-omit-rows")
        cmd_list.append("--filter-column")
        cmd_list.extend(info_dict["metadata_fields"])

    if info_dict["where"]:
        cmd_list.append("--where-column %s" % info_dict["where"])
    syscall(cmd_list)

    # (recipe option, input key, join column in the added table, temp output).
    column_joins = (
        ("mutations", "in_muts", "sequence_name", "tmp.muts.csv"),
        ("constellations", "in_con", "sequence_name", "tmp.constellations.csv"),
        ("updown", "in_up", "query", "tmp.updown.csv"),
    )
    for option, input_key, join_on, joined_csv in column_joins:
        if info_dict[option]:
            syscall(["fastafunk add_columns --in-metadata", info_dict["intermediate_csv"],
                     "--in-data", info_dict[input_key], "--index-column sequence_name",
                     "--join-on %s --out-metadata %s" % (join_on, joined_csv)])
            info_dict["intermediate_csv"] = joined_csv

    if info_dict["drop_index"]:
        syscall(["fastafunk drop_columns --in-metadata", info_dict["intermediate_csv"],
                 "--columns", info_dict["drop_index"],
                 "--out-metadata tmp.anon.csv"])
        info_dict["intermediate_csv"] = "tmp.anon.csv"

    # NOTE: the tmp.* intermediates are left in the working directory.
    syscall(["mv", info_dict["intermediate_csv"], info_dict["out_csv"]])
def parse_args():
    """Parse the command line for the sequence_name de-duplication step."""
    parser = argparse.ArgumentParser(description="""Keep the earliest record (by edin_epi_day) for each sequence_name and mark the rest as duplicates""",
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--in-metadata', dest='in_metadata', required=True, help='CSV of metadata')
    parser.add_argument('--out-metadata', dest='out_metadata', required=True, help='CSV to write out')
    parser.add_argument('--out-fasta', dest='out_fasta', required=True, help='FASTA to write out')
    parser.add_argument('--in-fasta', dest='in_fasta', required=True, help='FASTA')

    return parser.parse_args()


def run(in_fasta, in_metadata, out_fasta, out_metadata):
    """De-duplicate records that share a sequence_name, keeping the earliest epi day.

    Rows whose "why_excluded" is already set are ignored when choosing what to
    keep. Rows with no sequence_name ("None", "" or missing) are always kept.
    For every other sequence_name the record with the smallest edin_epi_day is
    kept; on a tie the first one read wins.

    Writes:
        out_metadata: every input row; non-kept rows with an empty
            "why_excluded" get "duplicate sequence_name".
        out_fasta: the sequences (looked up by "edin_header") of kept rows.
        deduplicated.log (working directory): one line per removed duplicate.
    """
    records_by_name = {}
    tokeep = set()

    with open(in_metadata, 'r', newline='') as csv_in:
        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix")

        for row in reader:
            if row["why_excluded"]:
                continue

            fasta_header = row["edin_header"]
            sequence_name = row["sequence_name"]
            record = {"fasta_header": fasta_header,
                      "epi_day": int(row["edin_epi_day"]),
                      "completeness": float(row["unmapped_genome_completeness"])}

            if sequence_name in ["None", "", None]:
                tokeep.add(fasta_header)
                continue

            # Index 0 always holds the earliest record seen so far; a strictly
            # earlier record displaces it, anything else is queued behind it.
            candidates = records_by_name.setdefault(sequence_name, [])
            if candidates and record["epi_day"] < candidates[0]["epi_day"]:
                candidates.insert(0, record)
            else:
                candidates.append(record)

    with open("deduplicated.log", "w") as log:
        for sequence_name, candidates in records_by_name.items():
            kept = candidates[0]
            tokeep.add(kept["fasta_header"])
            for dup in candidates[1:]:
                log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\n"
                          % (sequence_name, kept["fasta_header"], kept["epi_day"], kept["completeness"],
                             dup["fasta_header"], dup["epi_day"], dup["completeness"]))

    alignment = SeqIO.index(in_fasta, "fasta")

    with open(in_metadata, 'r', newline='') as csv_in, \
         open(out_metadata, 'w', newline='') as csv_out, \
         open(out_fasta, 'w') as fasta_out:

        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix")
        writer = csv.DictWriter(csv_out, fieldnames=reader.fieldnames, delimiter=",", quotechar='\"',
                                quoting=csv.QUOTE_MINIMAL, dialect="unix")
        writer.writeheader()

        for row in reader:
            fasta_header = row["edin_header"]

            if fasta_header in tokeep:
                writer.writerow(row)
                seqrec = alignment[fasta_header]
                fasta_out.write(">" + seqrec.id + "\n")
                fasta_out.write(str(seqrec.seq) + "\n")
            else:
                # Preserve an existing exclusion reason; only label fresh duplicates.
                if not row["why_excluded"]:
                    row["why_excluded"] = "duplicate sequence_name"
                writer.writerow(row)


def main():
    """Command-line entry point."""
    args = parse_args()
    run(args.in_fasta, args.in_metadata, args.out_fasta, args.out_metadata)
= parser.parse_args() + +metadata = args.metadata +file_date = args.date + +def main(metadata, file_date): + + utla_to_region = {} + utla_to_code = {} + + with open(metadata) as f: + data = csv.DictReader(f) + for l in data: + if l['utla'] != "" and "|" not in l['utla']: + utla_to_region[l['utla']] = l['NUTS1'] + utla_to_code[l['utla']] = l['utla_code'] + + utla_delta = defaultdict(list) + utla_other = defaultdict(list) + utla_all = defaultdict(list) + with open(metadata) as f: + data = csv.DictReader(f) + for l in data: + if l['sample_date'] != "": + date = dt.datetime.strptime(l['sample_date'],"%Y-%m-%d").date() + if l['utla'] != "" and "|" not in l['utla']: + if l['scorpio_call'] == "Delta (B.1.617.2-like)": + utla_delta[date].append(l['utla']) + else: + utla_other[date].append(l['utla']) + + utla_all[date].append(l['utla']) + + delta_counts = {} + other_counts = {} + all_counts = {} + + for k,v in utla_delta.items(): + delta_counts[k] = Counter(v) + + for k,v in utla_other.items(): + other_counts[k] = Counter(v) + + for k,v in utla_all.items(): + all_counts[k] = Counter(v) + + fieldnames = ["date", "utla", "utla_code", "NUTS1", "delta_count", "other_count", "total_count"] + with open(f"UTLA_genome_counts_{file_date}.csv", 'w') as fw: + writer = csv.DictWriter(fw, fieldnames=fieldnames) + writer.writeheader() + for date, utla_dict in sorted(all_counts.items()): + for utla, count in utla_dict.items(): + write_dict = {} + write_dict["date"] = date + write_dict["utla"] = utla + write_dict["utla_code"] = utla_to_code[utla] + write_dict["NUTS1"] = utla_to_region[utla] + write_dict["total_count"] = count + if date in delta_counts: + if utla in delta_counts[date]: + write_dict["delta_count"] = delta_counts[date][utla] + else: + write_dict["delta_count"] = 0 + else: + write_dict["delta_count"] = 0 + + if date in other_counts: + if utla in other_counts[date]: + write_dict["other_count"] = other_counts[date][utla] + else: + write_dict["other_count"] = 0 + else: + 
def parse_args():
    """Parse the command line for the AA / deletion typing step."""
    parser = argparse.ArgumentParser(description="""Add columns to metadata for specific AAs and dels""",
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--in-fasta', dest='in_fasta', required=True, help='Aligned FASTA')
    parser.add_argument('--in-metadata', dest='in_metadata', required=True, help='CSV of metadata to add to')
    parser.add_argument('--out-metadata', dest='out_metadata', required=True, help='CSV to write out')
    parser.add_argument('--reference-fasta', dest='reference_fasta', required=True, help='Reference FASTA')
    parser.add_argument('--aas', dest='aas', required=False, help='CSV of AAs')
    parser.add_argument('--dels', dest='dels', required=False, help='CSV of deletions')
    parser.add_argument('--index-column', dest='index_column', required=False, default='sequence_name')

    return parser.parse_args()


def parse_AA_file(file):
    """Read the list of amino-acid sites to type.

    Each non-blank line has the format:
    name,start (1-based)
    e.g.:
    D614G,1605

    Returns a list of (name, position) tuples, one per non-blank line, where
    position is the 1-based start of the codon as an int. Returns an empty
    list when `file` is empty or None.
    """
    ls = []
    if not file:
        return ls

    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing on unpack.
                continue
            name, pos = line.split(",")
            ls.append((name, int(pos)))
    return ls


def parse_del_file(file, ref_fasta):
    """Read the list of deletions to type.

    Each non-blank line has the format:
    start (1-based),length of deletion
    e.g.:
    1605,3

    Returns a list of (position, length, ref_allele) tuples, one per non-blank
    line, where ref_allele is the upper-cased reference sequence over the
    deleted span. Returns an empty list (without reading ref_fasta) when
    `file` is empty or None.
    """
    ls = []
    if not file:
        return ls

    # Upper-case the reference once rather than once per deletion.
    reference = str(SeqIO.read(ref_fasta, 'fasta').seq).upper()

    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            pos, length = line.split(',')
            pos, length = int(pos), int(length)
            ls.append((pos, length, reference[pos - 1: pos - 1 + length]))

    return ls


def type_aas_and_dels(in_fasta, in_aa_file, in_del_file, reference_fasta, in_metadata, out_metadata, index_column):
    """Append one column per requested AA site and deletion to the metadata CSV.

    For each metadata row the sequence named by `index_column` is looked up
    in `in_fasta`. Each AA column holds the translation of the three bases
    starting at the site's position ('X' when translation fails). Each
    "del_<pos>_<length>" column holds 'del' when the span is all gaps, 'ref'
    when it equals the reference allele, and 'X' otherwise.
    """
    alignment = SeqIO.index(in_fasta, "fasta")
    AAs = parse_AA_file(in_aa_file)
    dels = parse_del_file(in_del_file, reference_fasta)

    new_aa_columns = [name for name, _ in AAs]
    new_del_columns = ["del_" + str(pos) + "_" + str(length) for pos, length, _ in dels]

    with open(in_metadata, 'r', newline='') as csv_in, \
         open(out_metadata, 'w', newline='') as csv_out:

        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix")
        writer = csv.DictWriter(csv_out, fieldnames=reader.fieldnames + new_aa_columns + new_del_columns,
                                delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect="unix")
        writer.writeheader()

        for row in reader:
            seq = alignment[row[index_column]].seq

            for name, pos in AAs:
                try:
                    query_allele = seq[pos - 1: pos + 2].translate()
                except Exception:
                    # Best effort: any untranslatable codon is reported as 'X'.
                    # (Exception, not bare except, so Ctrl-C / SystemExit still propagate.)
                    query_allele = 'X'
                row[name] = query_allele

            for (pos, length, ref_allele), column_name in zip(dels, new_del_columns):
                window = seq[pos - 1: pos - 1 + length]
                if window == '-' * length:
                    genotype = 'del'
                elif window == ref_allele:
                    genotype = 'ref'
                else:
                    genotype = 'X'

                row[column_name] = genotype

            writer.writerow(row)


def main():
    """Command-line entry point."""
    args = parse_args()
    type_aas_and_dels(args.in_fasta, args.aas, args.dels, args.reference_fasta,
                      args.in_metadata, args.out_metadata, args.index_column)
def parse_args():
    """Parse the command line for the source_id duplicate-labelling step."""
    parser = argparse.ArgumentParser(description="""Label every record except the earliest (by edin_epi_day) for each source_id as a duplicate""",
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--in-metadata', dest='in_metadata', required=True, help='CSV of metadata')
    parser.add_argument('--out-metadata', dest='out_metadata', required=True, help='CSV to write out')

    return parser.parse_args()


def run(in_metadata, out_metadata):
    """Add a "duplicate" column marking records that share a source_id with an earlier one.

    Rows with no source_id ("None", "" or missing) are never marked. For each
    other source_id the record with the smallest edin_epi_day is kept; on a
    tie the first one read wins. Rows are identified by "sequence_name".

    Writes:
        out_metadata: every input row plus a "duplicate" column that is
            "True" for non-kept rows and empty otherwise. An existing
            "duplicate" column is overwritten rather than repeated.
        deduplicated_by_sourceid.log (working directory): one line per
            record labelled as a duplicate.
    """
    records_by_source = {}
    tokeep = set()

    with open(in_metadata, 'r', newline='') as csv_in:
        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix")

        for row in reader:
            fasta_header = row["sequence_name"]
            source_id = row["source_id"]
            record = {"fasta_header": fasta_header,
                      "epi_day": int(row["edin_epi_day"]),
                      "completeness": float(row["unmapped_genome_completeness"])}

            if source_id in ["None", "", None]:
                tokeep.add(fasta_header)
                continue

            # Index 0 always holds the earliest record seen so far; a strictly
            # earlier record displaces it, anything else is queued behind it.
            candidates = records_by_source.setdefault(source_id, [])
            if candidates and record["epi_day"] < candidates[0]["epi_day"]:
                candidates.insert(0, record)
            else:
                candidates.append(record)

    with open("deduplicated_by_sourceid.log", "w") as log:
        for source_id, candidates in records_by_source.items():
            kept = candidates[0]
            tokeep.add(kept["fasta_header"])
            for dup in candidates[1:]:
                log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\n"
                          % (source_id, kept["fasta_header"], kept["epi_day"], kept["completeness"],
                             dup["fasta_header"], dup["epi_day"], dup["completeness"]))

    with open(in_metadata, 'r', newline='') as csv_in, \
         open(out_metadata, 'w', newline='') as csv_out:

        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect="unix")
        # Do not emit the header twice when the input was already labelled.
        fieldnames = list(reader.fieldnames or [])
        if "duplicate" not in fieldnames:
            fieldnames.append("duplicate")
        writer = csv.DictWriter(csv_out, fieldnames=fieldnames, delimiter=",", quotechar='\"',
                                quoting=csv.QUOTE_MINIMAL, dialect="unix")
        writer.writeheader()

        for row in reader:
            row["duplicate"] = None if row["sequence_name"] in tokeep else "True"
            writer.writerow(row)


def main():
    """Command-line entry point."""
    args = parse_args()
    run(args.in_metadata, args.out_metadata)
= "unix") + + for row in reader: + if row["why_excluded"]: + continue + fasta_header = row["fasta_header"] + id = row["central_sample_id"] + completeness = float(row["unmapped_genome_completeness"]) + + if id in dup_dict: + if completeness > dup_dict[id]["completeness"]: + dup_dict[id] = {"fasta_header": fasta_header, "completeness": completeness} + else: + continue + else: + dup_dict[id] = {"fasta_header": fasta_header, "completeness": completeness} + + for k,v in dup_dict.items(): + tokeep.add(v["fasta_header"]) + + with open(in_metadata, 'r', newline = '') as csv_in, \ + open(out_metadata, 'w', newline = '') as csv_out, \ + open(out_fasta, 'w') as fasta_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + fasta_header = row["fasta_header"] + + if fasta_header in tokeep: + writer.writerow(row) + seqrec = alignment[fasta_header] + fasta_out.write(">" + seqrec.id + "\n") + fasta_out.write(str(seqrec.seq) + "\n") + else: + if not row["why_excluded"]: + row["why_excluded"] = "duplicate central_sample_id" + writer.writerow(row) + +def main(): + args = parse_args() + run(args.in_fasta, args.in_metadata, args.out_fasta, args.out_metadata) + +if __name__ == '__main__': + main() diff --git a/workflows/config/base.config b/workflows/config/base.config new file mode 100644 index 0000000..df760b1 --- /dev/null +++ b/workflows/config/base.config @@ -0,0 +1,64 @@ +// Base parameters used throughout +params { + whoami = "Datapipe" + date = false + publish_dir = "publish" + publish_dev = "publish_dev" + category = "cog" + webhook = false + s3 = false + distance_qc = false + geography = false + cache_pangolin = false + + // new cog-uk files each week + uk_fasta = "test/matched.fa" + uk_metadata = "test/matched.tsv" + uk_accessions = "test/accessions.tsv" + 
uk_unaligned_fasta = "test/matched2.fa" // null param so exists + uk_aligned_fasta = "test/matched3.fa" // null param so exists + uk_mutations = "test/matched2.variants" // null param so exists + uk_constellations = "resources/empty_constellations.csv" // null so exists + uk_pag = "test/uk_pag.tsv" //null param + + // if carrying forward from previous + previous_metadata = "" + + // latest gisaid results output by gisaid pipeline + gisaid_json = "test/gisaid.json" + gisaid_fasta = "test/gisaid.matched.fa" // null so exists + gisaid_metadata = "test/gisaid.matched.csv" // null so exists + gisaid_mutations = "resources/empty_mutations.csv" // null so exists + gisaid_constellations = "resources/empty_constellations.csv" // null so exists + gisaid_updown = "resources/empty_updown.csv" // null so exists + + + // resources files + uk_updated_dates = "resources/date_corrections.csv" + uk_omissions = "resources/resequencing_omissions.txt" + gisaid_omissions = "resources/gisaid_omissions.txt" + reference_fasta = "resources/MN908947.fa" + reference_genbank = "resources/MN908947.gb" + WH04_fasta = "resources/WH04.fa" + aas = "resources/AAs.csv" + dels = "resources/dels.csv" + constellations = "resources/constellations" + mask_file = "resources/mask.txt" + uk_geography = "bin/geography_cleaning/geography_utils/" + publish_cog_global_recipes = "resources/publish_cog_global_recipes.json" + publish_gisaid_recipes = "resources/publish_gisaid_recipes.json" + + + // parameter values set + time_window = false + update_all_lineage_assignments = false + auto_update_pangolin = false + skip_designation_hash = false + add_usher_pangolin = false + min_covg = 93 + trim_start = 265 + trim_end = 29674 + chunk_size = 10000 + constellations = '"Delta (B.1.617.2-like)" "Omicron (B.1.1.529-like)" "Omicron (BA.1-like)" "Omicron (BA.2-like)" "Omicron (BA.3-like)" "Omicron (Unassigned)"' + +} diff --git a/workflows/environment.yml b/workflows/environment.yml new file mode 100644 index 
0000000..77f736d --- /dev/null +++ b/workflows/environment.yml @@ -0,0 +1,32 @@ +name: datapipe +channels: + - bioconda + - conda-forge + - defaults + - cov-ert +dependencies: + - biopython>=1.74 + - minimap2>=2.17 + - pip=19.3.1 + - python>=3.7 + - snakemake-minimal>=6.4.1,<=6.8.0 + - gofasta<=0.0.4 + - pysam==0.16.0.1 + - usher>=0.3.2 + - coreutils>=8.25 + - nextflow + - s3cmd + - smart_open + - datafunk + - fastafunk + - pip: + - ftfy + - geopandas + - git+https://github.com/cov-lineages/pangolin.git + - git+https://github.com/cov-lineages/pangoLEARN.git + - git+https://github.com/cov-lineages/constellations.git + - git+https://github.com/cov-lineages/scorpio.git + - git+https://github.com/cov-lineages/pango-designation.git + - git+https://github.com/cov-lineages/pangolin-assigment.git + + diff --git a/workflows/environment.yml.old b/workflows/environment.yml.old new file mode 100644 index 0000000..5dab06f --- /dev/null +++ b/workflows/environment.yml.old @@ -0,0 +1,32 @@ +name: datapipe +channels: + - bioconda + - conda-forge + - defaults + - cov-ert +dependencies: + - biopython>=1.74 + - minimap2>=2.17 + - pip=19.3.1 + - python>=3.7 + - snakemake-minimal>=6.4.1,<=6.8.0 + - gofasta<=0.0.4 + - pysam==0.16.0.1 + - usher>=0.3.2 + - coreutils>=8.25 + - nextflow + - s3cmd + - smart_open + - pip: + - ftfy + - geopandas + - git+https://github.com/cov-lineages/pangolin.git + - git+https://github.com/cov-lineages/pangoLEARN.git + - git+https://github.com/cov-ert/datafunk.git + - git+https://github.com/cov-ert/fastafunk.git + - git+https://github.com/cov-lineages/constellations.git + - git+https://github.com/cov-lineages/scorpio.git + - git+https://github.com/cov-lineages/pango-designation.git + - git+https://github.com/cov-lineages/pangolin-assigment.git + + diff --git a/workflows/future_improvements b/workflows/future_improvements new file mode 100644 index 0000000..a1aa214 --- /dev/null +++ b/workflows/future_improvements @@ -0,0 +1,21 @@ +- preprocess_cog_uk takes a 
file of updated dates: could this be fed back into Majora so it is +no longer needed? +- omissions file: is it really still necessary? +- what is best practice: add lots of inputs, or have global params used by processes and minimal inputs? +- in general, fix inputs/params so they are cast as a file/path at the right point, allowing no file in some cases +- what should the desired result be if input files are missing, e.g. the list of aas/dels to search for and add to the metadata table? +Set up now to skip that step +- Lots of very similar looking python scripts within processes - these were fastafunks, but were replaced +to speed up. Could instead speed up fastafunk in the same way. +- Used to retain info week to week about which samples were eliminated as duplicates - this is now done de novo each +week, which is probably desirable behaviour? +- Used to have min length and min covg thresholds, now have just one lower min_covg threshold because if not tree +building we don't need higher covg? +- Changes from before: when publishing use recipes + - remove mutations from consortium metadata, add them to variants metadata + - remove phylogenetics columns, make phylogenetics metadata later +- might want to publish developer info to a directory e.g. 
geography outputs inc new dodgy stuff +- command line help and specify required arguments +- containerize and get rid of conda environment - can be parsed down at the same time as includes things from other +steps of the old pipeline +- add back in resource requirements diff --git a/workflows/modules/align_and_variant_call.nf b/workflows/modules/align_and_variant_call.nf new file mode 100644 index 0000000..d8fe713 --- /dev/null +++ b/workflows/modules/align_and_variant_call.nf @@ -0,0 +1,560 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dir = file(params.publish_dir) +publish_dev = file(params.publish_dev) + + +process minimap2_to_reference { + /** + * Minimaps samples to reference + * @input fasta + * @output sam + * @params reference_fasta + */ + + cpus 4 + + input: + path fasta + + output: + path "alignment.sam" + + script: + """ + minimap2 -t ${task.cpus} -a --secondary=no --score-N=0 -x asm20 ${reference_fasta} ${fasta} > alignment.sam + """ +} + +process get_mutations { + /** + * Creates CSV of mutations found in each genome + * @input sam + * @output mutations + * @parms reference_fasta, reference_genbank + */ + + cpus 4 + label 'retry_increasing_mem' + + + input: + path sam + val category + + output: + path "${category}.mutations.csv" + + script: + """ + gofasta sam variants -t ${task.cpus} \ + --samfile ${sam} \ + --reference ${reference_fasta} \ + --genbank ${reference_genbank} \ + --outfile ${category}.mutations.csv + """ +} + +process get_indels { + /** + * Creates TSV of indels found in each genome + * @input sam + * @output insertions, deletions + */ + + publishDir "${publish_dev}/", pattern: "*/*.tsv", mode: 'copy' + publishDir "${publish_dir}/", pattern: "*/*.tsv", mode: 'copy', enabled: { ${category} == 'cog'} + + input: + path sam + val category + + output: + path "${category}/${category}.insertions.tsv", emit: insertions + path "${category}/${category}.deletions.tsv", emit: deletions + + script: + """ 
+ mkdir -p ${category} + gofasta sam indels \ + -s ${sam} \ + --threshold 2 \ + --insertions-out "${category}/${category}.insertions.tsv" \ + --deletions-out "${category}/${category}.deletions.tsv" + """ +} + +process alignment { + /** + * Get reference-based alignment + * @input sam + * @output alignment + * @params reference_fasta + */ + + cpus 4 + + input: + path sam + + output: + path "alignment.fasta" + + script: + """ + gofasta sam toMultiAlign -t ${task.cpus} \ + --samfile ${sam} \ + --reference ${reference_fasta} \ + --pad \ + -o alignment.fasta + """ +} + + +process get_snps { + /** + * Call SNPs in each genome + * @input alignment + * @output snps + * @params reference_fasta + */ + + publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' + + input: + path alignment + val category + + output: + path "${category}/${category}.snps.csv" + + script: + """ + mkdir -p ${category} + gofasta snps -r ${reference_fasta} -q ${alignment} -o ${category}/${category}.snps.csv + """ +} + +process get_updown { + /** + * Call SNPs in each genome + * @input alignment + * @output updown list + * @params reference_fasta + */ + + publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' + + input: + path alignment + val category + + output: + path "${category}/${category}.updown.csv" + + script: + """ + mkdir -p ${category} + gofasta updown list -r ${WH04_fasta} -q ${alignment} -o ${category}/${category}.updown.csv + """ +} + +process type_AAs_and_dels { + /** + * Adds a column to metadata table for specific dels and aas looked for + * @input alignment, metadata + * @output metadata_updated + * @params reference_fasta, del_file, aa_file + */ + + input: + path alignment + path metadata + + output: + path "${metadata.baseName}.aas_dels.csv" + + script: + """ + $project_dir/../bin/type_aas_and_dels.py \ + --in-fasta ${alignment} \ + --in-metadata ${metadata} \ + --out-metadata "mutations.tmp.csv" \ + --reference-fasta ${reference_fasta} \ + --aas ${aas} \ + --dels 
${dels} \ + --index-column query + sed "s/query/sequence_name/g" "mutations.tmp.csv" > mutations.tmp2.csv + sed "s/variants/mutations/g" "mutations.tmp2.csv" > "${metadata.baseName}.aas_dels.csv" + + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.aas_dels.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${metadata.baseName}.aas_dels.csv" | wc -l) + exit 1 + fi + """ +} + +process get_nuc_mutations { + /** + * Combines nucleotide mutations into a metadata file which can be merged into the master + * @input snps, dels, ins + * @output metadata + */ + + input: + path snps + path dels + path ins + + output: + path "nuc_mutations.csv" + + script: + """ + #!/usr/bin/env python3 + import csv + + sample_dict = {} + with open("${dels}", 'r', newline = '') as csv_in: + for line in csv_in: + ref_start, length, samples = line.strip().split() + samples = samples.split('|') + var = "del_%s_%s" %(ref_start, length) + for sample in samples: + if sample in sample_dict: + sample_dict[sample].append(var) + else: + sample_dict[sample] = [var] + + with open("${ins}", 'r', newline = '') as csv_in: + for line in csv_in: + ref_start, insertion, samples= line.strip().split() + samples = samples.split('|') + var = "ins_%s_%s" %(ref_start, insertion) + for sample in samples: + if sample in sample_dict: + sample_dict[sample].append(var) + else: + sample_dict[sample] = [var] + + with open("${snps}", 'r', newline = '') as csv_in, \ + open("nuc_mutations.csv", 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = ["sequence_name", "nucleotide_mutations"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + row["sequence_name"] = row["query"] + row["nucleotide_mutations"] = row["SNPs"] + if row["sequence_name"] in sample_dict: + all_vars = [row["nucleotide_mutations"]] if row["nucleotide_mutations"] else [] +
all_vars.extend(sample_dict[row["sequence_name"]]) + row["nucleotide_mutations"] = '|'.join(all_vars) + for key in [k for k in row if k not in ["sequence_name", "nucleotide_mutations"]]: + del row[key] + writer.writerow(row) + """ +} + + +process restrict_metadata { + /** + * restricts only to sequences not excluded + * @input metadata + * @output metadata + */ + + input: + path metadata + + output: + path "${metadata.baseName}.restricts.csv" + + script: + """ + #!/usr/bin/env python3 + import csv + + with open("${metadata}", 'r', newline = '') as csv_in, \ + open("${metadata.baseName}.restricts.csv", 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + if row["why_excluded"] in [None, "", "None"]: + writer.writerow(row) + """ +} + + +process add_nucleotide_mutations_to_metadata { + /** + * Adds nucleotide mutations to metadata + * @input metadata, nucleotide_mutations + * @output metadata + */ + + memory { 1.GB * task.attempt + metadata.size() * 2.B } + + input: + path metadata + path nucleotide_mutations + + output: + path "${metadata.baseName}.with_nuc_mutations.csv" + + script: + """ + fastafunk add_columns \ + --in-metadata ${metadata} \ + --in-data ${nucleotide_mutations} \ + --index-column sequence_name \ + --join-on sequence_name \ + --new-columns nucleotide_mutations \ + --out-metadata "${metadata.baseName}.with_nuc_mutations.csv" + + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.with_nuc_mutations.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${metadata.baseName}.with_nuc_mutations.csv" | wc -l) + exit 1 + fi + """ +} + +process add_ambiguities_to_metadata { + /** + * Adds the ambiguities column from the updown list to metadata + * @input metadata, updown, category + *
@output metadata + */ + + memory { 1.GB * task.attempt + metadata.size() * 2.B } + publishDir "${publish_dev}/", pattern: "*/*.csv", mode: 'copy' + + input: + path metadata + path updown + val category + + output: + path "${category}/${category}_mutations.csv" + + script: + """ + mkdir -p ${category} + fastafunk add_columns \ + --in-metadata ${metadata} \ + --in-data ${updown} \ + --index-column sequence_name \ + --join-on query \ + --new-columns ambiguities \ + --out-metadata "${category}/${category}_mutations.csv" + + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${category}/${category}_mutations.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${category}/${category}_mutations.csv" | wc -l) + exit 1 + fi + """ +} + + +process haplotype_constellations { + /** + * Adds a column to metadata table for each constellation, and a summary column for all found + * @input alignment + * @output haplotype_csv + * @params constellations + */ + + input: + path alignment + + output: + path "${alignment.baseName}.haplotyped.csv" + + script: + """ + scorpio haplotype \ + --input ${alignment} \ + --output "${alignment.baseName}.haplotyped.csv" \ + --output-counts \ + -n ${params.constellations} + + if [[ \$(grep ">" "${alignment}" | wc -l) != \$(tail -n+2 "${alignment.baseName}.haplotyped.csv" | wc -l) ]] + then + echo \$(grep ">" "${alignment}" | wc -l) + echo \$(tail -n+2 "${alignment.baseName}.haplotyped.csv" | wc -l) + exit 1 + fi + """ +} + +process classify_constellations { + /** + * Adds a column to metadata table for each constellation, and a summary column for all found + * @input alignment + * @output classify_csv + * @params constellations + */ + + input: + path alignment + + output: + path "${alignment.baseName}.classified.csv" + + script: + """ + scorpio classify \ + --input ${alignment} \ + --output "${alignment.baseName}.classified.csv" \ + -n ${params.constellations} + + if [[ \$(grep ">" "${alignment}" | wc -l) != \$(tail -n+2 
"${alignment.baseName}.classified.csv" | wc -l) ]] + then + echo \$(grep ">" "${alignment}" | wc -l) + echo \$(tail -n+2 "${alignment.baseName}.classified.csv" | wc -l) + exit 1 + fi + """ +} + +process add_constellations_to_metadata { + /** + * Adds constellations to metadata + * @input metadata, haplotyped, classified + * @output metadata + */ + + publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' + + memory { task.attempt * (classified.size() + haplotyped.size()) * 9.B } + + input: + path haplotyped + path classified + val category + + output: + path "${category}/${category}_constellations.csv" + + script: + """ + mkdir -p ${category} + fastafunk add_columns \ + --in-metadata ${classified} \ + --in-data ${haplotyped} \ + --index-column query \ + --join-on query \ + --out-metadata "constellations.tmp.csv" + sed "s/query/sequence_name/g" "constellations.tmp.csv" > "${category}/${category}_constellations.csv" + + if [[ \$(cat "${haplotyped}" | wc -l) != \$(cat "${category}/${category}_constellations.csv" | wc -l) ]] + then + echo \$(cat "${haplotyped}" | wc -l) + echo \$(cat "${category}/${category}_constellations.csv" | wc -l) + exit 1 + fi + """ +} + + +process announce_summary { + /** + * Summarizes alignment into JSON + * @input fastas + */ + + input: + path fasta + path alignment + + output: + path "announce.json" + + script: + if (params.webhook) + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Finished alignment and variant calling ${params.date}*\\n" >> announce.json + echo "> Number of sequences in FASTA : \$(cat ${fasta} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences in ALIGNMENT : \$(cat ${alignment} | grep '>' | wc -l)\\n" >> announce.json + echo '"}' >> announce.json + + echo 'webhook ${params.webhook}' + + curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} + """ + else + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Finished alignment and 
variant calling ${params.date}*\\n" >> announce.json + echo "> Number of sequences in FASTA : \$(cat ${fasta} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences in ALIGNMENT : \$(cat ${alignment} | grep '>' | wc -l)\\n" >> announce.json + echo '"}' >> announce.json + """ +} + +workflow align_and_variant_call { + take: + in_fasta + in_metadata + category + main: + in_fasta.splitFasta( by: params.chunk_size, file: true ).set{ fasta_chunks } + minimap2_to_reference(fasta_chunks) + alignment(minimap2_to_reference.out) + alignment.out.collectFile(newLine: false).set{ alignment_result } + minimap2_to_reference.out.collectFile(newLine: false, keepHeader: true, skip: 2).set{ mapped_result } + + + get_mutations(mapped_result, category) + get_indels(mapped_result, category) + + get_snps(alignment_result, category) + get_updown(alignment_result, category) + type_AAs_and_dels(alignment_result, get_mutations.out) + get_nuc_mutations(get_snps.out, get_indels.out.deletions, get_indels.out.insertions) + add_nucleotide_mutations_to_metadata(in_metadata, get_nuc_mutations.out) + add_ambiguities_to_metadata(type_AAs_and_dels.out, get_updown.out, category) + + haplotype_constellations(alignment.out) + haplotype_constellations.out.collectFile(newLine: false, keepHeader: true, skip: 1).set{ haplotype_result } + classify_constellations(alignment.out) + classify_constellations.out.collectFile(newLine: false, keepHeader: true, skip: 1).set{ classify_result } + + add_constellations_to_metadata(haplotype_result, classify_result, category) + announce_summary(in_fasta, alignment_result) + emit: + mutations = add_ambiguities_to_metadata.out + constellations = add_constellations_to_metadata.out + fasta = alignment_result + metadata = add_nucleotide_mutations_to_metadata.out + updown = get_updown.out +} + + +aas = file(params.aas) +dels = file(params.dels) +reference_fasta = file(params.reference_fasta) +reference_genbank = file(params.reference_genbank) +WH04_fasta = 
file(params.WH04_fasta) + +workflow { + uk_fasta = Channel.fromPath(params.uk_fasta) + uk_metadata = Channel.fromPath(params.uk_metadata) + category = params.category + + align_and_variant_call(uk_fasta, uk_metadata, category) +} diff --git a/workflows/modules/clean_geography.nf b/workflows/modules/clean_geography.nf new file mode 100644 index 0000000..41d7939 --- /dev/null +++ b/workflows/modules/clean_geography.nf @@ -0,0 +1,288 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dir = file(params.publish_dir) +publish_dev = file(params.publish_dev) + + +process uk_geography { + /** + * Cleans up geography + * @input uk_fasta, uk_metadata + * @output geography_metadata + * @params geography_utils + */ + + memory { 1.GB * task.attempt + uk_fasta.size() * 1.B } + errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries 1 + + publishDir "${publish_dev}/", pattern: "geography/*.csv", mode: 'copy' + publishDir "${publish_dev}/", pattern: "geography/*.txt", mode: 'copy' + + input: + path uk_fasta + path uk_metadata + + output: + path "geography/geography.csv", emit: geography + path "geography/*.csv" + path "geography/*.txt" + + script: + """ + mkdir geography + mkdir geography_tmp + + fastafunk fetch \ + --in-fasta ${uk_fasta} \ + --in-metadata ${uk_metadata} \ + --index-column sequence_name \ + --filter-column central_sample_id sequence_name sample_date edin_epi_week \ + adm0 adm1 adm2 adm2_private \ + --out-fasta geography_tmp/fetch.fa \ + --out-metadata geography_tmp/fetch.csv \ + --restrict + + $project_dir/../bin/geography_cleaning/geography_cleaning.py \ + --metadata geography_tmp/fetch.csv \ + --country-col adm0 \ + --adm1-col adm1 \ + --adm2-col adm2 \ + --outer-postcode-col adm2_private \ + --mapping-utils-dir ${geography_utils} \ + --epiweek-col edin_epi_week \ + --outdir geography + + #rm -rf geography_tmp + """ +} + + +process add_uk_geography_to_metadata { + /** + * Adds UK geography to
uk metadata + * @input uk_metadata, geography_metadata + * @output metadata + */ + + publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_master.csv"} + memory { 1.GB * task.attempt + uk_metadata.size() * 2.B } + + input: + path uk_metadata + path geography_metadata + + output: + path "cog_geography.csv", emit: metadata + + script: + """ + fastafunk add_columns \ + --in-metadata ${uk_metadata} \ + --in-data ${geography_metadata} \ + --index-column sequence_name \ + --join-on sequence_name \ + --force-overwrite \ + --new-columns adm1 adm1_raw adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ + --out-metadata "cog_geography.csv" + """ +} + + +process gisaid_geography { + /** + * Cleans up geography + * @input gisaid_fasta, gisaid_metadata + * @output geography_metadata + * @params geography_utils + */ + + memory { 1.GB * task.attempt + gisaid_fasta.size() * 1.B } + errorStrategy { task.exitStatus in 137..140 ?
'retry' : 'terminate' } + maxRetries 1 + + publishDir "${publish_dev}/", pattern: "geography/*.csv", mode: 'copy' + publishDir "${publish_dev}/", pattern: "geography/*.txt", mode: 'copy' + + input: + path gisaid_fasta + path gisaid_metadata + + output: + path "geography/geography.csv", emit: geography + path "geography/*.csv" + path "geography/*.txt" + + script: + """ + mkdir geography + mkdir geography_tmp + + fastafunk fetch \ + --in-fasta ${gisaid_fasta} \ + --in-metadata ${gisaid_metadata} \ + --index-column sequence_name \ + --filter-column gisaid_accession sequence_name sample_date epi_week \ + adm0 adm1 adm2 adm2_private \ + --where-column gisaid_accession=covv_accession_id epi_week=edin_epi_week adm0=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 \ + --out-fasta geography_tmp/fetch.fa \ + --out-metadata geography_tmp/fetch.csv \ + --restrict + + $project_dir/../bin/geography_cleaning/geography_cleaning.py \ + --metadata geography_tmp/fetch.csv \ + --country-col adm0 \ + --adm1-col adm1 \ + --adm2-col adm2 \ + --outer-postcode-col adm2_private \ + --mapping-utils-dir ${geography_utils} \ + --epiweek-col epi_week \ + --sample-id-col gisaid_accession \ + --outdir geography + + rm -rf geography_tmp + """ +} + + +process add_gisaid_geography_to_metadata { + /** + * Adds GISAID geography to combined metadata + * @input gisaid_metadata, geography_metadata + * @output metadata + */ + + publishDir "${publish_dev}/gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"gisaid_master.csv"}, overwrite: true + memory { 1.GB * task.attempt + gisaid_metadata.size() * 2.B } + + input: + path gisaid_metadata + path geography_metadata + + output: + path "gisaid_geography.csv", emit: metadata + + script: + """ + fastafunk add_columns \ + --in-metadata ${gisaid_metadata} \ + --in-data ${geography_metadata} \ + --index-column sequence_name \ + --join-on sequence_name \ + --force-overwrite \ + --new-columns edin_admin_0 edin_admin_1 edin_admin_2 adm1 adm1_raw adm2 outer_postcode adm2_raw
adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ + --where-column edin_admin_0=adm0 edin_admin_1=adm1 edin_admin_2=adm2 \ + --out-metadata "gisaid_geography.csv" + """ +} + + +process make_delta_by_utla_summary { + /** + * Summarizes delta counts by utla + * @input metadata + * @output csv + */ + + publishDir "${publish_dir}/cog", pattern: "*.csv", mode: 'copy', overwrite: false + + input: + path metadata + + output: + path "UTLA_genome_counts_${params.date}.csv" + + script: + """ + $project_dir/../bin/summarise_genomes_by_utla.py \ + --metadata ${metadata} \ + --date ${params.date} + """ +} + + +process drop_anon_id { + /** + * Drops the anonymous_sample_id column from master metadata csv + * @input metadata + * @output metadata + */ + + input: + path metadata + + output: + path "${metadata.baseName}_anon.csv" + + script: + """ + fastafunk drop_columns --in-metadata ${metadata} --columns anonymous_sample_id --out-metadata ${metadata.baseName}_anon.csv + """ +} + +process publish_master_metadata { + /** + * Publishes master metadata csv for this category + * @input metadata + * @output metadata + */ + + publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' + + input: + path metadata + val category + + output: + path "${category}/${category}_master.csv" + + script: + """ + mkdir -p ${category} + cp ${metadata} ${category}/${category}_master.csv + """ +} + + +geography_utils = file(params.uk_geography) + + +workflow clean_geography_cog_uk { + take: + uk_fasta + uk_metadata + main: + uk_geography(uk_fasta, uk_metadata) + add_uk_geography_to_metadata(uk_metadata,uk_geography.out.geography) + make_delta_by_utla_summary(add_uk_geography_to_metadata.out.metadata) + drop_anon_id(add_uk_geography_to_metadata.out.metadata) + publish_master_metadata(drop_anon_id.out, "cog") + emit: + metadata = add_uk_geography_to_metadata.out.metadata +} + +workflow clean_geography_gisaid { + take: + gisaid_fasta + gisaid_metadata + main: + if
( params.geography ){ + gisaid_geography(gisaid_fasta, gisaid_metadata) + add_gisaid_geography_to_metadata(gisaid_metadata,gisaid_geography.out.geography) + add_gisaid_geography_to_metadata.out.metadata.set{ new_gisaid_metadata } + } else { + new_gisaid_metadata = gisaid_metadata + } + publish_master_metadata(new_gisaid_metadata, "gisaid") + emit: + metadata = new_gisaid_metadata +} + +workflow { + uk_fasta = Channel.fromPath(params.uk_fasta) + uk_metadata = Channel.fromPath(params.uk_metadata) + clean_geography_cog_uk(uk_fasta, uk_metadata) +} diff --git a/workflows/modules/deduplicate.nf b/workflows/modules/deduplicate.nf new file mode 100644 index 0000000..7cc7397 --- /dev/null +++ b/workflows/modules/deduplicate.nf @@ -0,0 +1,208 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dev = file(params.publish_dev) + + +process annotate_with_unmapped_genome_completeness { + /** + * Adds a column to metadata with proportion of genome which is complete + * @input fasta, metadata + * @output metadata + */ + + input: + path fasta + path metadata + + output: + path "${metadata.baseName}.annotated.csv" + + script: + """ + $project_dir/../bin/annotate_with_unmapped_genome_completeness.py \ + --in-fasta ${fasta} \ + --in-metadata ${metadata} \ + --out-metadata "${metadata.baseName}.annotated.csv" + + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.annotated.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${metadata.baseName}.annotated.csv" | wc -l) + exit 1 + fi + """ +} + +process uk_remove_duplicates_COGID_by_proportionN { + /** + * Where duplicate COGID, keeps the most complete + * @input uk_fasta, uk_metadata + * @output uk_fasta_updated, uk_metadata_updated + */ + + input: + path uk_fasta + path uk_metadata + + output: + path "${uk_fasta.baseName}.deduplicated_by_cogid.fa", emit: uk_fasta_updated + path "${uk_metadata.baseName}.deduplicated_by_cogid.csv", emit: uk_metadata_updated 
+ + script: + """ + $project_dir/../bin/uk_remove_duplicates_COGID_by_proportionN.py \ + --in-fasta ${uk_fasta} \ + --in-metadata ${uk_metadata} \ + --out-fasta "${uk_fasta.baseName}.deduplicated_by_cogid.fa" \ + --out-metadata "${uk_metadata.baseName}.deduplicated_by_cogid.csv" + + if [[ \$(cat "${uk_metadata}" | wc -l) != \$(cat "${uk_metadata.baseName}.deduplicated_by_cogid.csv" | wc -l) ]] + then + echo \$(cat "${uk_metadata}" | wc -l) + echo \$(cat "${uk_metadata.baseName}.deduplicated_by_cogid.csv" | wc -l) + exit 1 + fi + """ +} + + +process remove_duplicates_by_date { + /** + * Where duplicate sequence_name, keeps the earliest + * @input fasta, metadata + * @output fasta_updated, metadata_updated + */ + + memory { 1.GB * task.attempt + metadata.size() * 2.B } + + input: + path fasta + path metadata + + output: + path "${fasta.baseName}.deduplicated.fa", emit: fasta_updated + path "${metadata.baseName}.deduplicated.csv", emit: metadata_updated + + script: + """ + $project_dir/../bin/remove_duplicates_by_date.py \ + --in-fasta ${fasta} \ + --in-metadata ${metadata} \ + --out-fasta "${fasta.baseName}.deduplicated.fa" \ + --out-metadata "${metadata.baseName}.deduplicated.csv" + + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.deduplicated.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${metadata.baseName}.deduplicated.csv" | wc -l) + exit 1 + fi + """ +} + + +process unify_headers { + input: + path fasta + path metadata + + output: + path "${fasta.baseName}.UH.fa" + + script: + """ + #!/usr/bin/env python3 + from Bio import SeqIO + import csv + + alignment = SeqIO.index("${fasta}", "fasta") + + with open("${metadata}", 'r', newline = '') as csv_in, \ + open("${fasta.baseName}.UH.fa", "w") as fasta_out: + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + id_key = "fasta_header" + if "edin_header" in reader.fieldnames: + id_key = "edin_header" + for row in reader: + if 
row["why_excluded"]: + print("excluded") + continue + if row[id_key] in alignment: + record = alignment[row[id_key]] + fasta_out.write(">" + row["sequence_name"] + "\\n") + fasta_out.write(str(record.seq) + "\\n") + else: + print(id_key, row[id_key]) + """ +} + + +process uk_label_sourceid_duplicates_to_omit { + /** + * Where duplicate source_id, labels all but the earliest as duplicates + * @input uk_metadata + * @output uk_metadata_updated, deduplicate_log + */ + + publishDir "${publish_dev}/cog_gisaid/", pattern: "*.log", mode: 'copy' + + input: + path uk_metadata + + output: + path "${uk_metadata.baseName}.deduplicated_by_sourceid.csv", emit: uk_metadata_updated + path "deduplicated_by_sourceid.log", emit: deduplicate_log + + script: + """ + $project_dir/../bin/uk_label_sourceid_duplicates_to_omit.py \ + --in-metadata ${uk_metadata} \ + --out-metadata "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" + + if [[ \$(cat "${uk_metadata}" | wc -l) != \$(cat "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" | wc -l) ]] + then + echo \$(cat "${uk_metadata}" | wc -l) + echo \$(cat "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" | wc -l) + exit 1 + fi + """ +} + + +workflow deduplicate_cog_uk { + take: + uk_fasta + uk_metadata + main: + annotate_with_unmapped_genome_completeness(uk_fasta, uk_metadata) + uk_remove_duplicates_COGID_by_proportionN(uk_fasta, annotate_with_unmapped_genome_completeness.out) + unify_headers(uk_remove_duplicates_COGID_by_proportionN.out.uk_fasta_updated, uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated) + uk_label_sourceid_duplicates_to_omit(uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated) + emit: + fasta = unify_headers.out + metadata = uk_label_sourceid_duplicates_to_omit.out.uk_metadata_updated +} + + +workflow deduplicate_gisaid { + take: + gisaid_fasta + gisaid_metadata + main: + annotate_with_unmapped_genome_completeness(gisaid_fasta, gisaid_metadata) +
remove_duplicates_by_date(gisaid_fasta, annotate_with_unmapped_genome_completeness.out) + unify_headers(remove_duplicates_by_date.out.fasta_updated, remove_duplicates_by_date.out.metadata_updated) + emit: + fasta = unify_headers.out + metadata = remove_duplicates_by_date.out.metadata_updated +} + + +workflow { + uk_fasta = file(params.uk_fasta) + uk_metadata = file(params.uk_metadata) + deduplicate_cog_uk(uk_fasta, uk_metadata) +} diff --git a/workflows/modules/filter_and_trim.nf b/workflows/modules/filter_and_trim.nf new file mode 100644 index 0000000..e454587 --- /dev/null +++ b/workflows/modules/filter_and_trim.nf @@ -0,0 +1,242 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dev = file(params.publish_dev) + + +process filter_low_coverage_sequences { + /** + * Keeps only sequences with completeness greater than min_covg threshold + * @input alignment, metadata + * @output alignment_updated, metadata_updated + * @params min_covg + */ + + input: + path alignment + path metadata + + output: + path "${alignment.baseName}.low_covg_filtered.fasta", emit: fasta_updated + path "${metadata.baseName}.low_covg_filtered.csv", emit: metadata_updated + + script: + if (!params.min_covg) + """ + mv "${alignment}" "${alignment.baseName}.low_covg_filtered.fasta" + mv "${metadata}" "${metadata.baseName}.low_covg_filtered.csv" + """ + else + """ + #!/usr/bin/env python3 + from Bio import SeqIO + import csv + + alignment = SeqIO.index("${alignment}", "fasta") + + with open("${metadata}", 'r', newline = '') as csv_in, \ + open("${metadata.baseName}.low_covg_filtered.csv", 'w', newline = '') as csv_out, \ + open("${alignment.baseName}.low_covg_filtered.fasta", 'w') as fasta_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in 
reader: + if row["why_excluded"]: + writer.writerow(row) + continue + id = row["sequence_name"] + if id in alignment: + seq = str(alignment[id].seq) + mapped_completeness = float(len(seq.replace("N", "")) / len(seq)) + if mapped_completeness >= float(${params.min_covg} / 100): + writer.writerow(row) + fasta_out.write(">" + id + "\\n") + fasta_out.write(seq + "\\n") + else: + row["why_excluded"] = "low mapped_completeness" + writer.writerow(row) + """ +} + + +process trim_alignment { + /** + * Trims start and end of alignment + * @input alignment + * @output alignment_updated + * @params trim_start, trim_end + */ + + input: + path alignment + + output: + path "${alignment.baseName}.trimmed.fa" + + script: + if (params.trim_start && params.trim_end) + """ + #!/usr/bin/env python3 + from Bio import SeqIO + + strt = int(${params.trim_start}) + stp = int(${params.trim_end}) + + with open("${alignment}", "r") as fasta_in, \ + open("${alignment.baseName}.trimmed.fa", "w") as fasta_out: + + for record in SeqIO.parse(fasta_in, "fasta"): + seq = str(record.seq).upper() + new_seq = ("N" * strt) + seq[strt:stp] + ("N" * (len(seq) - stp)) + fasta_out.write(">" + record.id + "\\n") + fasta_out.write(new_seq + "\\n") + """ + else + """ + mv "${alignment}" "${alignment.baseName}.trimmed.fa" + """ +} + + +process distance_QC { + /** + * Runs datafunk distance_to_root on the sequences + * @input fasta, metadata + * @output "QC_distances.tsv" + */ + publishDir "${publish_dev}", pattern: "*/*.tsv", mode: 'copy' + + + input: + path fasta + path metadata + val category + + output: + path "${category}/${category}_QC_distances.tsv" + + script: + """ + datafunk distance_to_root \ + --input-fasta ${fasta} \ + --input-metadata ${metadata} + + mkdir -p ${category} + mv distances.tsv "${category}/${category}_QC_distances.tsv" + """ +} + + +process filter_on_distance_to_WH04 { + /** + * Restricts to samples within distance x of WH04 + * @input fasta, metadata, distances + * @output fasta, metadata + */ + +
input: + path fasta + path metadata + path distances + + output: + path "${fasta.baseName}.distance_filtered.fa", emit: fasta + path "${metadata.baseName}.distance_filtered.csv", emit: metadata + + script: + """ + #!/usr/bin/env python3 + from Bio import SeqIO + import csv + + reject = set() + with open("${distances}", 'r', newline = '') as distances_in: + reader = csv.DictReader(distances_in, delimiter="\t", quotechar='\"', dialect = "unix") + for row in reader: + sequence_name = row['sequence_name'] + distance = float(row['distance_stdevs']) + if distance >= 4.0: + reject.add(sequence_name) + + alignment = SeqIO.index("${fasta}", "fasta") + + with open("${metadata}", 'r', newline = '') as csv_in, \ + open("${metadata.baseName}.distance_filtered.csv", 'w', newline = '') as csv_out, \ + open("${fasta.baseName}.distance_filtered.fa", 'w') as fasta_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + if row["why_excluded"]: + writer.writerow(row) + continue + id = row["sequence_name"] + if id in reject: + row["why_excluded"] = "distance to WH04 more than 4.0 epi-week std devs" + writer.writerow(row) + continue + if id in alignment: + writer.writerow(row) + seq = str(alignment[id].seq) + fasta_out.write(">" + id + "\\n") + fasta_out.write(seq + "\\n") + """ +} + + +workflow filter_and_trim_gisaid { + take: + gisaid_fasta + gisaid_metadata + main: + filter_low_coverage_sequences(gisaid_fasta, gisaid_metadata) + trim_alignment(filter_low_coverage_sequences.out.fasta_updated) + if ( params.distance_qc ){ + distance_QC(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, "gisaid") + filter_on_distance_to_WH04(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, distance_QC.out) + ch_fasta = filter_on_distance_to_WH04.out.fasta + ch_metadata =
filter_on_distance_to_WH04.out.metadata + } else { + ch_fasta = trim_alignment.out + ch_metadata = filter_low_coverage_sequences.out.metadata_updated + } + emit: + fasta = ch_fasta + metadata = ch_metadata +} + + +workflow filter_and_trim_cog_uk { + take: + uk_fasta + uk_metadata + main: + filter_low_coverage_sequences(uk_fasta, uk_metadata) + trim_alignment(filter_low_coverage_sequences.out.fasta_updated) + if ( params.distance_qc ){ + distance_QC(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, "cog") + filter_on_distance_to_WH04(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, distance_QC.out) + ch_fasta = filter_on_distance_to_WH04.out.fasta + ch_metadata = filter_on_distance_to_WH04.out.metadata + } else { + ch_fasta = trim_alignment.out + ch_metadata = filter_low_coverage_sequences.out.metadata_updated + } + emit: + fasta = ch_fasta + metadata = ch_metadata +} + +workflow { + uk_fasta = file(params.uk_fasta) + uk_metadata = file(params.uk_metadata) + + filter_and_trim_cog_uk(uk_fasta, + uk_metadata) +} diff --git a/workflows/modules/pangolin.nf b/workflows/modules/pangolin.nf new file mode 100644 index 0000000..a1a814c --- /dev/null +++ b/workflows/modules/pangolin.nf @@ -0,0 +1,342 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dir = file(params.publish_dir) + + +process check_for_pangolin_update { + /** + * Checks if there is a new version of pangolin and sets param flag if there is + */ + output: + env PANGOLIN_UPDATED + + script: + if ( params.auto_update_pangolin ) + """ + PANGO_VERSION=\$(pangolin --all-versions) + echo \$PANGO_VERSION + pangolin --update + sleep 5s + NEW_PANGO_VERSION=\$(pangolin --all-versions) + echo \$NEW_PANGO_VERSION + if [[ "\$PANGO_VERSION" == "\$NEW_PANGO_VERSION" ]]; then + PANGOLIN_UPDATED=false + else + PANGOLIN_UPDATED=true + fi + """ + else + """ + PANGOLIN_UPDATED=false + """ + +} + + +process extract_sequences_for_pangolin { + /** + * If update_all_lineage_assignments flag set,
or no previous provided, outputs the input files. + * Otherwise, extracts lineageless sequences from FASTA to run pangolin on, and updates + * metadata with previous lineages + * @input fasta, metadata + * @output pangolin_fasta, metadata_with_previous + * @params previous_metadata, update_all_lineage_assignments + */ + memory {task.attempt * 6.GB} + + input: + path fasta + path metadata + env PANGOLIN_UPDATED + + output: + path "${fasta.baseName}.for_pangolin.fa", emit: pangolin_fasta + path "${metadata.baseName}.with_previous.csv", emit: metadata_with_previous + + script: + if (params.update_all_lineage_assignments || !params.previous_metadata ) + """ + mv "${fasta}" "${fasta.baseName}.for_pangolin.fa" + mv "${metadata}" "${metadata.baseName}.with_previous.csv" + """ + else + """ + echo "Pangolin updated: \$PANGOLIN_UPDATED" + if [ "\$PANGOLIN_UPDATED" == "true" ] + then + mv "${fasta}" "${fasta.baseName}.for_pangolin.fa" + mv "${metadata}" "${metadata.baseName}.with_previous.csv" + else + $project_dir/../bin/prepare_for_pangolin.py \ + --in-fasta ${fasta} \ + --in-metadata ${metadata} \ + --previous-metadata ${params.previous_metadata} \ + --out-fasta "${fasta.baseName}.for_pangolin.fa" \ + --out-metadata "${metadata.baseName}.with_previous.csv" + if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.with_previous.csv" | wc -l) ]] + then + echo \$(cat "${metadata}" | wc -l) + echo \$(cat "${metadata.baseName}.with_previous.csv" | wc -l) + exit 1 + fi + fi + """ +} + +process run_pangolin { + /** + * Runs PANGOLIN on input fasta + * @input fasta + * @output lineage report + */ + cpus 4 + memory { task.attempt * 8.GB } + + input: + path fasta + + output: + path "pangolin/lineage_report.csv", emit: report + //path "pangolin/sequences.aln.fasta", emit: alignment + + script: + if (params.skip_designation_hash) + """ + pangolin "${fasta}" \ + --outdir pangolin \ + --tempdir pangolin_tmp \ + --alignment \ + --analysis-mode fast \ + --skip-designation-hash \
+ -t ${task.cpus} + """ + else + """ + pangolin "${fasta}" \ + --outdir pangolin \ + --tempdir pangolin_tmp \ + --alignment \ + --analysis-mode fast \ + -t ${task.cpus} + """ +} + +process run_pangolin_usher { + /** + * Runs PANGOLIN on input fasta + * @input fasta + * @output pangolin_fasta + */ + + cpus 16 + + input: + path fasta + + output: + path "pangolin/usher_lineage_report.csv" + + script: + if (params.skip_designation_hash) + """ + pangolin "${fasta}" \ + --outdir pangolin \ + --tempdir pangolin_tmp \ + --outfile usher_lineage_report.csv \ + --usher \ + -t ${task.cpus} \ + --skip-designation-hash + """ + else + """ + pangolin "${fasta}" \ + --outdir pangolin \ + --tempdir pangolin_tmp \ + --outfile usher_lineage_report.csv \ + --usher -t ${task.cpus} + """ +} + +process add_new_pangolin_lineages_to_metadata { + /** + * Updates metadata with new PANGOLIN lineage assignments + * @input metadata, pangolin_csv + * @output metadata_updated + */ + + memory { task.attempt * metadata.size() * 3.B } + + input: + path metadata + path pangolin_csv + + output: + path "${metadata.baseName}.with_pangolin.csv", emit: metadata + path "pango.log", emit: log + + script: + """ + $project_dir/../bin/prepare_for_pangolin.py \ + --in-metadata ${metadata} \ + --previous-metadata ${pangolin_csv} \ + --out-metadata "${metadata.baseName}.with_pangolin.csv" + """ +} + +process add_pangolin_usher_to_metadata { + /** + * Adds usher pangolin calls to metadata + * @input metadata, usher report + * @output metadata + */ + + input: + path metadata + path usher_report + + output: + path "${metadata.baseName}.with_usher.csv" + + script: + """ + fastafunk add_columns \ + --in-metadata ${metadata} \ + --in-data ${usher_report} \ + --index-column taxon \ + --join-on taxon \ + --new-columns usher_lineage usher_lineages_version \ + --where-column usher_lineage=lineage usher_lineages_version=version \ + --out-metadata "${metadata.baseName}.with_usher.csv" + """ +} + +process cache_lineages_report 
{ + /** + * Creates a map from sequence hash to pangolin report calls + * @input metadata + * @output metadata + */ + publishDir "${publish_dir}/pangolin", pattern: "*.cache.csv", mode: 'copy' + + input: + path fasta + path metadata + + output: + path "${metadata.baseName}.cache.csv", emit: metadata + + script: + """ + $project_dir/../bin/cache_pangolin_report.py \ + --in-fasta ${fasta} \ + --in-metadata ${metadata} \ + --out-metadata "${metadata.baseName}.cache.csv" + """ +} + + +process announce_summary { + /** + * Summarizes pangolin into JSON + * @input fastas + */ + + input: + path pango_input + path pango_log + + output: + path "announce.json" + + script: + if (params.webhook) + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Finished running pangolin ${params.date}*\\n" >> announce.json + echo "> Number of sequences input to pangolin for new lineage assignments : \$(cat ${pango_input} | grep '>' | wc -l)\\n" >> announce.json + echo "> \$(cat ${pango_log})\\n" >> announce.json + echo '"}' >> announce.json + + echo 'webhook ${params.webhook}' + + curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} + """ + else + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Finished running pangolin ${params.date}*\\n" >> announce.json + echo "> Number of sequences input to pangolin for new lineage assignments : \$(cat ${pango_input} | grep '>' | wc -l)\\n" >> announce.json + echo "> \$(cat ${pango_log})\\n" >> announce.json + echo '"}' >> announce.json + """ +} + + +process publish_metadata { + /** + * Publishes metadata csv for this category + * @input metadata + * @output metadata + */ + + publishDir "${publish_dir}", pattern: "*/*.csv", mode: 'copy' + + input: + path metadata + val category + + output: + path "${category}/pangolin_master.csv" + + script: + """ + mkdir -p ${category} + cp ${metadata} ${category}/pangolin_master.csv + """ +} + +workflow pangolin { + take: + in_fasta + in_metadata + 
pangolin_updated + main: + extract_sequences_for_pangolin(in_fasta, in_metadata, pangolin_updated) + extract_sequences_for_pangolin.out.pangolin_fasta.splitFasta( by: params.chunk_size, file: true ) + .set{ pangolin_chunks } + run_pangolin(pangolin_chunks) + run_pangolin.out.report.collectFile(newLine: true, keepHeader: true, skip: 1) + .set{ pangolin_result } + if (params.add_usher_pangolin) { + run_pangolin_usher(pangolin_chunks) + run_pangolin_usher.out.collectFile(newLine: true, keepHeader: true, skip: 1) + .set{ pangolin_usher_result } + add_pangolin_usher_to_metadata(pangolin_result, pangolin_usher_result) + post_pangolin_metadata = add_pangolin_usher_to_metadata.out + } else { + post_pangolin_metadata = pangolin_result + } + add_new_pangolin_lineages_to_metadata(extract_sequences_for_pangolin.out.metadata_with_previous, post_pangolin_metadata) + + if (params.cache_pangolin){ + cache_lineages_report(in_fasta, post_pangolin_metadata) + } + + announce_summary(extract_sequences_for_pangolin.out.pangolin_fasta, add_new_pangolin_lineages_to_metadata.out.log) + emit: + metadata = add_new_pangolin_lineages_to_metadata.out.metadata + report = post_pangolin_metadata +} + + +workflow { + uk_fasta = file(params.uk_fasta) + uk_metadata = file(params.uk_metadata) + check_for_pangolin_update() + + pangolin(uk_fasta, uk_metadata, check_for_pangolin_update.out) + publish_metadata(pangolin.out.report, "pangolin") +} diff --git a/workflows/modules/preprocess_cog_uk.nf b/workflows/modules/preprocess_cog_uk.nf new file mode 100644 index 0000000..9ba6caa --- /dev/null +++ b/workflows/modules/preprocess_cog_uk.nf @@ -0,0 +1,372 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir + + +process uk_strip_header_digits_and_unalign { + /** + * Strips extra header info from FASTA, removed '-' from sequence + * @input uk_fasta + * @output uk_fasta_updated + */ + + input: + path uk_fasta + + output: + path "${uk_fasta.baseName}.header_stripped.fasta" + + 
script: + """ + #!/usr/bin/env python3 + from Bio import SeqIO + import re + def is_iupac(strg, search=re.compile(r'[^ACGTRYSWKMBDHVNacgtryswkmbdhvn-]').search): + return not bool(search(strg)) + + fasta_in = SeqIO.parse("${uk_fasta}", "fasta") + with open("${uk_fasta.baseName}.header_stripped.fasta", 'w') as f: + for record in fasta_in: + seq = str(record.seq).replace('-','') + seq = seq.replace('?','N') + if not is_iupac(seq): + continue + ID = record.description.split("|")[0] + f.write(">" + ID + "\\n") + f.write(seq + "\\n") + """ +} + +process uk_add_published_date_to_metadata { + /** + * Takes the MAJORA TSV of metadata and adds the published_data parameter from + * majora.pag_lookup.tsv + * @input uk_metadata, uk_pag_metadata + * @output uk_metadata_updated_date + */ + + input: + path uk_updated_metadata + path uk_metadata_pag + + output: + path "${uk_updated_metadata.baseName}.pag.csv" + + script: + """ + fastafunk add_columns \ + --in-metadata ${uk_updated_metadata} \ + --in-data ${uk_metadata_pag} \ + --index-column central_sample_id \ + --join-on central_sample_id \ + --force-overwrite \ + --new-columns published_date \ + --out-metadata "${uk_updated_metadata.baseName}.pag.csv" + """ +} + +process uk_anonymise_ids { + /** + If on or after 30th June 2023, replace central ID + for anonymous ID, if they are present. 
+    @input uk_metadata
+    @output uk_metadata_anon
+    */
+
+    input:
+    path uk_metadata
+
+    output:
+    path "${uk_metadata.baseName}.anon.tsv"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import datetime
+    import csv
+
+    anon_samp_id_date = datetime.datetime(2023, 6, 30).date()
+
+    with open("${uk_metadata}", 'r', newline = '') as csv_in, open("${uk_metadata.baseName}.anon.tsv", 'w', newline = '') as csv_out:
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix", delimiter="\t")
+        writer.writeheader()
+
+        for row in reader:
+            if datetime.datetime.strptime(row["published_date"], "%Y-%m-%d").date() >= anon_samp_id_date:
+                if row["anonymous_sample_id"]:
+                    row["central_sample_id"] = row["anonymous_sample_id"]
+            writer.writerow(row)
+    """
+}
+
+
+process uk_add_columns_to_metadata {
+    /**
+    * Takes the MAJORA TSV of metadata and adds/updates columns for sample_date, pillar_2,
+    * sequence_name, covv_accession_id, edin_epi_week, edin_epi_day and adm0
+    * @input uk_metadata
+    * @output uk_metadata_updated
+    * @params uk_accessions, uk_updated_dates
+    */
+
+    input:
+    path uk_metadata
+    path uk_accessions
+    path uk_updated_dates
+
+    output:
+    path "${uk_metadata.baseName}.updated.csv"
+
+    script:
+    """
+    $project_dir/../bin/add_to_uk_metadata.py \
+        --in-metadata ${uk_metadata} \
+        --out-metadata ${uk_metadata.baseName}.updated.csv \
+        --accession-file ${uk_accessions} \
+        --updated-date-file ${uk_updated_dates}
+    """
+}
+
+
+process uk_filter_omitted_sequences {
+    /**
+    * Takes a FASTA and METADATA and excludes samples specified in an exclusion file.
+    * Every metadata row is kept; rows whose sequence is dropped get a why_excluded reason.
+    * @input uk_fasta, uk_metadata
+    * @output uk_fasta_updated, uk_metadata_updated
+    * @params uk_omissions
+    */
+    input:
+    path uk_fasta
+    path uk_metadata
+    path uk_omissions
+
+    output:
+    path "${uk_fasta.baseName}.omit_filtered.fa", emit: fasta
+    path "${uk_metadata.baseName}.omit_filtered.csv", emit: metadata
+
+    script:
+    if ( params.uk_omissions )
+    """
+    #!/usr/bin/env python3
+    from Bio import SeqIO
+    import csv
+
+    alignment = SeqIO.index("${uk_fasta}", "fasta")
+
+    omissions = set()
+    with open("${uk_omissions}", "r") as f:
+        for line in f:
+            omissions.add(line.rstrip())
+
+    with open("${uk_metadata}", 'r', newline = '') as csv_in, \
+        open("${uk_metadata.baseName}.omit_filtered.csv", 'w', newline = '') as csv_out, \
+        open("${uk_fasta.baseName}.omit_filtered.fa", "w") as fasta_out:
+
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+
+        for row in reader:
+            if row["central_sample_id"] in omissions:
+                row["why_excluded"] = "central_sample_id in omissions_file"
+                writer.writerow(row)
+                continue
+
+            if row["fasta_header"] not in alignment:
+                row["why_excluded"] = "sequences was missing from input or contained non-IUPAC characters"
+                writer.writerow(row)
+                continue
+
+            record = alignment[row["fasta_header"]]
+            writer.writerow(row)
+            fasta_out.write(">" + record.id + "\\n")
+            fasta_out.write(str(record.seq) + "\\n")
+    """
+    else
+    """
+    mv "${uk_fasta}" "${uk_fasta.baseName}.omit_filtered.fa"
+    mv "${uk_metadata}" "${uk_metadata.baseName}.omit_filtered.csv"
+    """
+}
+
+process uk_filter_on_sample_date {
+    /**
+    * If a time window (in days) is provided, excludes samples from FASTA and
+    * METADATA files which do not fall within X days of date.
+    * Every metadata row is written out; rows whose sequence is not written to the
+    * FASTA carry the reason in why_excluded. Without time_window and date the
+    * inputs are passed through renamed.
+    * @input uk_fasta, uk_metadata
+    * @output uk_fasta_update, uk_metadata_updated
+    * @params time_window, date
+    */
+
+    input:
+    path uk_fasta
+    path uk_metadata
+
+    output:
+    path "${uk_fasta.baseName}.date_filtered.fa", emit: fasta
+    path "${uk_metadata.baseName}.date_filtered.csv", emit: metadata
+
+    script:
+    if ( params.time_window && params.date)
+    """
+    #!/usr/bin/env python3
+    import datetime
+    from Bio import SeqIO
+    import csv
+
+    indexed_fasta = SeqIO.index("${uk_fasta}", "fasta")
+
+    window = datetime.timedelta(int("${params.time_window}"))
+    todays_date = datetime.datetime.strptime("${params.date}", '%Y-%m-%d').date()
+    # Samples dated before this day fall outside the time window
+    cutoff_date = todays_date - window
+
+    with open("${uk_metadata}", 'r', newline = '') as csv_in, \
+        open("${uk_metadata.baseName}.date_filtered.csv", 'w', newline = '') as csv_out, \
+        open("${uk_fasta.baseName}.date_filtered.fa", "w") as fasta_out:
+
+        reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
+        writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix")
+        writer.writeheader()
+
+        for row in reader:
+            try:
+                date = datetime.datetime.strptime(row["sample_date"], '%Y-%m-%d').date()
+            except (KeyError, TypeError, ValueError):
+                # Missing or unparseable date: keep the metadata row, skip the sequence
+                row["why_excluded"] = "no sample_date"
+                writer.writerow(row)
+                continue
+
+            if cutoff_date > date:
+                row["why_excluded"] = "sample_date older than %s days" % window.days
+                writer.writerow(row)
+                continue
+
+            if row["fasta_header"] not in indexed_fasta:
+                row["why_excluded"] = "sequences was missing from input or contained non-IUPAC characters"
+                writer.writerow(row)
+                continue
+
+            writer.writerow(row)
+
+            seq_rec = indexed_fasta[row["fasta_header"]]
+            fasta_out.write(">" + seq_rec.id + "\\n")
+            fasta_out.write(str(seq_rec.seq) + "\\n")
+    """
+    else
+    """
+    mv "${uk_fasta}" "${uk_fasta.baseName}.date_filtered.fa"
+    mv "${uk_metadata}" "${uk_metadata.baseName}.date_filtered.csv"
+    """
+}
+
+
+process add_previous_uk_lineage_to_metadata {
+    /**
+    * Adds uk_lineage where previously assigned
+    * @input metadata
+    * @output metadata
+    */
+
+    memory { 2.GB * task.attempt + metadata.size() * 2.B }
+
+    input:
+    path metadata
+
+    output:
+    path "${metadata.baseName}.with_uk_lineage.csv"
+
+    script:
+    if ( !params.previous_metadata )
+    """
+    mv ${metadata}
"${metadata.baseName}.with_uk_lineage.csv" + """ + else + """ + fastafunk add_columns \ + --in-metadata ${metadata} \ + --in-data ${params.previous_metadata} \ + --index-column sequence_name \ + --join-on sequence_name \ + --new-columns uk_lineage \ + --out-metadata "${metadata.baseName}.with_uk_lineage.csv" + """ +} + + +process announce_summary { + /** + * Summarizes preprocess into JSON + * @input fastas + */ + + input: + path original + path strip_header + path filter_omitted_sequences + path filter_on_sample_date + + output: + path "announce.json" + + script: + if (params.webhook) + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Preprocessing COG input ${params.date}*\\n" >> announce.json + echo "> Number of sequences in COG input files : \$(cat ${original} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after header stripped : \$(cat ${strip_header} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after filtering omitted: \$(cat ${filter_omitted_sequences} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after filtering by sample date with time window ${params.time_window}: \$(cat ${filter_on_sample_date} | grep '>' | wc -l)\\n" >> announce.json + echo '"}' >> announce.json + + echo 'webhook ${params.webhook}' + + curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} + """ + else + """ + echo '{"text":"' > announce.json + echo "*${params.whoami}: Preprocessing COG input ${params.date}*\\n" >> announce.json + echo "> Number of sequences in COG input files : \$(cat ${original} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after header stripped : \$(cat ${strip_header} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after filtering omitted: \$(cat ${filter_omitted_sequences} | grep '>' | wc -l)\\n" >> announce.json + echo "> Number of sequences after filtering by sample date with time window 
${params.time_window}: \$(cat ${filter_on_sample_date} | grep '>' | wc -l)\\n" >> announce.json
+    echo '"}' >> announce.json
+    """
+}
+
+uk_updated_dates = file(params.uk_updated_dates)
+uk_omissions = file(params.uk_omissions)
+
+/*
+ * Preprocesses the COG-UK inputs: strips FASTA headers, adds published dates,
+ * anonymises sample ids, adds derived metadata columns, then filters by the
+ * omissions file and by sample date.
+ * Emits the date-filtered FASTA and the metadata annotated with previous uk_lineage.
+ */
+workflow preprocess_cog_uk {
+    take:
+        uk_fasta
+        uk_metadata
+        uk_accessions
+        uk_pag
+    main:
+        uk_strip_header_digits_and_unalign(uk_fasta)
+        uk_add_published_date_to_metadata(uk_metadata, uk_pag)
+        uk_anonymise_ids(uk_add_published_date_to_metadata.out)
+        uk_add_columns_to_metadata(uk_anonymise_ids.out, uk_accessions, uk_updated_dates)
+        uk_filter_omitted_sequences(uk_strip_header_digits_and_unalign.out, uk_add_columns_to_metadata.out, uk_omissions)
+        uk_filter_on_sample_date(uk_filter_omitted_sequences.out.fasta, uk_filter_omitted_sequences.out.metadata)
+        // NOTE(review): this consumes the omission-filtered metadata, so
+        // uk_filter_on_sample_date.out.metadata is never used here - confirm this is intended
+        add_previous_uk_lineage_to_metadata(uk_filter_omitted_sequences.out.metadata)
+        announce_summary(uk_fasta, uk_strip_header_digits_and_unalign.out, uk_filter_omitted_sequences.out.fasta, uk_filter_on_sample_date.out.fasta)
+    emit:
+        fasta = uk_filter_on_sample_date.out.fasta
+        metadata = add_previous_uk_lineage_to_metadata.out
+}
+
+
+// Stand-alone entry point: one argument per take: input of preprocess_cog_uk
+workflow {
+    uk_fasta = file(params.uk_fasta)
+    uk_metadata = file(params.uk_metadata)
+    uk_accessions = file(params.uk_accessions)
+    uk_pag = file(params.uk_pag)
+
+    preprocess_cog_uk(uk_fasta,
+                      uk_metadata,
+                      uk_accessions,
+                      uk_pag)
+}
diff --git a/workflows/modules/preprocess_gisaid.nf b/workflows/modules/preprocess_gisaid.nf
new file mode 100644
index 0000000..f37ac72
--- /dev/null
+++ b/workflows/modules/preprocess_gisaid.nf
@@ -0,0 +1,96 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+project_dir = projectDir
+
+
+process gisaid_process_json {
+    /**
+    * Downloads
+    * @input json
+    * @output gisaid_fasta, gisaid_metadata
+    * @params gisaid_omissions
+    */
+
+    input:
+    path json
+
+    output:
+    path "gisaid.fasta", emit: fasta
+    path "gisaid.csv", emit: metadata
+
+    script:
+    """
+    datafunk process_gisaid_data \
+        --input-json ${json} \
+        --input-metadata False \
+        --exclude-file
${gisaid_omissions} \ + --output-fasta "gisaid.fasta" \ + --output-metadata "gisaid.csv" \ + --exclude-undated + """ +} + + +process gisaid_add_columns_to_metadata { + input: + path gisaid_fasta + path gisaid_metadata + + output: + path "${gisaid_metadata.baseName}.add_metadata.csv" + + script: + """ + #!/usr/bin/env python3 + from Bio import SeqIO + import csv + + alignment = SeqIO.index("${gisaid_fasta}", "fasta") + + with open("${gisaid_metadata}", 'r', newline = '') as csv_in, \ + open("${gisaid_metadata.baseName}.add_metadata.csv", 'w', newline = '') as csv_out: + + reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") + writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ['sequence_name', 'why_excluded'], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") + writer.writeheader() + + for row in reader: + edin_header = row["edin_header"] + new_header = edin_header.split("|")[0] + row['sequence_name'] = new_header + if edin_header not in alignment: + row['why_excluded'] = "filtered during loading from JSON" + elif row["edin_epi_day"] == '': + row['why_excluded'] = "no date" + else: + row['why_excluded'] = "" + writer.writerow(row) + """ +} + + +gisaid_omissions = file(params.gisaid_omissions) + +workflow preprocess_gisaid { + take: + gisaid_json + main: + gisaid_json.splitText( by: params.chunk_size, file: true ).set{ json_chunks } + gisaid_process_json(json_chunks) + gisaid_add_columns_to_metadata(gisaid_process_json.out.fasta, gisaid_process_json.out.metadata) + gisaid_process_json.out.fasta.collectFile(newLine: true).set{ fasta_result } + gisaid_add_columns_to_metadata.out.collectFile(newLine: false, keepHeader: true, skip: 1) + .set{ metadata_result } + emit: + fasta = fasta_result + metadata = metadata_result +} + + +workflow { + gisaid_json = file(params.gisaid_json) + + preprocess_gisaid(gisaid_json) +} diff --git a/workflows/modules/publish_all.nf b/workflows/modules/publish_all.nf new file 
mode 100644 index 0000000..90e8ee0 --- /dev/null +++ b/workflows/modules/publish_all.nf @@ -0,0 +1,427 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +project_dir = projectDir +publish_dir = file(params.publish_dir) +publish_dev = file(params.publish_dev) + + +process combine_cog_gisaid { + /** + * Combines FASTA and METADATA for COG-UK and GISAID + * @input uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata + * @output cog_gisaid_fasta, cog_gisaid_metadata + */ + + publishDir "${publish_dev}/cog_gisaid", pattern: "*.fa", mode: 'copy' + publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_master.csv"} + + input: + path uk_fasta + path uk_metadata + path gisaid_fasta + path gisaid_metadata + + output: + path "cog_gisaid.fa", emit: fasta + path "cog_gisaid.csv", emit: metadata + + script: + """ + fastafunk fetch \ + --in-fasta ${uk_fasta} \ + --in-metadata ${uk_metadata} \ + --index-column sequence_name \ + --filter-column fasta_header covv_accession_id central_sample_id biosample_source_id secondary_identifier root_sample_id source_id \ + sequence_name sample_date safe_sample_date epi_week epi_day collection_date received_date published_date \ + country adm1 adm1_raw adm1_UK adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ + is_uk is_cog_uk \ + submission_org_code submission_user collection_pillar is_pillar_2 is_surveillance is_community is_hcw \ + is_travel_history travel_history \ + lineage lineages_version lineage_conflict lineage_ambiguity_score scorpio_call scorpio_support scorpio_conflict \ + usher_lineage usher_lineages_version \ + source_age source_sex sample_type_collected sample_type_received swab_site \ + ct_n_ct_value ct_n_test_kit ct_n_test_platform ct_n_test_target \ + unmapped_genome_completeness duplicate why_excluded nucleotide_mutations \ + uk_lineage microreact_lineage del_lineage del_introduction phylotype \ 
+ --where-column epi_week=edin_epi_week epi_day=edin_epi_day country=adm0 lineage_support=probability lineages_version=pangoLEARN_version adm1_UK=adm1_raw published_date=sequencing_submission_date \ + --out-fasta "intermediate_cog.fa" \ + --out-metadata "intermediate_cog.csv" \ + --restrict --low-memory + + fastafunk fetch \ + --in-fasta ${gisaid_fasta} \ + --in-metadata ${gisaid_metadata} \ + --index-column sequence_name \ + --filter-column fasta_header covv_accession_id central_sample_id biosample_source_id secondary_identifier root_sample_id source_id \ + sequence_name sample_date safe_sample_date epi_week epi_day collection_date received_date published_date \ + country adm1 adm1_raw adm1_UK adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ + is_uk is_cog_uk \ + submission_org_code submission_user collection_pillar is_pillar_2 is_surveillance is_community is_hcw \ + is_travel_history travel_history \ + lineage lineages_version lineage_conflict lineage_ambiguity_score scorpio_call scorpio_support scorpio_conflict \ + usher_lineage usher_lineages_version \ + source_age source_sex sample_type_collected sample_type_received swab_site \ + ct_n_ct_value ct_n_test_kit ct_n_test_platform ct_n_test_target \ + unmapped_genome_completeness duplicate why_excluded nucleotide_mutations \ + uk_lineage microreact_lineage del_lineage del_introduction phylotype \ + --where-column adm1=edin_admin_1 travel_history=edin_travel published_date=covv_subm_date\ + --out-fasta "intermediate_gisaid.fa" \ + --out-metadata "intermediate_gisaid.csv" \ + --restrict --low-memory + + cat intermediate_cog.fa intermediate_gisaid.fa > cog_gisaid.fa + cat intermediate_cog.csv > cog_gisaid.csv + tail -n+2 intermediate_gisaid.csv >> cog_gisaid.csv + + head -n1 intermediate_cog.csv > head_cog.txt + head -n1 intermediate_gisaid.csv > head_gisaid.txt + cmp --silent head_cog.txt head_gisaid.txt || exit 1 + """ +} + + 
+process combine_mutations {
+    /**
+    * Merges the mutation metadata tables for COG-UK and GISAID on sequence_name
+    * @input uk_mutations, gisaid_mutations
+    * @output cog_gisaid_mutations
+    */
+
+    publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_mutations.csv"}
+
+    input:
+    path uk_mutations
+    path gisaid_mutations
+
+    output:
+    path "cog_gisaid_mutations.csv"
+
+    script:
+    """
+    fastafunk merge \
+        --in-metadata ${uk_mutations} ${gisaid_mutations} \
+        --out-metadata "cog_gisaid_mutations.csv" \
+        --index-column "sequence_name"
+    """
+}
+
+process combine_constellations {
+    /**
+    * Merges the constellation metadata tables for COG-UK and GISAID on sequence_name
+    * @input uk_constellations, gisaid_constellations
+    * @output cog_gisaid_constellations
+    */
+
+    publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_constellations.csv"}
+
+    input:
+    path uk_constellations
+    path gisaid_constellations
+
+    output:
+    path "cog_gisaid_constellations.csv"
+
+    script:
+    """
+    fastafunk merge \
+        --in-metadata ${uk_constellations} ${gisaid_constellations} \
+        --out-metadata "cog_gisaid_constellations.csv" \
+        --index-column "sequence_name"
+    """
+}
+
+process combine_updown {
+    /**
+    * Combines updown metadata for COG-UK and GISAID by appending the GISAID rows
+    * to the COG-UK table, then drops rows containing four consecutive commas
+    * @input uk_updown gisaid_updown
+    * @output cog_gisaid_updown
+    */
+
+    publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_updown.csv"}
+
+    input:
+    path uk_updown
+    path gisaid_updown
+
+    output:
+    path "cog_gisaid_updown.csv"
+
+    script:
+    """
+    cp ${uk_updown} tmp.csv
+    # Start at line 2 so the GISAID header row is not repeated mid-file
+    tail -n+2 ${gisaid_updown} >> tmp.csv
+    grep -v ",,,," tmp.csv > "cog_gisaid_updown.csv"
+    """
+}
+
+
+process split_recipes {
+    input:
+    path recipes
+
+    output:
+    path "*.json"
+
+    script:
+    """
+    #!/usr/bin/env python3
+    import json
+    i = 0
+
+    with open("${recipes}", 'r') as f:
+        recipes = json.load(f)
+
+    for d in
recipes:
+        for entry in recipes[d]:
+            new_recipes = {d:[entry]}
+            with open("%i.json" %i, 'w') as handle:
+                json.dump(new_recipes,handle)
+            i += 1
+    """
+}
+
+
+process publish_cog_global_recipes {
+    /**
+    * Publishes subsets of combined FASTA and METADATA for COG-UK and GISAID
+    * @input uk_unaligned_fasta, uk_aligned_fasta, uk_trimmed_fasta, combined_fasta,
+    * uk_metadata, combined_metadata, combined_mutations, combined_constellations,
+    * combined_updown, recipe
+    * @params publish_recipes.json
+    * @output many
+    */
+
+    publishDir "${publish_dir}/", pattern: "*/*.*", mode: 'copy', overwrite: false
+    publishDir "${publish_dir}/", pattern: "README", mode: 'copy', overwrite: false
+
+    memory { 1.GB * task.attempt + combined_metadata.size() * 4.B }
+    // Process directives take the form: name value (assignment with = is config-file syntax)
+    errorStrategy 'retry'
+    maxRetries 3
+
+    input:
+    tuple path(uk_unaligned_fasta),path(uk_aligned_fasta),path(uk_trimmed_fasta),path(combined_fasta),path(uk_metadata),path(combined_metadata),path(combined_mutations),path(combined_constellations),path(combined_updown),path(recipe)
+
+    output:
+    path "${recipe.baseName}.done.txt", emit: flag
+    path "README", emit: readme
+    path "public/cog_${params.date}_all.fa", optional: true, emit: fasta
+    path "public/cog_${params.date}_metadata.csv", optional: true, emit: metadata
+    path "public/cog_${params.date}_alignment.fa", optional: true, emit: alignment
+    path "public/cog_${params.date}_unmasked_alignment.fa", optional: true, emit: unmasked_alignment
+    path "*/cog_*.*", emit: all
+
+    script:
+    """
+    cp $project_dir/../resources/publish_readme.txt README
+
+    $project_dir/../bin/publish_from_config.py \
+        --unaligned_fasta ${uk_unaligned_fasta} \
+        --aligned_fasta ${uk_aligned_fasta} \
+        --trimmed_fasta ${uk_trimmed_fasta} \
+        --cog_global_fasta ${combined_fasta} \
+        --cog_metadata ${uk_metadata} \
+        --cog_global_metadata ${combined_metadata} \
+        --mutations ${combined_mutations} \
+        --constellations ${combined_constellations} \
+        --updown ${combined_updown} \
+        --recipes ${recipe} \
+        --date ${params.date}
+    touch "${recipe.baseName}.done.txt"
+    """
+}
+
+process publish_s3 {
+    /**
+    * Publishes public files to s3
+    * @input fasta, metadata, alignment, unmasked_alignment
+    * @output s3dir
+    */
+    publishDir "${publish_dev}/", pattern: "s3dir", mode: 'copy'
+
+    input:
+    path fasta
+    path metadata
+    path alignment
+    path unmasked_alignment
+
+    output:
+    path "s3dir"
+
+
+    script:
+    """
+    mkdir -p s3dir
+    cp ${fasta} s3dir/cog_all.fasta
+    cp ${metadata} s3dir/cog_metadata.csv
+    cp ${alignment} s3dir/cog_alignment.fasta
+    cp ${unmasked_alignment} s3dir/cog_unmasked_alignment.fasta
+    """
+}
+
+
+process publish_gisaid_recipes {
+    /**
+    * Publishes subsets of the GISAID FASTA and METADATA according to one recipe
+    * @input gisaid_fasta, gisaid_metadata, gisaid_mutations, gisaid_constellations,
+    * gisaid_updown, recipe
+    * @params publish_recipes.json
+    * @output many
+    */
+
+    publishDir "${publish_dir}/", pattern: "*/*.*", mode: 'copy', overwrite: false
+
+    memory { 1.GB * task.attempt + gisaid_metadata.size() * 8.B }
+    // Process directives take the form: name value (assignment with = is config-file syntax)
+    errorStrategy 'retry'
+    maxRetries 3
+
+    input:
+    tuple path(gisaid_fasta),path(gisaid_metadata),path(gisaid_mutations),path(gisaid_constellations),path(gisaid_updown),path(recipe)
+
+    output:
+    path "*/gisaid_*.*", emit: all
+    path "*/gisaid_*_global_alignment.fa", optional: true, emit: fasta
+    path "*/gisaid_*_global_metadata.csv", optional: true, emit: metadata
+    path "*/gisaid_*_global_mutations.csv", optional: true, emit: mutations
+    path "*/gisaid_*_global_constellations.csv", optional: true, emit: constellations
+    path "*/gisaid_*_global_updown.csv", optional: true, emit: updown
+
+    script:
+    """
+    $project_dir/../bin/publish_from_config.py \
+        --recipes ${recipe} \
+        --date ${params.date} \
+        --gisaid_fasta ${gisaid_fasta} \
+        --gisaid_metadata ${gisaid_metadata} \
+        --mutations ${gisaid_mutations} \
+        --constellations ${gisaid_constellations} \
+        --updown ${gisaid_updown}
+    """
+}
+
+
+process
announce_to_webhook { + input: + file published_files + val name + + script: + if (params.webhook) + """ + echo '{"text":"' > announce.json + echo "*${name} Complete*\\n" >> announce.json + echo "> Dev outputs in : ${publish_dev}\\n" >> announce.json + echo "> Publishable outputs in : ${publish_dir}\\n" >> announce.json + echo '"}' >> announce.json + echo 'webhook ${params.webhook}' + + curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} + """ + else + """ + touch "announce.json" + """ +} + + +geography_utils = file(params.uk_geography) +cog_global_recipes = file(params.publish_cog_global_recipes) +gisaid_recipes = file(params.publish_gisaid_recipes) + + +workflow publish_cog_global { + take: + uk_unaligned_fasta + uk_aligned_fasta + uk_fasta + uk_metadata + uk_mutations + uk_constellations + uk_updown + gisaid_fasta + gisaid_metadata + gisaid_mutations + gisaid_constellations + gisaid_updown + main: + combine_cog_gisaid(uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata) + combine_mutations(uk_mutations, gisaid_mutations) + combine_constellations(uk_constellations, gisaid_constellations) + combine_updown(uk_updown, gisaid_updown) + split_recipes(cog_global_recipes) + recipe_ch = split_recipes.out.flatten() + uk_unaligned_fasta.combine(uk_aligned_fasta) + .combine(uk_fasta) + .combine(combine_cog_gisaid.out.fasta) + .combine(uk_metadata) + .combine(combine_cog_gisaid.out.metadata) + .combine(combine_mutations.out) + .combine(combine_constellations.out) + .combine(combine_updown.out) + .combine(recipe_ch) + .set{ publish_input_ch } + publish_cog_global_recipes(publish_input_ch) + outputs_ch = publish_cog_global_recipes.out.flag.collect() + announce_to_webhook(outputs_ch, "${params.whoami}") + if ( params.s3 ) + { + publish_s3(publish_cog_global_recipes.out.fasta, publish_cog_global_recipes.out.metadata, publish_cog_global_recipes.out.alignment, publish_cog_global_recipes.out.unmasked_alignment) + } +} + + +workflow publish_gisaid { 
+    take:
+        gisaid_fasta
+        gisaid_metadata
+        gisaid_mutations
+        gisaid_constellations
+        gisaid_updown
+    main:
+        split_recipes(gisaid_recipes)
+        recipe_ch = split_recipes.out.flatten()
+        gisaid_fasta.combine(gisaid_metadata)
+                    .combine(gisaid_mutations)
+                    .combine(gisaid_constellations)
+                    .combine(gisaid_updown)
+                    .combine(recipe_ch)
+                    .set{ publish_input_ch }
+        publish_gisaid_recipes(publish_input_ch)
+        outputs_ch = publish_gisaid_recipes.out.all.collect()
+    emit:
+        fasta = publish_gisaid_recipes.out.fasta
+        metadata = publish_gisaid_recipes.out.metadata
+        mutations = publish_gisaid_recipes.out.mutations
+        constellations = publish_gisaid_recipes.out.constellations
+        updown = publish_gisaid_recipes.out.updown
+        published = outputs_ch
+}
+
+
+// Stand-alone entry point: builds one channel per input file and runs the
+// combined COG-UK + GISAID publishing workflow defined in this module.
+workflow {
+    uk_unaligned_fasta = Channel.fromPath(params.uk_unaligned_fasta)
+    uk_aligned_fasta = Channel.fromPath(params.uk_aligned_fasta)
+    uk_fasta = Channel.fromPath(params.uk_fasta)
+    uk_metadata = Channel.fromPath(params.uk_metadata)
+    uk_mutations = Channel.fromPath(params.uk_mutations)
+    uk_constellations = Channel.fromPath(params.uk_constellations)
+    uk_updown = Channel.fromPath(params.uk_updown)
+
+    gisaid_fasta = Channel.fromPath(params.gisaid_fasta)
+    gisaid_metadata = Channel.fromPath(params.gisaid_metadata)
+    gisaid_mutations = Channel.fromPath(params.gisaid_mutations)
+    gisaid_constellations = Channel.fromPath(params.gisaid_constellations)
+    gisaid_updown = Channel.fromPath(params.gisaid_updown)
+
+    // Arguments are positional and follow the take: order of publish_cog_global
+    publish_cog_global(uk_unaligned_fasta,
+                   uk_aligned_fasta,
+                   uk_fasta,
+                   uk_metadata,
+                   uk_mutations,
+                   uk_constellations,
+                   uk_updown,
+                   gisaid_fasta,
+                   gisaid_metadata,
+                   gisaid_mutations,
+                   gisaid_constellations,
+                   gisaid_updown)
+}
diff --git a/workflows/modules/start.nf b/workflows/modules/start.nf
new file mode 100644
index 0000000..46931c2
--- /dev/null
+++ b/workflows/modules/start.nf
@@ -0,0 +1,33 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+publish_dev = file(params.publish_dev)
+
+process get_git_hash {
+
/** + * Gets git commit + */ + publishDir "${publish_dev}", mode: 'copy', overwrite: true + + input: + path commit_file + + output: + path "${commit_file}" + + script: + """ + echo "\n Git hash \t = \t \$( git rev-parse HEAD) \n\n" >> ${commit_file} + """ +} + +workflow start { + params_file = file("${workDir}/input_params.txt") + params_file << "\n#######################################################################################\n\n" + + printMapClosure = { key, value -> + params_file << "$key = $value\n" + } + params.each(printMapClosure) + get_git_hash(params_file) +} diff --git a/workflows/nextflow.config b/workflows/nextflow.config new file mode 100644 index 0000000..0909731 --- /dev/null +++ b/workflows/nextflow.config @@ -0,0 +1,32 @@ +// Global default params, used in configs +workDir = "analysis" + +params { + + // Boilerplate options + help = false + + // cache option makes it a bit easier to set conda or singularity cacheDir + cache = '' + +} + +includeConfig 'config/base.config' + +process { + errorStrategy = { 'retry' } + maxRetries = 5 + + withLabel: retry_increasing_mem { + errorStrategy = { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } + memory = {4.GB * task.attempt} + maxRetries = 5 + } +} + +profiles { + slurm { + process.executor = 'slurm' + process.clusterOptions='--account=lomannj-covid-19-realtime-epidemiology --qos=lomannj --time 600:0 --nodes 1' + } +} diff --git a/workflows/process_cog_uk.nf b/workflows/process_cog_uk.nf index 67f231a..8763fb2 100644 --- a/workflows/process_cog_uk.nf +++ b/workflows/process_cog_uk.nf @@ -19,14 +19,16 @@ workflow process_cog_uk { uk_metadata uk_accessions pangolin_updated + uk_pag main: - preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions) + preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions, uk_pag) pangolin(preprocess_cog_uk.out.fasta, preprocess_cog_uk.out.metadata, pangolin_updated) deduplicate_cog_uk(preprocess_cog_uk.out.fasta, pangolin.out.metadata) align_and_variant_call(deduplicate_cog_uk.out.fasta, deduplicate_cog_uk.out.metadata, "cog") filter_and_trim_cog_uk(align_and_variant_call.out.fasta, align_and_variant_call.out.metadata) clean_geography_cog_uk(filter_and_trim_cog_uk.out.fasta, filter_and_trim_cog_uk.out.metadata) emit: + preprocess_cog_uk.out.metadata unaligned_fasta = deduplicate_cog_uk.out.fasta aligned_fasta = align_and_variant_call.out.fasta trimmed_fasta = filter_and_trim_cog_uk.out.fasta @@ -42,12 +44,14 @@ workflow { ch_uk_fasta = Channel.fromPath(params.uk_fasta) ch_uk_metadata = Channel.fromPath(params.uk_metadata) ch_uk_accessions = Channel.fromPath(params.uk_accessions) + ch_uk_pag = Channel.fromPath(params.uk_pag) check_for_pangolin_update() process_cog_uk(ch_uk_fasta, ch_uk_metadata, ch_uk_accessions, - check_for_pangolin_update.out) + check_for_pangolin_update.out, + ch_uk_pag) ch_gisaid_fasta = Channel.fromPath(params.gisaid_fasta) ch_gisaid_metadata = Channel.fromPath(params.gisaid_metadata) diff --git a/workflows/resources/AAs.csv b/workflows/resources/AAs.csv new file mode 100644 index 0000000..6bab385 --- /dev/null +++ b/workflows/resources/AAs.csv @@ -0,0 +1,10 @@ +t1001i,3266 
+p323l,14407 +a222v,22226 +n439k,22877 +y453f,22919 +e484k,23012 +n501y,23063 +d614g,23402 +p681h,23603 +q27stop,27972 diff --git a/workflows/resources/MN908947.fa b/workflows/resources/MN908947.fa new file mode 100644 index 0000000..e1cfd92 --- /dev/null +++ b/workflows/resources/MN908947.fa @@ -0,0 +1,429 @@ +>MN908947.3 +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA +CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC +TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG +TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC +CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC +GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG +CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT +GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT +TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA +GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG +TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG +CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG +TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG +CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA +ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC +CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA +GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT +ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG +GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG +CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA +CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA 
+ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT +TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG +GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG +TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC +GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG +ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG +GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT +AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT +AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA +GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC +TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT +AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA +GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT +ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA +GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA +ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA +TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG +AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT +TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA +CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC +AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA +GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA 
+CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG +TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT +GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT +TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA +TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT +GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC +TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA +GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG +ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT +GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT +GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA +TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC +AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA +CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA +TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT +TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG +AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA +ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC +CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA +AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT +AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG +ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA +TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA +ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA 
+ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT +GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT +GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC +ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT +GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG +ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG +TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG +TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC +CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA +CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC +CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG +TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA +ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA +AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA +TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT +ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG +CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA +TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG +CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA +TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC +TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG +GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT +TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA +TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT 
+ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC +TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC +TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT +GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG +GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT +GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA +GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA +TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC +AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT +GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT +AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT +AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG +AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT +TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT +ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG +GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG +AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG +CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT +TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA +ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA +ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC +ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC +ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT +TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT +GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC 
+TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC +AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA +GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG +GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA +CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC +CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT +ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT +CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG +TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG +CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC +GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC +TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC +TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT +GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG +ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG +ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG +TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT +ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG +TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT +GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT +TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA +ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG +CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA +TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA +ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC 
+AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC +TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA +TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT +AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG +ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT +CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG +TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC +TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG +GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA +GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC +TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA +AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA +GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA +GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA +TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA +GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG +AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA +TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA +ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT +TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG +TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT +GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA +CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT +TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT 
+ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT +ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT +GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT +GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG +TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA +AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG +TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA +CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA +ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT +GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC +CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA +ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC +ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG +ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT +AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA +GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG +TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT +TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA +AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG +ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG +ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG +CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC +AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT +GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC +AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG 
+ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT +AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC +TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC +TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG +GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG +TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT +AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA +GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC +AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC +ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC +AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT +GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA +TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC +AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC +GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA +TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA +CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT +AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT +ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT +GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT +GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA +TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT +GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG +AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT 
+TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC +GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT +AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC +TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG +GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT +GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT +AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT +TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC +ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT +TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC +TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA +GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC +AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA +TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC +TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT +GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC +TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA +CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT +CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA +GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG +TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA +TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA +GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC 
+TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT +TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA +TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA +TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT +AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA +AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT +GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG +TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG +AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC +ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC +CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT +AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT +TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG +ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA +GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA +AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA +CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA +CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT +TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC +CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA +AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT +AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA +CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA +CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT 
+ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG +ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA +AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC +GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA +TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC +GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT +TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA +ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT +AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG +GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG +GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA +AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG +GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA +CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA +ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA +GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG +TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC +CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT +TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC +GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC +AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA +TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT +TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA +GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA 
+ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT +GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT +GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG +TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC +ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT +GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG +ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC +TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA +GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT +TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT +TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC +AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC +TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA +GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC +CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA +CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT +CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG +GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC +CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA +ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA +TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC +AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA +TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC +TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG +ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG 
+GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA +AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA +CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA +TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG +TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT +ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA +TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA +GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT +TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA +CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC +TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA +GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG +CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT +ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG +ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA +ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC +GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT +TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT +GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG +GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT +AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT +CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA +CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG +GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA +ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC +CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT 
+TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC +TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT +GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT +CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA +TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC +CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA +AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT +AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC +ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT +GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA +GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG +ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG +CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC +TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA +AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC +CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA +TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT +TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT +GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT +ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG +CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA +GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG +TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA 
+TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT +GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA +AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG +ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG +TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT +GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC +CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT +GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA +AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC +ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT +AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA +ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG +TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG +CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC +AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA +ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG +TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC +TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC +TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT +TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG +CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT +GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC +GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT +TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAA diff --git a/workflows/resources/MN908947.gb b/workflows/resources/MN908947.gb new file mode 100644 index 
0000000..261614d --- /dev/null +++ b/workflows/resources/MN908947.gb @@ -0,0 +1,798 @@ +LOCUS MN908947 29903 bp ss-RNA linear VRL 11-FEB-2020 +DEFINITION Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, + complete genome. +ACCESSION MN908947 +VERSION MN908947.3 +KEYWORDS . +SOURCE Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) + ORGANISM Severe acute respiratory syndrome coronavirus 2 + Viruses; Riboviria; Nidovirales; Cornidovirineae; Coronaviridae; + Orthocoronavirinae; Betacoronavirus; Sarbecovirus. +REFERENCE 1 (bases 1 to 29903) + AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., + Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., + Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C. and Zhang,Y.Z. + TITLE A new coronavirus associated with human respiratory disease in + China + JOURNAL Nature (2020) In press + PUBMED 32015508 + REMARK Publication Status: Available-Online prior to print +REFERENCE 2 (bases 1 to 29903) + AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.-M., Wang,W., Hu,Y., Song,Z.-G., + Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Yuan,M.L., Zhang,Y.-L., + Dai,F.-H., Liu,Y., Wang,Q.-M., Zheng,J.-J., Xu,L., Holmes,E.C. and + Zhang,Y.-Z. + TITLE Direct Submission + JOURNAL Submitted (05-JAN-2020) Shanghai Public Health Clinical Center & + School of Public Health, Fudan University, Shanghai, China +COMMENT On Jan 17, 2020 this sequence version replaced MN908947.2. + + ##Assembly-Data-START## + Assembly Method :: Megahit v. 
V1.1.3 + Sequencing Technology :: Illumina + ##Assembly-Data-END## +FEATURES Location/Qualifiers + source 1..29903 + /organism="Severe acute respiratory syndrome coronavirus + 2" + /mol_type="genomic RNA" + /isolate="Wuhan-Hu-1" + /host="Homo sapiens" + /db_xref="taxon:2697049" + /country="China" + /collection_date="Dec-2019" + 5'UTR 1..265 + gene 266..21555 + /gene="orf1ab" + CDS join(266..13468,13468..21555) + /gene="orf1ab" + /ribosomal_slippage + /note="pp1ab; translated by -1 ribosomal frameshift" + /codon_start=1 + /product="orf1ab polyprotein" + /protein_id="QHD43415.1" + /translation="MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQ + HLKDGTCGLVEVEKGVLPQLEQPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGE + TLGVLVPHVGEIPVAYRKVLLRKNGNKGAGGHSYGADLKSFDLGDELGTDPYEDFQEN + WNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQ + LDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFP + LNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTG + DFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESG + LKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNL + LEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGN + FKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAA + ITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKL + KPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLV + NKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEII + FLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEK + YCALAPNMMVTNNTFTLKGGAPTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEK + CSAYTVELGTEVNEFACVVADAVIKTLQPVSELLTPLGIDLDEWSMATYYLFDESGEF + KLASHMYCSFYPPDEDEEEGDCEEEEFEPSTQYEYGTEDDYQGKPLEFGATSAALQPE + EEQEEDWLDDDSQQTVGQQDGSEDNQTTTIQTIVEVQPQLEMELTPVVQTIEVNSFSG + YLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNKATNNAMQVESD + DYIATNGPLKVGGSCVLSGHNLAKHCLHVVGPNVNKGEDIQLLKSAYENFNQHEVLLA + PLLSAGIFGADPIHSLRVCVDTVRTNVYLAVFDKNLYDKLVSSFLEMKSEKQVEQKIA + EIPKEEVKPFITESKPSVEQRKQDDKKIKACVEEVTTTLEETKFLTENLLLYIDINGN + LHPDSATLVSDIDITFLKKDAPYIVGDVVQEGVLTAVVIPTKKAGGTTEMLAKALRKV + 
PTDNYITTYPGQGLNGYTVEEAKTVLKKCKSAFYILPSIISNEKQEILGTVSWNLREM + LAHAEETRKLMPVCVETKAIVSTIQRKYKGIKIQEGVVDYGARFYFYTSKTTVASLIN + TLNDLNETLVTMPLGYVTHGLNLEEAARYMRSLKVPATVSVSSPDAVTAYNGYLTSSS + KTPEEHFIETISLAGSYKDWSYSGQSTQLGIEFLKRGDKSVYYTSNPTTFHLDGEVIT + FDNLKTLLSLREVRTIKVFTTVDNINLHTQVVDMSMTYGQQFGPTYLDGADVTKIKPH + NSHEGKTFYVLPNDDTLRVEAFEYYHTTDPSFLGRYMSALNHTKKWKYPQVNGLTSIK + WADNNCYLATALLTLQQIELKFNPPALQDAYYRARAGEAANFCALILAYCNKTVGELG + DVRETMSYLFQHANLDSCKRVLNVVCKTCGQQQTTLKGVEAVMYMGTLSYEQFKKGVQ + IPCTCGKQATKYLVQQESPFVMMSAPPAQYELKHGTFTCASEYTGNYQCGHYKHITSK + ETLYCIDGALLTKSSEYKGPITDVFYKENSYTTTIKPVTYKLDGVVCTEIDPKLDNYY + KKDNSYFTEQPIDLVPNQPYPNASFDNFKFVCDNIKFADDLNQLTGYKKPASRELKVT + FFPDLNGDVVAIDYKHYTPSFKKGAKLLHKPIVWHVNNATNKATYKPNTWCIRCLWST + KPVETSNSFDVLKSEDAQGMDNLACEDLKPVSEEVVENPTIQKDVLECNVKTTEVVGD + IILKPANNSLKITEEVGHTDLMAAYVDNSSLTIKKPNELSRVLGLKTLATHGLAAVNS + VPWDTIANYAKPFLNKVVSTTTNIVTRCLNRVCTNYMPYFFTLLLQLCTFTRSTNSRI + KASMPTTIAKNTVKSVGKFCLEASFNYLKSPNFSKLINIIIWFLLLSVCLGSLIYSTA + ALGVLMSNLGMPSYCTGYREGYLNSTNVTIATYCTGSIPCSVCLSGLDSLDTYPSLET + IQITISSFKWDLTAFGLVAEWFLAYILFTRFFYVLGLAAIMQLFFSYFAVHFISNSWL + MWLIINLVQMAPISAMVRMYIFFASFYYVWKSYVHVVDGCNSSTCMMCYKRNRATRVE + CTTIVNGVRRSFYVYANGGKGFCKLHNWNCVNCDTFCAGSTFISDEVARDLSLQFKRP + INPTDQSSYIVDSVTVKNGSIHLYFDKAGQKTYERHSLSHFVNLDNLRANNTKGSLPI + NVIVFDGKSKCEESSAKSASVYYSQLMCQPILLLDQALVSDVGDSAEVAVKMFDAYVN + TFSSTFNVPMEKLKTLVATAEAELAKNVSLDNVLSTFISAARQGFVDSDVETKDVVEC + LKLSHQSDIEVTGDSCNNYMLTYNKVENMTPRDLGACIDCSARHINAQVAKSHNIALI + WNVKDFMSLSEQLRKQIRSAAKKNNLPFKLTCATTRQVVNVVTTKIALKGGKIVNNWL + KQLIKVTLVFLFVAAIFYLITPVHVMSKHTDFSSEIIGYKAIDGGVTRDIASTDTCFA + NKHADFDTWFSQRGGSYTNDKACPLIAAVITREVGFVVPGLPGTILRTTNGDFLHFLP + RVFSAVGNICYTPSKLIEYTDFATSACVLAAECTIFKDASGKPVPYCYDTNVLEGSVA + YESLRPDTRYVLMDGSIIQFPNTYLEGSVRVVTTFDSEYCRHGTCERSEAGVCVSTSG + RWVLNNDYYRSLPGVFCGVDAVNLLTNMFTPLIQPIGALDISASIVAGGIVAIVVTCL + AYYFMRFRRAFGEYSHVVAFNTLLFLMSFTVLCLTPVYSFLPGVYSVIYLYLTFYLTN + DVSFLAHIQWMVMFTPLVPFWITIAYIICISTKHFYWFFSNYLKRRVVFNGVSFSTFE + 
EAALCTFLLNKEMYLKLRSDVLLPLTQYNRYLALYNKYKYFSGAMDTTSYREAACCHL + AKALNDFSNSGSDVLYQPPQTSITSAVLQSGFRKMAFPSGKVEGCMVQVTCGTTTLNG + LWLDDVVYCPRHVICTSEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVL + KLKVDTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIKGSFLNGSC + GSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTTITVN + VLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAV + LDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQSAVKRTIKGTHHW + LLLTILTSLLVLVQSTQWSLFFFLYENAFLPFAMGIIAMSAFAMMFVKHKHAFLCLFL + LPSLATVAYFNMVYMPASWVMRIMTWLDMVDTSLSGFKLKDCVMYASAVVLLILMTAR + TVYDDGARRVWTLMNVLTLVYKVYYGNALDQAISMWALIISVTSNYSGVVTTVMFLAR + GIVFMCVEYCPIFFITGNTLQCIMLVYCFLGYFCTCYFGLFCLLNRYFRLTLGVYDYL + VSTQEFRYMNSQGLLPPKNSIDAFKLNIKLLGVGGKPCIKVATVQSKMSDVKCTSVVL + LSVLQQLRVESSSKLWAQCVQLHNDILLAKDTTEAFEKMVSLLSVLLSMQGAVDINKL + CEEMLDNRATLQAIASEFSSLPSYAAFATAQEAYEQAVANGDSEVVLKKLKKSLNVAK + SEFDRDAAMQRKLEKMADQAMTQMYKQARSEDKRAKVTSAMQTMLFTMLRKLDNDALN + NIINNARDGCVPLNIIPLTTAAKLMVVIPDYNTYKNTCDGTTFTYASALWEIQQVVDA + DSKIVQLSEISMDNSPNLAWPLIVTALRANSAVKLQNNELSPVALRQMSCAAGTTQTA + CTDDNALAYYNTTKGGRFVLALLSDLQDLKWARFPKSDGTGTIYTELEPPCRFVTDTP + KGPKVKYLYFIKGLNNLNRGMVLGSLAATVRLQAGNATEVPANSTVLSFCAFAVDAAK + AYKDYLASGGQPITNCVKMLCTHTGTGQAITVTPEANMDQESFGGASCCLYCRCHIDH + PNPKGFCDLKGKYVQIPTTCANDPVGFTLKNTVCTVCGMWKGYGCSCDQLREPMLQSA + DAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFLKTNCCRFQEKD + EDDNLIDSYFVVKRHTFSNYQHEETIYNLLKDCPAVAKHDFFKFRIDGDMVPHISRQR + LTKYTMADLVYALRHFDEGNCDTLKEILVTYNCCDDDYFNKKDWYDFVENPDILRVYA + NLGERVRQALLKTVQFCDAMRNAGIVGVLTLDNQDLNGNWYDFGDFIQTTPGSGVPVV + DSYYSLLMPILTLTRALTAESHVDTDLTKPYIKWDLLKYDFTEERLKLFDRYFKYWDQ + TYHPNCVNCLDDRCILHCANFNVLFSTVFPPTSFGPLVRKIFVDGVPFVVSTGYHFRE + LGVVHNQDVNLHSSRLSFKELLVYAADPAMHAASGNLLLDKRTTCFSVAALTNNVAFQ + TVKPGNFNKDFYDFAVSKGFFKEGSSVELKHFFFAQDGNAAISDYDYYRYNLPTMCDI + RQLLFVVEVVDKYFDCYDGGCINANQVIVNNLDKSAGFPFNKWGKARLYYDSMSYEDQ + DALFAYTKRNVIPTITQMNLKYAISAKNRARTVAGVSICSTMTNRQFHQKLLKSIAAT + RGATVVIGTSKFYGGWHNMLKTVYSDVENPHLMGWDYPKCDRAMPNMLRIMASLVLAR + 
KHTTCCSLSHRFYRLANECAQVLSEMVMCGGSLYVKPGGTSSGDATTAYANSVFNICQ + AVTANVNALLSTDGNKIADKYVRNLQHRLYECLYRNRDVDTDFVNEFYAYLRKHFSMM + ILSDDAVVCFNSTYASQGLVASIKNFKSVLYYQNNVFMSEAKCWTETDLTKGPHEFCS + QHTMLVKQGDDYVYLPYPDPSRILGAGCFVDDIVKTDGTLMIERFVSLAIDAYPLTKH + PNQEYADVFHLYLQYIRKLHDELTGHMLDMYSVMLTNDNTSRYWEPEFYEAMYTPHTV + LQAVGACVLCNSQTSLRCGACIRRPFLCCKCCYDHVISTSHKLVLSVNPYVCNAPGCD + VTDVTQLYLGGMSYYCKSHKPPISFPLCANGQVFGLYKNTCVGSDNVTDFNAIATCDW + TNAGDYILANTCTERLKLFAAETLKATEETFKLSYGIATVREVLSDRELHLSWEVGKP + RPPLNRNYVFTGYRVTKNSKVQIGEYTFEKGDYGDAVVYRGTTTYKLNVGDYFVLTSH + TVMPLSAPTLVPQEHYVRITGLYPTLNISDEFSSNVANYQKVGMQKYSTLQGPPGTGK + SHFAIGLALYYPSARIVYTACSHAAVDALCEKALKYLPIDKCSRIIPARARVECFDKF + KVNSTLEQYVFCTVNALPETTADIVVFDEISMATNYDLSVVNARLRAKHYVYIGDPAQ + LPAPRTLLTKGTLEPEYFNSVCRLMKTIGPDMFLGTCRRCPAEIVDTVSALVYDNKLK + AHKDKSAQCFKMFYKGVITHDVSSAINRPQIGVVREFLTRNPAWRKAVFISPYNSQNA + VASKILGLPTQTVDSSQGSEYDYVIFTQTTETAHSCNVNRFNVAITRAKVGILCIMSD + RDLYDKLQFTSLEIPRRNVATLQAENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKT + EGLCVDIPGIPKDMTYRRLISMMGFKMNYQVNGYPNMFITREEAIRHVRAWIGFDVEG + CHATREAVGTNLPLQLGFSTGVNLVAVPTGYVDTPNNTDFSRVSAKPPPGDQFKHLIP + LMYKGLPWNVVRIKIVQMLSDTLKNLSDRVVFVLWAHGFELTSMKYFVKIGPERTCCL + CDRRATCFSTASDTYACWHHSIGFDYVYNPFMIDVQQWGFTGNLQSNHDLYCQVHGNA + HVASCDAIMTRCLAVHECFVKRVDWTIEYPIIGDELKINAACRKVQHMVVKAALLADK + FPVLHDIGNPKAIKCVPQADVEWKFYDAQPCSDKAYKIEELFYSYATHSDKFTDGVCL + FWNCNVDRYPANSIVCRFDTRVLSNLNLPGCDGGSLYVNKHAFHTPAFDKSAFVNLKQ + LPFFYYSDSPCESHGKQVVSDIDYVPLKSATCITRCNLGGAVCRHHANEYRLYLDAYN + MMISAGFSLWVYKQFDTYNLWNTFTRLQSLENVAFNVVNKGHFDGQQGEVPVSIINNT + VYTKVDGVDVELFENKTTLPVNVAFELWAKRNIKPVPEVKILNNLGVDIAANTVIWDY + KRDAPAHISTIGVCSMTDIAKKPTETICAPLTVFFDGRVDGQVDLFRNARNGVLITEG + SVKGLQPSVGPKQASLNGVTLIGEAVKTQFNYYKKVDGVVQQLPETYFTQSRNLQEFK + PRSQMEIDFLELAMDEFIERYKLEGYAFEHIVYGDFSHSQLGGLHLLIGLAKRFKESP + FELEDFIPMDSTVKNYFITDAQTGSSKCVCSVIDLLLDDFVEIIKSQDLSVVSKVVKV + TIDYTEISFMLWCKDGHVETFYPKLQSSQAWQPGVAMPNLYKMQRMLLEKCDLQNYGD + SATLPKGIMMNVAKYTQLCQYLNTLTLAVPYNMRVIHFGAGSDKGVAPGTAVLRQWLP + 
TGTLLVDSDLNDFVSDADSTLIGDCATVHTANKWDLIISDMYDPKTKNVTKENDSKEG + FFTYICGFIQQKLALGGSVAIKITEHSWNADLYKLMGHFAWWTAFVTNVNASSSEAFL + IGCNYLGKPREQIDGYVMHANYIFWRNTNPIQLSSYSLFDMSKFPLKLRGTAVMSLKE + GQINDMILSLLSKGRLIIRENNRVVISSDVLVNN" + gene 21563..25384 + /gene="S" + CDS 21563..25384 + /gene="S" + /note="structural protein" + /codon_start=1 + /product="surface glycoprotein" + /protein_id="QHD43416.1" + /translation="MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFR + SSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIR + GWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVY + SSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQ + GFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFL + LKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITN + LCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCF + TNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYN + YLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPY + RVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFG + RDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAI + HADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPR + RARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTM + YICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFG + GFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFN + GLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQN + VLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGA + ISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMS + ECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAH + FPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELD + SFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELG + KYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSE + PVLKGVKLHYT" + gene 25393..26220 + /gene="ORF3a" + CDS 25393..26220 + /gene="ORF3a" + /codon_start=1 + /product="ORF3a protein" + /protein_id="QHD43417.1" + /translation="MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFG + 
WLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLE + APFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPY + NSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQ + LSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL" + gene 26245..26472 + /gene="E" + CDS 26245..26472 + /gene="E" + /note="structural protein; E protein" + /codon_start=1 + /product="envelope protein" + /protein_id="QHD43418.1" + /translation="MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCC + NIVNVSLVKPSFYVYSRVKNLNSSRVPDLLV" + gene 26523..27191 + /gene="M" + CDS 26523..27191 + /gene="M" + /note="structural protein" + /codon_start=1 + /product="membrane glycoprotein" + /protein_id="QHD43419.1" + /translation="MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNR + FLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRL + FARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCD + IKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIA + LLVQ" + gene 27202..27387 + /gene="ORF6" + CDS 27202..27387 + /gene="ORF6" + /codon_start=1 + /product="ORF6 protein" + /protein_id="QHD43420.1" + /translation="MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL + TENKYSQLDEEQPMEID" + gene 27394..27759 + /gene="ORF7a" + CDS 27394..27759 + /gene="ORF7a" + /codon_start=1 + /product="ORF7a protein" + /protein_id="QHD43421.1" + /translation="MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS + PFHPLADNKFALTCFSTQFAFACPDGVKHVYQLRARSVSPKLFIRQEEVQELYSPIFL + IVAAIVFITLCFTLKRKTE" + gene 27894..28259 + /gene="ORF8" + CDS 27894..28259 + /gene="ORF8" + /codon_start=1 + /product="ORF8 protein" + /protein_id="QHD43422.1" + /translation="MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSK + WYIRVGARKSAPLIELCVDEAGSKSPIQYIDIGNYTVSCLPFTINCQEPKLGSLVVRC + SFYEDFLEYHDVRVVLDFI" + gene 28274..29533 + /gene="N" + CDS 28274..29533 + /gene="N" + /note="structural protein" + /codon_start=1 + /product="nucleocapsid phosphoprotein" + /protein_id="QHD43423.2" + /translation="MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQG + 
LPNNTASWFTALTQHGKEDLKFPRGQGVPINTNSSPDDQIGYYRRATRRIRGGDGKMK + DLSPRWYFYYLGTGPEAGLPYGANKDGIIWVATEGALNTPKDHIGTRNPANNAAIVLQ + LPQGTTLPKGFYAEGSRGGSQASSRSSSRSRNSSRNSTPGSSRGTSPARMAGNGGDAA + LALLLLDRLNQLESKMSGKGQQQQGQTVTKKSAAEASKKPRQKRTATKAYNVTQAFGR + RGPEQTQGNFGDQELIRQGTDYKHWPQIAQFAPSASAFFGMSRIGMEVTPSGTWLTYT + GAIKLDDKDPNFKDQVILLNKHIDAYKTFPPTEPKKDKKKKADETQALPQRQKKQQTV + TLLPAADLDDFSKQLQQSMSSADSTQA" + gene 29558..29674 + /gene="ORF10" + CDS 29558..29674 + /gene="ORF10" + /codon_start=1 + /product="ORF10 protein" + /protein_id="QHI42199.1" + /translation="MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT" + 3'UTR 29675..29903 +ORIGIN + 1 attaaaggtt tataccttcc caggtaacaa accaaccaac tttcgatctc ttgtagatct + 61 gttctctaaa cgaactttaa aatctgtgtg gctgtcactc ggctgcatgc ttagtgcact + 121 cacgcagtat aattaataac taattactgt cgttgacagg acacgagtaa ctcgtctatc + 181 ttctgcaggc tgcttacggt ttcgtccgtg ttgcagccga tcatcagcac atctaggttt + 241 cgtccgggtg tgaccgaaag gtaagatgga gagccttgtc cctggtttca acgagaaaac + 301 acacgtccaa ctcagtttgc ctgttttaca ggttcgcgac gtgctcgtac gtggctttgg + 361 agactccgtg gaggaggtct tatcagaggc acgtcaacat cttaaagatg gcacttgtgg + 421 cttagtagaa gttgaaaaag gcgttttgcc tcaacttgaa cagccctatg tgttcatcaa + 481 acgttcggat gctcgaactg cacctcatgg tcatgttatg gttgagctgg tagcagaact + 541 cgaaggcatt cagtacggtc gtagtggtga gacacttggt gtccttgtcc ctcatgtggg + 601 cgaaatacca gtggcttacc gcaaggttct tcttcgtaag aacggtaata aaggagctgg + 661 tggccatagt tacggcgccg atctaaagtc atttgactta ggcgacgagc ttggcactga + 721 tccttatgaa gattttcaag aaaactggaa cactaaacat agcagtggtg ttacccgtga + 781 actcatgcgt gagcttaacg gaggggcata cactcgctat gtcgataaca acttctgtgg + 841 ccctgatggc taccctcttg agtgcattaa agaccttcta gcacgtgctg gtaaagcttc + 901 atgcactttg tccgaacaac tggactttat tgacactaag aggggtgtat actgctgccg + 961 tgaacatgag catgaaattg cttggtacac ggaacgttct gaaaagagct atgaattgca + 1021 gacacctttt gaaattaaat tggcaaagaa atttgacacc ttcaatgggg aatgtccaaa + 1081 ttttgtattt cccttaaatt ccataatcaa gactattcaa ccaagggttg aaaagaaaaa + 1141 
gcttgatggc tttatgggta gaattcgatc tgtctatcca gttgcgtcac caaatgaatg + 1201 caaccaaatg tgcctttcaa ctctcatgaa gtgtgatcat tgtggtgaaa cttcatggca + 1261 gacgggcgat tttgttaaag ccacttgcga attttgtggc actgagaatt tgactaaaga + 1321 aggtgccact acttgtggtt acttacccca aaatgctgtt gttaaaattt attgtccagc + 1381 atgtcacaat tcagaagtag gacctgagca tagtcttgcc gaataccata atgaatctgg + 1441 cttgaaaacc attcttcgta agggtggtcg cactattgcc tttggaggct gtgtgttctc + 1501 ttatgttggt tgccataaca agtgtgccta ttgggttcca cgtgctagcg ctaacatagg + 1561 ttgtaaccat acaggtgttg ttggagaagg ttccgaaggt cttaatgaca accttcttga + 1621 aatactccaa aaagagaaag tcaacatcaa tattgttggt gactttaaac ttaatgaaga + 1681 gatcgccatt attttggcat ctttttctgc ttccacaagt gcttttgtgg aaactgtgaa + 1741 aggtttggat tataaagcat tcaaacaaat tgttgaatcc tgtggtaatt ttaaagttac + 1801 aaaaggaaaa gctaaaaaag gtgcctggaa tattggtgaa cagaaatcaa tactgagtcc + 1861 tctttatgca tttgcatcag aggctgctcg tgttgtacga tcaattttct cccgcactct + 1921 tgaaactgct caaaattctg tgcgtgtttt acagaaggcc gctataacaa tactagatgg + 1981 aatttcacag tattcactga gactcattga tgctatgatg ttcacatctg atttggctac + 2041 taacaatcta gttgtaatgg cctacattac aggtggtgtt gttcagttga cttcgcagtg + 2101 gctaactaac atctttggca ctgtttatga aaaactcaaa cccgtccttg attggcttga + 2161 agagaagttt aaggaaggtg tagagtttct tagagacggt tgggaaattg ttaaatttat + 2221 ctcaacctgt gcttgtgaaa ttgtcggtgg acaaattgtc acctgtgcaa aggaaattaa + 2281 ggagagtgtt cagacattct ttaagcttgt aaataaattt ttggctttgt gtgctgactc + 2341 tatcattatt ggtggagcta aacttaaagc cttgaattta ggtgaaacat ttgtcacgca + 2401 ctcaaaggga ttgtacagaa agtgtgttaa atccagagaa gaaactggcc tactcatgcc + 2461 tctaaaagcc ccaaaagaaa ttatcttctt agagggagaa acacttccca cagaagtgtt + 2521 aacagaggaa gttgtcttga aaactggtga tttacaacca ttagaacaac ctactagtga + 2581 agctgttgaa gctccattgg ttggtacacc agtttgtatt aacgggctta tgttgctcga + 2641 aatcaaagac acagaaaagt actgtgccct tgcacctaat atgatggtaa caaacaatac + 2701 cttcacactc aaaggcggtg caccaacaaa ggttactttt ggtgatgaca ctgtgataga + 2761 agtgcaaggt tacaagagtg 
tgaatatcac ttttgaactt gatgaaagga ttgataaagt + 2821 acttaatgag aagtgctctg cctatacagt tgaactcggt acagaagtaa atgagttcgc + 2881 ctgtgttgtg gcagatgctg tcataaaaac tttgcaacca gtatctgaat tacttacacc + 2941 actgggcatt gatttagatg agtggagtat ggctacatac tacttatttg atgagtctgg + 3001 tgagtttaaa ttggcttcac atatgtattg ttctttctac cctccagatg aggatgaaga + 3061 agaaggtgat tgtgaagaag aagagtttga gccatcaact caatatgagt atggtactga + 3121 agatgattac caaggtaaac ctttggaatt tggtgccact tctgctgctc ttcaacctga + 3181 agaagagcaa gaagaagatt ggttagatga tgatagtcaa caaactgttg gtcaacaaga + 3241 cggcagtgag gacaatcaga caactactat tcaaacaatt gttgaggttc aacctcaatt + 3301 agagatggaa cttacaccag ttgttcagac tattgaagtg aatagtttta gtggttattt + 3361 aaaacttact gacaatgtat acattaaaaa tgcagacatt gtggaagaag ctaaaaaggt + 3421 aaaaccaaca gtggttgtta atgcagccaa tgtttacctt aaacatggag gaggtgttgc + 3481 aggagcctta aataaggcta ctaacaatgc catgcaagtt gaatctgatg attacatagc + 3541 tactaatgga ccacttaaag tgggtggtag ttgtgtttta agcggacaca atcttgctaa + 3601 acactgtctt catgttgtcg gcccaaatgt taacaaaggt gaagacattc aacttcttaa + 3661 gagtgcttat gaaaatttta atcagcacga agttctactt gcaccattat tatcagctgg + 3721 tatttttggt gctgacccta tacattcttt aagagtttgt gtagatactg ttcgcacaaa + 3781 tgtctactta gctgtctttg ataaaaatct ctatgacaaa cttgtttcaa gctttttgga + 3841 aatgaagagt gaaaagcaag ttgaacaaaa gatcgctgag attcctaaag aggaagttaa + 3901 gccatttata actgaaagta aaccttcagt tgaacagaga aaacaagatg ataagaaaat + 3961 caaagcttgt gttgaagaag ttacaacaac tctggaagaa actaagttcc tcacagaaaa + 4021 cttgttactt tatattgaca ttaatggcaa tcttcatcca gattctgcca ctcttgttag + 4081 tgacattgac atcactttct taaagaaaga tgctccatat atagtgggtg atgttgttca + 4141 agagggtgtt ttaactgctg tggttatacc tactaaaaag gctggtggca ctactgaaat + 4201 gctagcgaaa gctttgagaa aagtgccaac agacaattat ataaccactt acccgggtca + 4261 gggtttaaat ggttacactg tagaggaggc aaagacagtg cttaaaaagt gtaaaagtgc + 4321 cttttacatt ctaccatcta ttatctctaa tgagaagcaa gaaattcttg gaactgtttc + 4381 ttggaatttg cgagaaatgc ttgcacatgc agaagaaaca 
cgcaaattaa tgcctgtctg + 4441 tgtggaaact aaagccatag tttcaactat acagcgtaaa tataagggta ttaaaataca + 4501 agagggtgtg gttgattatg gtgctagatt ttacttttac accagtaaaa caactgtagc + 4561 gtcacttatc aacacactta acgatctaaa tgaaactctt gttacaatgc cacttggcta + 4621 tgtaacacat ggcttaaatt tggaagaagc tgctcggtat atgagatctc tcaaagtgcc + 4681 agctacagtt tctgtttctt cacctgatgc tgttacagcg tataatggtt atcttacttc + 4741 ttcttctaaa acacctgaag aacattttat tgaaaccatc tcacttgctg gttcctataa + 4801 agattggtcc tattctggac aatctacaca actaggtata gaatttctta agagaggtga + 4861 taaaagtgta tattacacta gtaatcctac cacattccac ctagatggtg aagttatcac + 4921 ctttgacaat cttaagacac ttctttcttt gagagaagtg aggactatta aggtgtttac + 4981 aacagtagac aacattaacc tccacacgca agttgtggac atgtcaatga catatggaca + 5041 acagtttggt ccaacttatt tggatggagc tgatgttact aaaataaaac ctcataattc + 5101 acatgaaggt aaaacatttt atgttttacc taatgatgac actctacgtg ttgaggcttt + 5161 tgagtactac cacacaactg atcctagttt tctgggtagg tacatgtcag cattaaatca + 5221 cactaaaaag tggaaatacc cacaagttaa tggtttaact tctattaaat gggcagataa + 5281 caactgttat cttgccactg cattgttaac actccaacaa atagagttga agtttaatcc + 5341 acctgctcta caagatgctt attacagagc aagggctggt gaagctgcta acttttgtgc + 5401 acttatctta gcctactgta ataagacagt aggtgagtta ggtgatgtta gagaaacaat + 5461 gagttacttg tttcaacatg ccaatttaga ttcttgcaaa agagtcttga acgtggtgtg + 5521 taaaacttgt ggacaacagc agacaaccct taagggtgta gaagctgtta tgtacatggg + 5581 cacactttct tatgaacaat ttaagaaagg tgttcagata ccttgtacgt gtggtaaaca + 5641 agctacaaaa tatctagtac aacaggagtc accttttgtt atgatgtcag caccacctgc + 5701 tcagtatgaa cttaagcatg gtacatttac ttgtgctagt gagtacactg gtaattacca + 5761 gtgtggtcac tataaacata taacttctaa agaaactttg tattgcatag acggtgcttt + 5821 acttacaaag tcctcagaat acaaaggtcc tattacggat gttttctaca aagaaaacag + 5881 ttacacaaca accataaaac cagttactta taaattggat ggtgttgttt gtacagaaat + 5941 tgaccctaag ttggacaatt attataagaa agacaattct tatttcacag agcaaccaat + 6001 tgatcttgta ccaaaccaac catatccaaa cgcaagcttc gataatttta agtttgtatg + 6061 
tgataatatc aaatttgctg atgatttaaa ccagttaact ggttataaga aacctgcttc + 6121 aagagagctt aaagttacat ttttccctga cttaaatggt gatgtggtgg ctattgatta + 6181 taaacactac acaccctctt ttaagaaagg agctaaattg ttacataaac ctattgtttg + 6241 gcatgttaac aatgcaacta ataaagccac gtataaacca aatacctggt gtatacgttg + 6301 tctttggagc acaaaaccag ttgaaacatc aaattcgttt gatgtactga agtcagagga + 6361 cgcgcaggga atggataatc ttgcctgcga agatctaaaa ccagtctctg aagaagtagt + 6421 ggaaaatcct accatacaga aagacgttct tgagtgtaat gtgaaaacta ccgaagttgt + 6481 aggagacatt atacttaaac cagcaaataa tagtttaaaa attacagaag aggttggcca + 6541 cacagatcta atggctgctt atgtagacaa ttctagtctt actattaaga aacctaatga + 6601 attatctaga gtattaggtt tgaaaaccct tgctactcat ggtttagctg ctgttaatag + 6661 tgtcccttgg gatactatag ctaattatgc taagcctttt cttaacaaag ttgttagtac + 6721 aactactaac atagttacac ggtgtttaaa ccgtgtttgt actaattata tgccttattt + 6781 ctttacttta ttgctacaat tgtgtacttt tactagaagt acaaattcta gaattaaagc + 6841 atctatgccg actactatag caaagaatac tgttaagagt gtcggtaaat tttgtctaga + 6901 ggcttcattt aattatttga agtcacctaa tttttctaaa ctgataaata ttataatttg + 6961 gtttttacta ttaagtgttt gcctaggttc tttaatctac tcaaccgctg ctttaggtgt + 7021 tttaatgtct aatttaggca tgccttctta ctgtactggt tacagagaag gctatttgaa + 7081 ctctactaat gtcactattg caacctactg tactggttct ataccttgta gtgtttgtct + 7141 tagtggttta gattctttag acacctatcc ttctttagaa actatacaaa ttaccatttc + 7201 atcttttaaa tgggatttaa ctgcttttgg cttagttgca gagtggtttt tggcatatat + 7261 tcttttcact aggtttttct atgtacttgg attggctgca atcatgcaat tgtttttcag + 7321 ctattttgca gtacatttta ttagtaattc ttggcttatg tggttaataa ttaatcttgt + 7381 acaaatggcc ccgatttcag ctatggttag aatgtacatc ttctttgcat cattttatta + 7441 tgtatggaaa agttatgtgc atgttgtaga cggttgtaat tcatcaactt gtatgatgtg + 7501 ttacaaacgt aatagagcaa caagagtcga atgtacaact attgttaatg gtgttagaag + 7561 gtccttttat gtctatgcta atggaggtaa aggcttttgc aaactacaca attggaattg + 7621 tgttaattgt gatacattct gtgctggtag tacatttatt agtgatgaag ttgcgagaga + 7681 cttgtcacta cagtttaaaa 
gaccaataaa tcctactgac cagtcttctt acatcgttga + 7741 tagtgttaca gtgaagaatg gttccatcca tctttacttt gataaagctg gtcaaaagac + 7801 ttatgaaaga cattctctct ctcattttgt taacttagac aacctgagag ctaataacac + 7861 taaaggttca ttgcctatta atgttatagt ttttgatggt aaatcaaaat gtgaagaatc + 7921 atctgcaaaa tcagcgtctg tttactacag tcagcttatg tgtcaaccta tactgttact + 7981 agatcaggca ttagtgtctg atgttggtga tagtgcggaa gttgcagtta aaatgtttga + 8041 tgcttacgtt aatacgtttt catcaacttt taacgtacca atggaaaaac tcaaaacact + 8101 agttgcaact gcagaagctg aacttgcaaa gaatgtgtcc ttagacaatg tcttatctac + 8161 ttttatttca gcagctcggc aagggtttgt tgattcagat gtagaaacta aagatgttgt + 8221 tgaatgtctt aaattgtcac atcaatctga catagaagtt actggcgata gttgtaataa + 8281 ctatatgctc acctataaca aagttgaaaa catgacaccc cgtgaccttg gtgcttgtat + 8341 tgactgtagt gcgcgtcata ttaatgcgca ggtagcaaaa agtcacaaca ttgctttgat + 8401 atggaacgtt aaagatttca tgtcattgtc tgaacaacta cgaaaacaaa tacgtagtgc + 8461 tgctaaaaag aataacttac cttttaagtt gacatgtgca actactagac aagttgttaa + 8521 tgttgtaaca acaaagatag cacttaaggg tggtaaaatt gttaataatt ggttgaagca + 8581 gttaattaaa gttacacttg tgttcctttt tgttgctgct attttctatt taataacacc + 8641 tgttcatgtc atgtctaaac atactgactt ttcaagtgaa atcataggat acaaggctat + 8701 tgatggtggt gtcactcgtg acatagcatc tacagatact tgttttgcta acaaacatgc + 8761 tgattttgac acatggttta gccagcgtgg tggtagttat actaatgaca aagcttgccc + 8821 attgattgct gcagtcataa caagagaagt gggttttgtc gtgcctggtt tgcctggcac + 8881 gatattacgc acaactaatg gtgacttttt gcatttctta cctagagttt ttagtgcagt + 8941 tggtaacatc tgttacacac catcaaaact tatagagtac actgactttg caacatcagc + 9001 ttgtgttttg gctgctgaat gtacaatttt taaagatgct tctggtaagc cagtaccata + 9061 ttgttatgat accaatgtac tagaaggttc tgttgcttat gaaagtttac gccctgacac + 9121 acgttatgtg ctcatggatg gctctattat tcaatttcct aacacctacc ttgaaggttc + 9181 tgttagagtg gtaacaactt ttgattctga gtactgtagg cacggcactt gtgaaagatc + 9241 agaagctggt gtttgtgtat ctactagtgg tagatgggta cttaacaatg attattacag + 9301 atctttacca ggagttttct gtggtgtaga tgctgtaaat 
ttacttacta atatgtttac + 9361 accactaatt caacctattg gtgctttgga catatcagca tctatagtag ctggtggtat + 9421 tgtagctatc gtagtaacat gccttgccta ctattttatg aggtttagaa gagcttttgg + 9481 tgaatacagt catgtagttg cctttaatac tttactattc cttatgtcat tcactgtact + 9541 ctgtttaaca ccagtttact cattcttacc tggtgtttat tctgttattt acttgtactt + 9601 gacattttat cttactaatg atgtttcttt tttagcacat attcagtgga tggttatgtt + 9661 cacaccttta gtacctttct ggataacaat tgcttatatc atttgtattt ccacaaagca + 9721 tttctattgg ttctttagta attacctaaa gagacgtgta gtctttaatg gtgtttcctt + 9781 tagtactttt gaagaagctg cgctgtgcac ctttttgtta aataaagaaa tgtatctaaa + 9841 gttgcgtagt gatgtgctat tacctcttac gcaatataat agatacttag ctctttataa + 9901 taagtacaag tattttagtg gagcaatgga tacaactagc tacagagaag ctgcttgttg + 9961 tcatctcgca aaggctctca atgacttcag taactcaggt tctgatgttc tttaccaacc + 10021 accacaaacc tctatcacct cagctgtttt gcagagtggt tttagaaaaa tggcattccc + 10081 atctggtaaa gttgagggtt gtatggtaca agtaacttgt ggtacaacta cacttaacgg + 10141 tctttggctt gatgacgtag tttactgtcc aagacatgtg atctgcacct ctgaagacat + 10201 gcttaaccct aattatgaag atttactcat tcgtaagtct aatcataatt tcttggtaca + 10261 ggctggtaat gttcaactca gggttattgg acattctatg caaaattgtg tacttaagct + 10321 taaggttgat acagccaatc ctaagacacc taagtataag tttgttcgca ttcaaccagg + 10381 acagactttt tcagtgttag cttgttacaa tggttcacca tctggtgttt accaatgtgc + 10441 tatgaggccc aatttcacta ttaagggttc attccttaat ggttcatgtg gtagtgttgg + 10501 ttttaacata gattatgact gtgtctcttt ttgttacatg caccatatgg aattaccaac + 10561 tggagttcat gctggcacag acttagaagg taacttttat ggaccttttg ttgacaggca + 10621 aacagcacaa gcagctggta cggacacaac tattacagtt aatgttttag cttggttgta + 10681 cgctgctgtt ataaatggag acaggtggtt tctcaatcga tttaccacaa ctcttaatga + 10741 ctttaacctt gtggctatga agtacaatta tgaacctcta acacaagacc atgttgacat + 10801 actaggacct ctttctgctc aaactggaat tgccgtttta gatatgtgtg cttcattaaa + 10861 agaattactg caaaatggta tgaatggacg taccatattg ggtagtgctt tattagaaga + 10921 tgaatttaca ccttttgatg ttgttagaca atgctcaggt gttactttcc 
aaagtgcagt + 10981 gaaaagaaca atcaagggta cacaccactg gttgttactc acaattttga cttcactttt + 11041 agttttagtc cagagtactc aatggtcttt gttctttttt ttgtatgaaa atgccttttt + 11101 accttttgct atgggtatta ttgctatgtc tgcttttgca atgatgtttg tcaaacataa + 11161 gcatgcattt ctctgtttgt ttttgttacc ttctcttgcc actgtagctt attttaatat + 11221 ggtctatatg cctgctagtt gggtgatgcg tattatgaca tggttggata tggttgatac + 11281 tagtttgtct ggttttaagc taaaagactg tgttatgtat gcatcagctg tagtgttact + 11341 aatccttatg acagcaagaa ctgtgtatga tgatggtgct aggagagtgt ggacacttat + 11401 gaatgtcttg acactcgttt ataaagttta ttatggtaat gctttagatc aagccatttc + 11461 catgtgggct cttataatct ctgttacttc taactactca ggtgtagtta caactgtcat + 11521 gtttttggcc agaggtattg tttttatgtg tgttgagtat tgccctattt tcttcataac + 11581 tggtaataca cttcagtgta taatgctagt ttattgtttc ttaggctatt tttgtacttg + 11641 ttactttggc ctcttttgtt tactcaaccg ctactttaga ctgactcttg gtgtttatga + 11701 ttacttagtt tctacacagg agtttagata tatgaattca cagggactac tcccacccaa + 11761 gaatagcata gatgccttca aactcaacat taaattgttg ggtgttggtg gcaaaccttg + 11821 tatcaaagta gccactgtac agtctaaaat gtcagatgta aagtgcacat cagtagtctt + 11881 actctcagtt ttgcaacaac tcagagtaga atcatcatct aaattgtggg ctcaatgtgt + 11941 ccagttacac aatgacattc tcttagctaa agatactact gaagcctttg aaaaaatggt + 12001 ttcactactt tctgttttgc tttccatgca gggtgctgta gacataaaca agctttgtga + 12061 agaaatgctg gacaacaggg caaccttaca agctatagcc tcagagttta gttcccttcc + 12121 atcatatgca gcttttgcta ctgctcaaga agcttatgag caggctgttg ctaatggtga + 12181 ttctgaagtt gttcttaaaa agttgaagaa gtctttgaat gtggctaaat ctgaatttga + 12241 ccgtgatgca gccatgcaac gtaagttgga aaagatggct gatcaagcta tgacccaaat + 12301 gtataaacag gctagatctg aggacaagag ggcaaaagtt actagtgcta tgcagacaat + 12361 gcttttcact atgcttagaa agttggataa tgatgcactc aacaacatta tcaacaatgc + 12421 aagagatggt tgtgttccct tgaacataat acctcttaca acagcagcca aactaatggt + 12481 tgtcatacca gactataaca catataaaaa tacgtgtgat ggtacaacat ttacttatgc + 12541 atcagcattg tgggaaatcc aacaggttgt agatgcagat agtaaaattg 
ttcaacttag + 12601 tgaaattagt atggacaatt cacctaattt agcatggcct cttattgtaa cagctttaag + 12661 ggccaattct gctgtcaaat tacagaataa tgagcttagt cctgttgcac tacgacagat + 12721 gtcttgtgct gccggtacta cacaaactgc ttgcactgat gacaatgcgt tagcttacta + 12781 caacacaaca aagggaggta ggtttgtact tgcactgtta tccgatttac aggatttgaa + 12841 atgggctaga ttccctaaga gtgatggaac tggtactatc tatacagaac tggaaccacc + 12901 ttgtaggttt gttacagaca cacctaaagg tcctaaagtg aagtatttat actttattaa + 12961 aggattaaac aacctaaata gaggtatggt acttggtagt ttagctgcca cagtacgtct + 13021 acaagctggt aatgcaacag aagtgcctgc caattcaact gtattatctt tctgtgcttt + 13081 tgctgtagat gctgctaaag cttacaaaga ttatctagct agtgggggac aaccaatcac + 13141 taattgtgtt aagatgttgt gtacacacac tggtactggt caggcaataa cagttacacc + 13201 ggaagccaat atggatcaag aatcctttgg tggtgcatcg tgttgtctgt actgccgttg + 13261 ccacatagat catccaaatc ctaaaggatt ttgtgactta aaaggtaagt atgtacaaat + 13321 acctacaact tgtgctaatg accctgtggg ttttacactt aaaaacacag tctgtaccgt + 13381 ctgcggtatg tggaaaggtt atggctgtag ttgtgatcaa ctccgcgaac ccatgcttca + 13441 gtcagctgat gcacaatcgt ttttaaacgg gtttgcggtg taagtgcagc ccgtcttaca + 13501 ccgtgcggca caggcactag tactgatgtc gtatacaggg cttttgacat ctacaatgat + 13561 aaagtagctg gttttgctaa attcctaaaa actaattgtt gtcgcttcca agaaaaggac + 13621 gaagatgaca atttaattga ttcttacttt gtagttaaga gacacacttt ctctaactac + 13681 caacatgaag aaacaattta taatttactt aaggattgtc cagctgttgc taaacatgac + 13741 ttctttaagt ttagaataga cggtgacatg gtaccacata tatcacgtca acgtcttact + 13801 aaatacacaa tggcagacct cgtctatgct ttaaggcatt ttgatgaagg taattgtgac + 13861 acattaaaag aaatacttgt cacatacaat tgttgtgatg atgattattt caataaaaag + 13921 gactggtatg attttgtaga aaacccagat atattacgcg tatacgccaa cttaggtgaa + 13981 cgtgtacgcc aagctttgtt aaaaacagta caattctgtg atgccatgcg aaatgctggt + 14041 attgttggtg tactgacatt agataatcaa gatctcaatg gtaactggta tgatttcggt + 14101 gatttcatac aaaccacgcc aggtagtgga gttcctgttg tagattctta ttattcattg + 14161 ttaatgccta tattaacctt gaccagggct ttaactgcag agtcacatgt 
tgacactgac + 14221 ttaacaaagc cttacattaa gtgggatttg ttaaaatatg acttcacgga agagaggtta + 14281 aaactctttg accgttattt taaatattgg gatcagacat accacccaaa ttgtgttaac + 14341 tgtttggatg acagatgcat tctgcattgt gcaaacttta atgttttatt ctctacagtg + 14401 ttcccaccta caagttttgg accactagtg agaaaaatat ttgttgatgg tgttccattt + 14461 gtagtttcaa ctggatacca cttcagagag ctaggtgttg tacataatca ggatgtaaac + 14521 ttacatagct ctagacttag ttttaaggaa ttacttgtgt atgctgctga ccctgctatg + 14581 cacgctgctt ctggtaatct attactagat aaacgcacta cgtgcttttc agtagctgca + 14641 cttactaaca atgttgcttt tcaaactgtc aaacccggta attttaacaa agacttctat + 14701 gactttgctg tgtctaaggg tttctttaag gaaggaagtt ctgttgaatt aaaacacttc + 14761 ttctttgctc aggatggtaa tgctgctatc agcgattatg actactatcg ttataatcta + 14821 ccaacaatgt gtgatatcag acaactacta tttgtagttg aagttgttga taagtacttt + 14881 gattgttacg atggtggctg tattaatgct aaccaagtca tcgtcaacaa cctagacaaa + 14941 tcagctggtt ttccatttaa taaatggggt aaggctagac tttattatga ttcaatgagt + 15001 tatgaggatc aagatgcact tttcgcatat acaaaacgta atgtcatccc tactataact + 15061 caaatgaatc ttaagtatgc cattagtgca aagaatagag ctcgcaccgt agctggtgtc + 15121 tctatctgta gtactatgac caatagacag tttcatcaaa aattattgaa atcaatagcc + 15181 gccactagag gagctactgt agtaattgga acaagcaaat tctatggtgg ttggcacaac + 15241 atgttaaaaa ctgtttatag tgatgtagaa aaccctcacc ttatgggttg ggattatcct + 15301 aaatgtgata gagccatgcc taacatgctt agaattatgg cctcacttgt tcttgctcgc + 15361 aaacatacaa cgtgttgtag cttgtcacac cgtttctata gattagctaa tgagtgtgct + 15421 caagtattga gtgaaatggt catgtgtggc ggttcactat atgttaaacc aggtggaacc + 15481 tcatcaggag atgccacaac tgcttatgct aatagtgttt ttaacatttg tcaagctgtc + 15541 acggccaatg ttaatgcact tttatctact gatggtaaca aaattgccga taagtatgtc + 15601 cgcaatttac aacacagact ttatgagtgt ctctatagaa atagagatgt tgacacagac + 15661 tttgtgaatg agttttacgc atatttgcgt aaacatttct caatgatgat actctctgac + 15721 gatgctgttg tgtgtttcaa tagcacttat gcatctcaag gtctagtggc tagcataaag + 15781 aactttaagt cagttcttta ttatcaaaac aatgttttta tgtctgaagc 
aaaatgttgg + 15841 actgagactg accttactaa aggacctcat gaattttgct ctcaacatac aatgctagtt + 15901 aaacagggtg atgattatgt gtaccttcct tacccagatc catcaagaat cctaggggcc + 15961 ggctgttttg tagatgatat cgtaaaaaca gatggtacac ttatgattga acggttcgtg + 16021 tctttagcta tagatgctta cccacttact aaacatccta atcaggagta tgctgatgtc + 16081 tttcatttgt acttacaata cataagaaag ctacatgatg agttaacagg acacatgtta + 16141 gacatgtatt ctgttatgct tactaatgat aacacttcaa ggtattggga acctgagttt + 16201 tatgaggcta tgtacacacc gcatacagtc ttacaggctg ttggggcttg tgttctttgc + 16261 aattcacaga cttcattaag atgtggtgct tgcatacgta gaccattctt atgttgtaaa + 16321 tgctgttacg accatgtcat atcaacatca cataaattag tcttgtctgt taatccgtat + 16381 gtttgcaatg ctccaggttg tgatgtcaca gatgtgactc aactttactt aggaggtatg + 16441 agctattatt gtaaatcaca taaaccaccc attagttttc cattgtgtgc taatggacaa + 16501 gtttttggtt tatataaaaa tacatgtgtt ggtagcgata atgttactga ctttaatgca + 16561 attgcaacat gtgactggac aaatgctggt gattacattt tagctaacac ctgtactgaa + 16621 agactcaagc tttttgcagc agaaacgctc aaagctactg aggagacatt taaactgtct + 16681 tatggtattg ctactgtacg tgaagtgctg tctgacagag aattacatct ttcatgggaa + 16741 gttggtaaac ctagaccacc acttaaccga aattatgtct ttactggtta tcgtgtaact + 16801 aaaaacagta aagtacaaat aggagagtac acctttgaaa aaggtgacta tggtgatgct + 16861 gttgtttacc gaggtacaac aacttacaaa ttaaatgttg gtgattattt tgtgctgaca + 16921 tcacatacag taatgccatt aagtgcacct acactagtgc cacaagagca ctatgttaga + 16981 attactggct tatacccaac actcaatatc tcagatgagt tttctagcaa tgttgcaaat + 17041 tatcaaaagg ttggtatgca aaagtattct acactccagg gaccacctgg tactggtaag + 17101 agtcattttg ctattggcct agctctctac tacccttctg ctcgcatagt gtatacagct + 17161 tgctctcatg ccgctgttga tgcactatgt gagaaggcat taaaatattt gcctatagat + 17221 aaatgtagta gaattatacc tgcacgtgct cgtgtagagt gttttgataa attcaaagtg + 17281 aattcaacat tagaacagta tgtcttttgt actgtaaatg cattgcctga gacgacagca + 17341 gatatagttg tctttgatga aatttcaatg gccacaaatt atgatttgag tgttgtcaat + 17401 gccagattac gtgctaagca ctatgtgtac attggcgacc ctgctcaatt 
acctgcacca + 17461 cgcacattgc taactaaggg cacactagaa ccagaatatt tcaattcagt gtgtagactt + 17521 atgaaaacta taggtccaga catgttcctc ggaacttgtc ggcgttgtcc tgctgaaatt + 17581 gttgacactg tgagtgcttt ggtttatgat aataagctta aagcacataa agacaaatca + 17641 gctcaatgct ttaaaatgtt ttataagggt gttatcacgc atgatgtttc atctgcaatt + 17701 aacaggccac aaataggcgt ggtaagagaa ttccttacac gtaaccctgc ttggagaaaa + 17761 gctgtcttta tttcacctta taattcacag aatgctgtag cctcaaagat tttgggacta + 17821 ccaactcaaa ctgttgattc atcacagggc tcagaatatg actatgtcat attcactcaa + 17881 accactgaaa cagctcactc ttgtaatgta aacagattta atgttgctat taccagagca + 17941 aaagtaggca tactttgcat aatgtctgat agagaccttt atgacaagtt gcaatttaca + 18001 agtcttgaaa ttccacgtag gaatgtggca actttacaag ctgaaaatgt aacaggactc + 18061 tttaaagatt gtagtaaggt aatcactggg ttacatccta cacaggcacc tacacacctc + 18121 agtgttgaca ctaaattcaa aactgaaggt ttatgtgttg acatacctgg catacctaag + 18181 gacatgacct atagaagact catctctatg atgggtttta aaatgaatta tcaagttaat + 18241 ggttacccta acatgtttat cacccgcgaa gaagctataa gacatgtacg tgcatggatt + 18301 ggcttcgatg tcgaggggtg tcatgctact agagaagctg ttggtaccaa tttaccttta + 18361 cagctaggtt tttctacagg tgttaaccta gttgctgtac ctacaggtta tgttgataca + 18421 cctaataata cagatttttc cagagttagt gctaaaccac cgcctggaga tcaatttaaa + 18481 cacctcatac cacttatgta caaaggactt ccttggaatg tagtgcgtat aaagattgta + 18541 caaatgttaa gtgacacact taaaaatctc tctgacagag tcgtatttgt cttatgggca + 18601 catggctttg agttgacatc tatgaagtat tttgtgaaaa taggacctga gcgcacctgt + 18661 tgtctatgtg atagacgtgc cacatgcttt tccactgctt cagacactta tgcctgttgg + 18721 catcattcta ttggatttga ttacgtctat aatccgttta tgattgatgt tcaacaatgg + 18781 ggttttacag gtaacctaca aagcaaccat gatctgtatt gtcaagtcca tggtaatgca + 18841 catgtagcta gttgtgatgc aatcatgact aggtgtctag ctgtccacga gtgctttgtt + 18901 aagcgtgttg actggactat tgaatatcct ataattggtg atgaactgaa gattaatgcg + 18961 gcttgtagaa aggttcaaca catggttgtt aaagctgcat tattagcaga caaattccca + 19021 gttcttcacg acattggtaa ccctaaagct attaagtgtg tacctcaagc 
tgatgtagaa + 19081 tggaagttct atgatgcaca gccttgtagt gacaaagctt ataaaataga agaattattc + 19141 tattcttatg ccacacattc tgacaaattc acagatggtg tatgcctatt ttggaattgc + 19201 aatgtcgata gatatcctgc taattccatt gtttgtagat ttgacactag agtgctatct + 19261 aaccttaact tgcctggttg tgatggtggc agtttgtatg taaataaaca tgcattccac + 19321 acaccagctt ttgataaaag tgcttttgtt aatttaaaac aattaccatt tttctattac + 19381 tctgacagtc catgtgagtc tcatggaaaa caagtagtgt cagatataga ttatgtacca + 19441 ctaaagtctg ctacgtgtat aacacgttgc aatttaggtg gtgctgtctg tagacatcat + 19501 gctaatgagt acagattgta tctcgatgct tataacatga tgatctcagc tggctttagc + 19561 ttgtgggttt acaaacaatt tgatacttat aacctctgga acacttttac aagacttcag + 19621 agtttagaaa atgtggcttt taatgttgta aataagggac actttgatgg acaacagggt + 19681 gaagtaccag tttctatcat taataacact gtttacacaa aagttgatgg tgttgatgta + 19741 gaattgtttg aaaataaaac aacattacct gttaatgtag catttgagct ttgggctaag + 19801 cgcaacatta aaccagtacc agaggtgaaa atactcaata atttgggtgt ggacattgct + 19861 gctaatactg tgatctggga ctacaaaaga gatgctccag cacatatatc tactattggt + 19921 gtttgttcta tgactgacat agccaagaaa ccaactgaaa cgatttgtgc accactcact + 19981 gtcttttttg atggtagagt tgatggtcaa gtagacttat ttagaaatgc ccgtaatggt + 20041 gttcttatta cagaaggtag tgttaaaggt ttacaaccat ctgtaggtcc caaacaagct + 20101 agtcttaatg gagtcacatt aattggagaa gccgtaaaaa cacagttcaa ttattataag + 20161 aaagttgatg gtgttgtcca acaattacct gaaacttact ttactcagag tagaaattta + 20221 caagaattta aacccaggag tcaaatggaa attgatttct tagaattagc tatggatgaa + 20281 ttcattgaac ggtataaatt agaaggctat gccttcgaac atatcgttta tggagatttt + 20341 agtcatagtc agttaggtgg tttacatcta ctgattggac tagctaaacg ttttaaggaa + 20401 tcaccttttg aattagaaga ttttattcct atggacagta cagttaaaaa ctatttcata + 20461 acagatgcgc aaacaggttc atctaagtgt gtgtgttctg ttattgattt attacttgat + 20521 gattttgttg aaataataaa atcccaagat ttatctgtag tttctaaggt tgtcaaagtg + 20581 actattgact atacagaaat ttcatttatg ctttggtgta aagatggcca tgtagaaaca + 20641 ttttacccaa aattacaatc tagtcaagcg tggcaaccgg gtgttgctat 
gcctaatctt + 20701 tacaaaatgc aaagaatgct attagaaaag tgtgaccttc aaaattatgg tgatagtgca + 20761 acattaccta aaggcataat gatgaatgtc gcaaaatata ctcaactgtg tcaatattta + 20821 aacacattaa cattagctgt accctataat atgagagtta tacattttgg tgctggttct + 20881 gataaaggag ttgcaccagg tacagctgtt ttaagacagt ggttgcctac gggtacgctg + 20941 cttgtcgatt cagatcttaa tgactttgtc tctgatgcag attcaacttt gattggtgat + 21001 tgtgcaactg tacatacagc taataaatgg gatctcatta ttagtgatat gtacgaccct + 21061 aagactaaaa atgttacaaa agaaaatgac tctaaagagg gttttttcac ttacatttgt + 21121 gggtttatac aacaaaagct agctcttgga ggttccgtgg ctataaagat aacagaacat + 21181 tcttggaatg ctgatcttta taagctcatg ggacacttcg catggtggac agcctttgtt + 21241 actaatgtga atgcgtcatc atctgaagca tttttaattg gatgtaatta tcttggcaaa + 21301 ccacgcgaac aaatagatgg ttatgtcatg catgcaaatt acatattttg gaggaataca + 21361 aatccaattc agttgtcttc ctattcttta tttgacatga gtaaatttcc ccttaaatta + 21421 aggggtactg ctgttatgtc tttaaaagaa ggtcaaatca atgatatgat tttatctctt + 21481 cttagtaaag gtagacttat aattagagaa aacaacagag ttgttatttc tagtgatgtt + 21541 cttgttaaca actaaacgaa caatgtttgt ttttcttgtt ttattgccac tagtctctag + 21601 tcagtgtgtt aatcttacaa ccagaactca attaccccct gcatacacta attctttcac + 21661 acgtggtgtt tattaccctg acaaagtttt cagatcctca gttttacatt caactcagga + 21721 cttgttctta cctttctttt ccaatgttac ttggttccat gctatacatg tctctgggac + 21781 caatggtact aagaggtttg ataaccctgt cctaccattt aatgatggtg tttattttgc + 21841 ttccactgag aagtctaaca taataagagg ctggattttt ggtactactt tagattcgaa + 21901 gacccagtcc ctacttattg ttaataacgc tactaatgtt gttattaaag tctgtgaatt + 21961 tcaattttgt aatgatccat ttttgggtgt ttattaccac aaaaacaaca aaagttggat + 22021 ggaaagtgag ttcagagttt attctagtgc gaataattgc acttttgaat atgtctctca + 22081 gccttttctt atggaccttg aaggaaaaca gggtaatttc aaaaatctta gggaatttgt + 22141 gtttaagaat attgatggtt attttaaaat atattctaag cacacgccta ttaatttagt + 22201 gcgtgatctc cctcagggtt tttcggcttt agaaccattg gtagatttgc caataggtat + 22261 taacatcact aggtttcaaa ctttacttgc tttacataga agttatttga 
ctcctggtga + 22321 ttcttcttca ggttggacag ctggtgctgc agcttattat gtgggttatc ttcaacctag + 22381 gacttttcta ttaaaatata atgaaaatgg aaccattaca gatgctgtag actgtgcact + 22441 tgaccctctc tcagaaacaa agtgtacgtt gaaatccttc actgtagaaa aaggaatcta + 22501 tcaaacttct aactttagag tccaaccaac agaatctatt gttagatttc ctaatattac + 22561 aaacttgtgc ccttttggtg aagtttttaa cgccaccaga tttgcatctg tttatgcttg + 22621 gaacaggaag agaatcagca actgtgttgc tgattattct gtcctatata attccgcatc + 22681 attttccact tttaagtgtt atggagtgtc tcctactaaa ttaaatgatc tctgctttac + 22741 taatgtctat gcagattcat ttgtaattag aggtgatgaa gtcagacaaa tcgctccagg + 22801 gcaaactgga aagattgctg attataatta taaattacca gatgatttta caggctgcgt + 22861 tatagcttgg aattctaaca atcttgattc taaggttggt ggtaattata attacctgta + 22921 tagattgttt aggaagtcta atctcaaacc ttttgagaga gatatttcaa ctgaaatcta + 22981 tcaggccggt agcacacctt gtaatggtgt tgaaggtttt aattgttact ttcctttaca + 23041 atcatatggt ttccaaccca ctaatggtgt tggttaccaa ccatacagag tagtagtact + 23101 ttcttttgaa cttctacatg caccagcaac tgtttgtgga cctaaaaagt ctactaattt + 23161 ggttaaaaac aaatgtgtca atttcaactt caatggttta acaggcacag gtgttcttac + 23221 tgagtctaac aaaaagtttc tgcctttcca acaatttggc agagacattg ctgacactac + 23281 tgatgctgtc cgtgatccac agacacttga gattcttgac attacaccat gttcttttgg + 23341 tggtgtcagt gttataacac caggaacaaa tacttctaac caggttgctg ttctttatca + 23401 ggatgttaac tgcacagaag tccctgttgc tattcatgca gatcaactta ctcctacttg + 23461 gcgtgtttat tctacaggtt ctaatgtttt tcaaacacgt gcaggctgtt taataggggc + 23521 tgaacatgtc aacaactcat atgagtgtga catacccatt ggtgcaggta tatgcgctag + 23581 ttatcagact cagactaatt ctcctcggcg ggcacgtagt gtagctagtc aatccatcat + 23641 tgcctacact atgtcacttg gtgcagaaaa ttcagttgct tactctaata actctattgc + 23701 catacccaca aattttacta ttagtgttac cacagaaatt ctaccagtgt ctatgaccaa + 23761 gacatcagta gattgtacaa tgtacatttg tggtgattca actgaatgca gcaatctttt + 23821 gttgcaatat ggcagttttt gtacacaatt aaaccgtgct ttaactggaa tagctgttga + 23881 acaagacaaa aacacccaag aagtttttgc acaagtcaaa caaatttaca 
aaacaccacc + 23941 aattaaagat tttggtggtt ttaatttttc acaaatatta ccagatccat caaaaccaag + 24001 caagaggtca tttattgaag atctactttt caacaaagtg acacttgcag atgctggctt + 24061 catcaaacaa tatggtgatt gccttggtga tattgctgct agagacctca tttgtgcaca + 24121 aaagtttaac ggccttactg ttttgccacc tttgctcaca gatgaaatga ttgctcaata + 24181 cacttctgca ctgttagcgg gtacaatcac ttctggttgg acctttggtg caggtgctgc + 24241 attacaaata ccatttgcta tgcaaatggc ttataggttt aatggtattg gagttacaca + 24301 gaatgttctc tatgagaacc aaaaattgat tgccaaccaa tttaatagtg ctattggcaa + 24361 aattcaagac tcactttctt ccacagcaag tgcacttgga aaacttcaag atgtggtcaa + 24421 ccaaaatgca caagctttaa acacgcttgt taaacaactt agctccaatt ttggtgcaat + 24481 ttcaagtgtt ttaaatgata tcctttcacg tcttgacaaa gttgaggctg aagtgcaaat + 24541 tgataggttg atcacaggca gacttcaaag tttgcagaca tatgtgactc aacaattaat + 24601 tagagctgca gaaatcagag cttctgctaa tcttgctgct actaaaatgt cagagtgtgt + 24661 acttggacaa tcaaaaagag ttgatttttg tggaaagggc tatcatctta tgtccttccc + 24721 tcagtcagca cctcatggtg tagtcttctt gcatgtgact tatgtccctg cacaagaaaa + 24781 gaacttcaca actgctcctg ccatttgtca tgatggaaaa gcacactttc ctcgtgaagg + 24841 tgtctttgtt tcaaatggca cacactggtt tgtaacacaa aggaattttt atgaaccaca + 24901 aatcattact acagacaaca catttgtgtc tggtaactgt gatgttgtaa taggaattgt + 24961 caacaacaca gtttatgatc ctttgcaacc tgaattagac tcattcaagg aggagttaga + 25021 taaatatttt aagaatcata catcaccaga tgttgattta ggtgacatct ctggcattaa + 25081 tgcttcagtt gtaaacattc aaaaagaaat tgaccgcctc aatgaggttg ccaagaattt + 25141 aaatgaatct ctcatcgatc tccaagaact tggaaagtat gagcagtata taaaatggcc + 25201 atggtacatt tggctaggtt ttatagctgg cttgattgcc atagtaatgg tgacaattat + 25261 gctttgctgt atgaccagtt gctgtagttg tctcaagggc tgttgttctt gtggatcctg + 25321 ctgcaaattt gatgaagacg actctgagcc agtgctcaaa ggagtcaaat tacattacac + 25381 ataaacgaac ttatggattt gtttatgaga atcttcacaa ttggaactgt aactttgaag + 25441 caaggtgaaa tcaaggatgc tactccttca gattttgttc gcgctactgc aacgataccg + 25501 atacaagcct cactcccttt cggatggctt attgttggcg ttgcacttct 
tgctgttttt + 25561 cagagcgctt ccaaaatcat aaccctcaaa aagagatggc aactagcact ctccaagggt + 25621 gttcactttg tttgcaactt gctgttgttg tttgtaacag tttactcaca ccttttgctc + 25681 gttgctgctg gccttgaagc cccttttctc tatctttatg ctttagtcta cttcttgcag + 25741 agtataaact ttgtaagaat aataatgagg ctttggcttt gctggaaatg ccgttccaaa + 25801 aacccattac tttatgatgc caactatttt ctttgctggc atactaattg ttacgactat + 25861 tgtatacctt acaatagtgt aacttcttca attgtcatta cttcaggtga tggcacaaca + 25921 agtcctattt ctgaacatga ctaccagatt ggtggttata ctgaaaaatg ggaatctgga + 25981 gtaaaagact gtgttgtatt acacagttac ttcacttcag actattacca gctgtactca + 26041 actcaattga gtacagacac tggtgttgaa catgttacct tcttcatcta caataaaatt + 26101 gttgatgagc ctgaagaaca tgtccaaatt cacacaatcg acggttcatc cggagttgtt + 26161 aatccagtaa tggaaccaat ttatgatgaa ccgacgacga ctactagcgt gcctttgtaa + 26221 gcacaagctg atgagtacga acttatgtac tcattcgttt cggaagagac aggtacgtta + 26281 atagttaata gcgtacttct ttttcttgct ttcgtggtat tcttgctagt tacactagcc + 26341 atccttactg cgcttcgatt gtgtgcgtac tgctgcaata ttgttaacgt gagtcttgta + 26401 aaaccttctt tttacgttta ctctcgtgtt aaaaatctga attcttctag agttcctgat + 26461 cttctggtct aaacgaacta aatattatat tagtttttct gtttggaact ttaattttag + 26521 ccatggcaga ttccaacggt actattaccg ttgaagagct taaaaagctc cttgaacaat + 26581 ggaacctagt aataggtttc ctattcctta catggatttg tcttctacaa tttgcctatg + 26641 ccaacaggaa taggtttttg tatataatta agttaatttt cctctggctg ttatggccag + 26701 taactttagc ttgttttgtg cttgctgctg tttacagaat aaattggatc accggtggaa + 26761 ttgctatcgc aatggcttgt cttgtaggct tgatgtggct cagctacttc attgcttctt + 26821 tcagactgtt tgcgcgtacg cgttccatgt ggtcattcaa tccagaaact aacattcttc + 26881 tcaacgtgcc actccatggc actattctga ccagaccgct tctagaaagt gaactcgtaa + 26941 tcggagctgt gatccttcgt ggacatcttc gtattgctgg acaccatcta ggacgctgtg + 27001 acatcaagga cctgcctaaa gaaatcactg ttgctacatc acgaacgctt tcttattaca + 27061 aattgggagc ttcgcagcgt gtagcaggtg actcaggttt tgctgcatac agtcgctaca + 27121 ggattggcaa ctataaatta aacacagacc attccagtag cagtgacaat 
attgctttgc + 27181 ttgtacagta agtgacaaca gatgtttcat ctcgttgact ttcaggttac tatagcagag + 27241 atattactaa ttattatgag gacttttaaa gtttccattt ggaatcttga ttacatcata + 27301 aacctcataa ttaaaaattt atctaagtca ctaactgaga ataaatattc tcaattagat + 27361 gaagagcaac caatggagat tgattaaacg aacatgaaaa ttattctttt cttggcactg + 27421 ataacactcg ctacttgtga gctttatcac taccaagagt gtgttagagg tacaacagta + 27481 cttttaaaag aaccttgctc ttctggaaca tacgagggca attcaccatt tcatcctcta + 27541 gctgataaca aatttgcact gacttgcttt agcactcaat ttgcttttgc ttgtcctgac + 27601 ggcgtaaaac acgtctatca gttacgtgcc agatcagttt cacctaaact gttcatcaga + 27661 caagaggaag ttcaagaact ttactctcca atttttctta ttgttgcggc aatagtgttt + 27721 ataacacttt gcttcacact caaaagaaag acagaatgat tgaactttca ttaattgact + 27781 tctatttgtg ctttttagcc tttctgctat tccttgtttt aattatgctt attatctttt + 27841 ggttctcact tgaactgcaa gatcataatg aaacttgtca cgcctaaacg aacatgaaat + 27901 ttcttgtttt cttaggaatc atcacaactg tagctgcatt tcaccaagaa tgtagtttac + 27961 agtcatgtac tcaacatcaa ccatatgtag ttgatgaccc gtgtcctatt cacttctatt + 28021 ctaaatggta tattagagta ggagctagaa aatcagcacc tttaattgaa ttgtgcgtgg + 28081 atgaggctgg ttctaaatca cccattcagt acatcgatat cggtaattat acagtttcct + 28141 gtttaccttt tacaattaat tgccaggaac ctaaattggg tagtcttgta gtgcgttgtt + 28201 cgttctatga agacttttta gagtatcatg acgttcgtgt tgttttagat ttcatctaaa + 28261 cgaacaaact aaaatgtctg ataatggacc ccaaaatcag cgaaatgcac cccgcattac + 28321 gtttggtgga ccctcagatt caactggcag taaccagaat ggagaacgca gtggggcgcg + 28381 atcaaaacaa cgtcggcccc aaggtttacc caataatact gcgtcttggt tcaccgctct + 28441 cactcaacat ggcaaggaag accttaaatt ccctcgagga caaggcgttc caattaacac + 28501 caatagcagt ccagatgacc aaattggcta ctaccgaaga gctaccagac gaattcgtgg + 28561 tggtgacggt aaaatgaaag atctcagtcc aagatggtat ttctactacc taggaactgg + 28621 gccagaagct ggacttccct atggtgctaa caaagacggc atcatatggg ttgcaactga + 28681 gggagccttg aatacaccaa aagatcacat tggcacccgc aatcctgcta acaatgctgc + 28741 aatcgtgcta caacttcctc aaggaacaac attgccaaaa ggcttctacg 
cagaagggag + 28801 cagaggcggc agtcaagcct cttctcgttc ctcatcacgt agtcgcaaca gttcaagaaa + 28861 ttcaactcca ggcagcagta ggggaacttc tcctgctaga atggctggca atggcggtga + 28921 tgctgctctt gctttgctgc tgcttgacag attgaaccag cttgagagca aaatgtctgg + 28981 taaaggccaa caacaacaag gccaaactgt cactaagaaa tctgctgctg aggcttctaa + 29041 gaagcctcgg caaaaacgta ctgccactaa agcatacaat gtaacacaag ctttcggcag + 29101 acgtggtcca gaacaaaccc aaggaaattt tggggaccag gaactaatca gacaaggaac + 29161 tgattacaaa cattggccgc aaattgcaca atttgccccc agcgcttcag cgttcttcgg + 29221 aatgtcgcgc attggcatgg aagtcacacc ttcgggaacg tggttgacct acacaggtgc + 29281 catcaaattg gatgacaaag atccaaattt caaagatcaa gtcattttgc tgaataagca + 29341 tattgacgca tacaaaacat tcccaccaac agagcctaaa aaggacaaaa agaagaaggc + 29401 tgatgaaact caagccttac cgcagagaca gaagaaacag caaactgtga ctcttcttcc + 29461 tgctgcagat ttggatgatt tctccaaaca attgcaacaa tccatgagca gtgctgactc + 29521 aactcaggcc taaactcatg cagaccacac aaggcagatg ggctatataa acgttttcgc + 29581 ttttccgttt acgatatata gtctactctt gtgcagaatg aattctcgta actacatagc + 29641 acaagtagat gtagttaact ttaatctcac atagcaatct ttaatcagtg tgtaacatta + 29701 gggaggactt gaaagagcca ccacattttc accgaggcca cgcggagtac gatcgagtgt + 29761 acagtgaaca atgctaggga gagctgccta tatggaagag ccctaatgtg taaaattaat + 29821 tttagtagtg ctatccccat gtgattttaa tagcttctta ggagaatgac aaaaaaaaaa + 29881 aaaaaaaaaa aaaaaaaaaa aaa +// + diff --git a/workflows/resources/README b/workflows/resources/README new file mode 100644 index 0000000..c26158f --- /dev/null +++ b/workflows/resources/README @@ -0,0 +1,30 @@ +Descriptions of the included resources: + +- MN908947.fa +- MN908947.gb + These are the reference genome fasta and genbank files, used to place genomes in reference coordinates for alignment/trimming steps + and for typing variants specified relative to the reference. 
+ +- date_corrections.csv + central_sample_id,sample_date + These overwrite the dates in MAJORA + +- resequencing_omissions.txt + These IDs have been resequenced and should be omitted + +- AAs.csv + AAmutation, nuc position in reference + Specific mutations to look for +- dels.csv + nuc position in reference, length + Specific deletions to look for + +- publish_recipes.json + A file of recipes to make the billions of output files. + Parsed by bin/publish_from_config.py + Each key in the dict represents an outdir + For that outdir, a list of outfiles is given, each represented by a dict + Outfile dicts can include fields: suffix, data, fasta, metadata_fields, where, and mutations + Either data in ["cog", "cog_global"] or fasta in ["unaligned", "aligned", "trimmed", "cog_global"] must be specified + Mutations is a bool, metadata_fields a list and all others are strings + diff --git a/workflows/resources/WH04.fa b/workflows/resources/WH04.fa new file mode 100644 index 0000000..406a27c --- /dev/null +++ b/workflows/resources/WH04.fa @@ -0,0 +1,2 @@ +>Wuhan/WH04/2020 
+NNNNNNNNNNNNNNNNTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTG
AGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTTAAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTAAACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGA
AACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACAACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAATTTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAA
TTGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGTGATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTACAACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCTATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGACAACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCT
GATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGATATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGTCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTCCTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGG
TTCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTACGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCCACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAAGAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGG
TTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATCTATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCAGTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGT
TAAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGTATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTCTCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGCATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACA
CTTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTGTTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACTAAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGAAATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTAATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTAC
AAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAG
TTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCA
CAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAA
GCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTAT
TACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACC
CGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTCACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAANNNNNNNNNNNNNNNN diff --git a/workflows/resources/date_corrections.csv 
b/workflows/resources/date_corrections.csv new file mode 100644 index 0000000..3d68ea2 --- /dev/null +++ b/workflows/resources/date_corrections.csv @@ -0,0 +1,11 @@ +central_sample_id,sample_date +PHEC-20161,2020-03-04 +PHEC-2018F,2020-03-04 +PHEC-139C3,2020-03-21 +PHEC-153F0,2020-03-25 +PHEC-204C2,2020-03-05 +PHEC-20170,2020-03-04 +PHEC-16043,2020-03-27 +PHEC-14D5D,2020-03-24 +PHEC-14D4E,2020-03-24 +PHEC-13A57,2020-03-21 diff --git a/workflows/resources/dels.csv b/workflows/resources/dels.csv new file mode 100644 index 0000000..521b179 --- /dev/null +++ b/workflows/resources/dels.csv @@ -0,0 +1,2 @@ +1605,3 +21765,6 diff --git a/workflows/resources/empty_constellations.csv b/workflows/resources/empty_constellations.csv new file mode 100644 index 0000000..3c31ff9 --- /dev/null +++ b/workflows/resources/empty_constellations.csv @@ -0,0 +1 @@ +sequence_name diff --git a/workflows/resources/empty_mutations.csv b/workflows/resources/empty_mutations.csv new file mode 100644 index 0000000..3c31ff9 --- /dev/null +++ b/workflows/resources/empty_mutations.csv @@ -0,0 +1 @@ +sequence_name diff --git a/workflows/resources/empty_updown.csv b/workflows/resources/empty_updown.csv new file mode 100644 index 0000000..0e6ee7b --- /dev/null +++ b/workflows/resources/empty_updown.csv @@ -0,0 +1 @@ +query diff --git a/workflows/resources/gisaid_omissions.txt b/workflows/resources/gisaid_omissions.txt new file mode 100644 index 0000000..03b90b0 --- /dev/null +++ b/workflows/resources/gisaid_omissions.txt @@ -0,0 +1,364 @@ +# duplicate of Guangdong/20SF012/2020 / EPI_ISL_403932 as described here: +# http://virological.org/t/phylogenetic-analysis-of-23-ncov-2019-genomes-2020-01-23/335/5 +HKU-SZ-002a_2020 + +# erroneous collection date? 
(day before submission) +Hefei/2/2020|EPI_ISL_412026|China|Anhui|Hefei|2020-02-23 + +# too many errors +Russia/StPetersburg-3524/2020|EPI_ISL_415710|2020-03-15 +Guangzhou/GZMU0047/2020|EPI_ISL_414690|2020-02-25 +Henan/IVDC-HeN-002/2020|EPI_ISL_408487|2020-01-20 +Shandong/LY001/2020|EPI_ISL_414934|2020-01-21 +Shandong/LY002/2020|EPI_ISL_414935|2020-01-21 +Shanghai/IVDC-SH-001/2020|EPI_ISL_408483|2020-01-20 +Shenzhen/SZTH-001/2020|EPI_ISL_406592|2020-01-13 +Shenzhen/SZTH-004/2020|EPI_ISL_406595|2020-01-16 +USA/WA-UW53/2020|EPI_ISL_415618|2020-03-09 +USA/WA-UW65/2020|EPI_ISL_415593|2020-03-10 +Wuhan/HBCDC-HB-04/2019|EPI_ISL_412900|2019-12-30 +Wuhan/WH02/2019|EPI_ISL_406799|2019-12-31 +Belgium/MTR-03026/2020|EPI_ISL_416476|2020-03-02 +Malaysia/MKAK-CL-2020-5049/2020|EPI_ISL_416884|2020-02-20 +Malaysia/188407/2020|EPI_ISL_417918|2020-03-18 +Malaysia/190300/2020|EPI_ISL_417920||Malaysia|Kuala_Lumpur||2020-03-22 +Turkey/6224-Ankara1034/2020|EPI_ISL_417413|2020-03-17 +USA/WI-GMF-00237/2020|EPI_ISL_418189|2020-03-23 +Italy/UniMI02/2020|EPI_ISL_417446|2020-02-24 +Malaysia/186197/2020|EPI_ISL_417919||Malaysia|Kuala_Lumpur||2020-03-14 +USA/MN4-MDH4/2020|EPI_ISL_417189||USA|Minnesota||2020-03-09 +USA/WI-GMF-00227/2020|EPI_ISL_418186||USA|Wisconsin|La_Crosse_County|2020-03-23 +USA/WI-GMF-00049/2020|EPI_ISL_418185||USA|Wisconsin|La_Crosse_County|2020-03-18 +Canada/ON_PHL4232/2020|EPI_ISL_418374||Canada|Ontario||2020-03-11 +Senegal/640/2020|EPI_ISL_420079||Senegal|Mbour||2020-03-20 +Senegal/328/2020|EPI_ISL_420071||Senegal|Mbour||2020-03-17 +USA/WA-UW306/2020|EPI_ISL_418874||USA|Washington||2020-03-23 +Ecuador/HGSQ-USFQ-007/2020 +Ecuador/HGSQ-USFQ-010/2020 +USA/WI-GMF-00928/2020 +USA/WI-GMF-00384/2020 +USA/WA-UW-1821/2020 +USA/WA-UW45/2020 +India/1135/2020 + +# too many singleton mutations +Hong_Kong/case49_VM20002508/2020|EPI_ISL_414567|2020-02-10 +Shenzhen/SZTH-004/2020|EPI_ISL_406595|2020-01-16 +Belgium/VLM-03011/2020|EPI_ISL_415153|2020-03-03 
+Belgium/BA-02291/2020|EPI_ISL_415159|2020-02-29 +Belgium/MTR-03021/2020|EPI_ISL_416467|2020-03-02 +Belgium/BA-02291/2020|EPI_ISL_415159|2020-02-29 +Hong_Kong/case49_VM20002508/2020|EPI_ISL_414567|2020-02-10 +USA/WA-UW156/2020|EPI_ISL_416694|2020-03-13 +USA/WA-UW132/2020|EPI_ISL_416670|2020-03-12 +Australia/VIC12/2020|EPI_ISL_416518|2020-03-16 +USA/CA-CDPH-UC28/2020|EPI_ISL_417332|2020-03-14 +USA/WA-UW306/2020|EPI_ISL_418874||USA|Washington||2020-03-23 + +# big indels +USA/WA-UW134/2020|EPI_ISL_416672|2020-03-10 +USA/WA-UW182/2020|EPI_ISL_416720|2020-03-13 +USA/WA-UW281/2020|EPI_ISL_418066|2020-03-14 + +Singapore/14Clin/2020|EPI_ISL_418998|2020-02-14 +Singapore/19/2020|EPI_ISL_419001|2020-03-02 +Singapore/22/2020|EPI_ISL_420099|2020-03-02 +Singapore/23/2020|EPI_ISL_420100|2020-03-02 +Singapore/18/2020|EPI_ISL_418999|2020-03-01 +Singapore/30/2020|EPI_ISL_420107|2020-03-09 +Singapore/12Clin/2020|EPI_ISL_418995|2020-02-28 +Singapore/16/2020|EPI_ISL_418997|2020-02-06 +Singapore/15/2020|EPI_ISL_418996|2020-01-27 +Singapore/21/2020|EPI_ISL_419000|2020-02-13 + +# ORF8 reversion? +Guangdong/GDSZ202013-P0014/2020|EPI_ISL_413865|2020-02-05 +Spain/Andalucia201373/2020|EPI_ISL_418244|2020-03-02 +USA/NY-NYUMC40/2020|EPI_ISL_419701|2020-03-18 + +# replicate sequencing +USA/WA1-A12/2020|EPI_ISL_407214|2020-01-19 +USA/WA1-F6/2020|EPI_ISL_407215|2020-01-19 +USA/WA1-A12/2020|EPI_ISL_407214|2020-01-25 +USA/WA1-F6/2020|EPI_ISL_407215|2020-01-25 +Italy/INMI1-cs/2020|EPI_ISL_410546|2020-01-31 +France/IDF0372-isl/2020|EPI_ISL_410720|2020-01-23 +France/IDF0515-isl/2020|EPI_ISL_410984|2020-01-29 +France/IDF0386/2020|EPI_ISL_411220|2020-01-28 +HKU-SZ-002a|MN938384|2020-01-10 + +# 3x replicate sequencing according to Verity but missing from GISAID now anyway - fixed?: +EPI_ISL_415695 +EPI_ISL_415694 +EPI_ISL_415693 + +# Large deletion in ORF8 - causes misplacement in A/B clade. Remove for now. Give lineage designation and force grouping if more appear? 
+Singapore/12/2020|EPI_ISL_414378|2020-02-17 +Singapore/13/2020|EPI_ISL_414379|2020-02-18 +Singapore/14/2020|EPI_ISL_414380|2020-02-19 +Taiwan/CGMH-CGU-02/2020|EPI_ISL_417518|2020-02-04 + +# date missing +# commented these out as we should remove using separate filter. +#South_Korea/SNU01/2020 +#Wuhan/HBCDC-HB-04/2019|EPI_ISL_412900|China|Hubei|Wuhan|2019-12-30 +#Netherlands/Coevorden_1363618/2020|Netherlands|Coevorden||2020 +#Netherlands/Tilburg_/2020|Netherlands|Tilburg||2020 +#China/WF0001/2020|EPI_ISL_413691|China|Shandong||2020-01 +#China/WF0002/2020|EPI_ISL_413692|China|Shandong||2020-01 +#China/WF0003/2020|EPI_ISL_413693|China|Shandong||2020-01 +#China/WF0004/2020|EPI_ISL_413694|China|Shandong||2020-01 +#China/WF0012/2020|EPI_ISL_413697|China|Shandong||2020-02 +#China/WF0014/2020|EPI_ISL_413711|China|Shandong||2020-02 +#China/WF0015/2020|EPI_ISL_413729|China|Shandong||2020-02 +#China/WF0016/2020|EPI_ISL_413746|China|Shandong||2020-02 +#China/WF0018/2020|EPI_ISL_413748|China|Shandong||2020-02 +#China/WF0019/2020|EPI_ISL_413749|China|Shandong||2020-02 +#China/WF0020/2020|EPI_ISL_413750|China|Shandong||2020-02 +#China/WF0021/2020|EPI_ISL_413751|China|Shandong||2020-02 +#China/WF0023/2020|EPI_ISL_413752|China|Shandong||2020-02 +#China/WF0024/2020|EPI_ISL_413753|China|Shandong||2020-02 +#China/WF0026/2020|EPI_ISL_413761|China|Shandong||2020-02 +#China/WF0028/2020|EPI_ISL_413791|China|Shandong||2020-02 +#China/WF0029/2020|EPI_ISL_413809|China|Shandong||2020-02 +#Japan/TKYE6182/2020|EPI_ISL_414511|2020-01 +#Netherlands/NoordBrabant_33/2020|EPI_ISL_414542|2020-03 +#Netherlands/NoordBrabant_34/2020|EPI_ISL_414543|2020-03 +#Canada/ON_PHL2223/2020|EPI_ISL_418381|2020 +#Canada/ON_PHL2259/2020|EPI_ISL_418344|2020 +#Canada/ON_PHL2273/2020|EPI_ISL_418383|2020 +#Canada/ON_PHL2294/2020|EPI_ISL_418384|2020 +#Canada/ON_PHL3476/2020|EPI_ISL_418380|2020 +#Canada/ON_PHL5930/2020|EPI_ISL_418382|2020 +#Netherlands/Limburg_7/2020|EPI_ISL_415464|2020 
+#Netherlands/NA_4/2020|EPI_ISL_415493|2020 +#Netherlands/NA_5/2020|EPI_ISL_415494|2020 +#Netherlands/NoordBrabant_41/2020|EPI_ISL_415499|2020 +#Netherlands/NoordBrabant_51/2020|EPI_ISL_415507|2020 +#Netherlands/NoordBrabant_53/2020|EPI_ISL_415509|2020 +#Netherlands/NoordBrabant_61/2020|EPI_ISL_415517|2020 +#Netherlands/NoordBrabant_62/2020|EPI_ISL_415518|2020 +#Netherlands/NoordBrabant_63/2020|EPI_ISL_415519|2020 +#Netherlands/NoordBrabant_64/2020|EPI_ISL_415520|2020 +#Netherlands/NoordBrabant_65/2020|EPI_ISL_415521|2020 +#Netherlands/NoordBrabant_66/2020|EPI_ISL_415522|2020 +#Netherlands/NoordBrabant_67/2020|EPI_ISL_415523|2020 +#Spain/Cataluna201396/2020|EPI_ISL_418250|2020 +#USA/CA-CDPH-UC1/2020|EPI_ISL_413557|2020 +#Czech_Republic/ChVir1630/2020|EPI_ISL_416742|2020-02 +#Czech_Republic/ChVir1912/2020|EPI_ISL_416743|2020-03 +#Lithuania/ChVir1632/2020|EPI_ISL_416741|2020-02 + +# withdrawn (yes, not in gisaid dump 2020-03-24)? +China/Spain-cluster-case2/2020|EPI_ISL_415046|2020-03-11 +China/Spain-cluster-case3/2020|EPI_ISL_415047|2020-03-11 +China/Spain-cluster-case1/2020|EPI_ISL_415045|2020-03-11 + +# environmental samples +Wuhan/IVDC-HB-envF13-20/2020|EPI_ISL_408514|2020-01-01 +Wuhan/IVDC-HB-envF13-21/2020|EPI_ISL_408515|2020-01-01 + +# pangolin/bat +pangolin/Guangdong/P2S/2019|EPI_ISL_410544|2019 +pangolin/Guangxi/P1E/2017|EPI_ISL_410539|2017 +pangolin/Guangxi/P4L/2017|EPI_ISL_410538|2017 +pangolin/Guangxi/P3B/2017|EPI_ISL_410543|2017 +pangolin/Guangxi/P2V/2017|EPI_ISL_410542|2017 +pangolin/Guangxi/P5E/2017|EPI_ISL_410541|2017 +pangolin/Guangxi/P5L/2017|EPI_ISL_410540|2017 +pangolin/Guangdong/1/2019|EPI_ISL_410721|2019 +bat/Yunnan/RaTG13/2013|EPI_ISL_402131|2013-07-24 +pangolin/China/MP789/2019|EPI_ISL_412860|2019-03-19 + +# washington state oversampling +USA/WA-NH20/2020|EPI_ISL_418787||USA|Washington||2020-03-13 +USA/WA-S79/2020|EPI_ISL_417132||USA|Washington|King_County|2020-03-05 +USA/WA4-UW2/2020|EPI_ISL_413455||USA|Washington||2020-02-28 
+USA/WA-S122/2020|EPI_ISL_417175||USA|Washington|Grant_County|2020-03-02 +USA/WA-UW150/2020|EPI_ISL_416688||USA|Washington||2020-03-14 +USA/WA-UW376/2020|EPI_ISL_418943||USA|Washington||2020-03-17 +USA/WA-UW18/2020|EPI_ISL_414366||USA|Washington||2020-03-05 +USA/UNKNOWN-UW276/2020|EPI_ISL_418061||USA|||2020-03-13 +USA/WA-UW364/2020|EPI_ISL_418931||USA|Washington||2020-03-17 +USA/WA-UW103/2020|EPI_ISL_416641||USA|Washington||2020-03-11 +USA/WA-S24/2020|EPI_ISL_417077||USA|Washington|Snohomish_County|2020-03-02 +USA/WA-S6/2020|EPI_ISL_416461||USA|Washington|King_County|2020-02-29 +USA/WA-NH23/2020|EPI_ISL_418790||USA|Washington||2020-03-13 +USA/WA-UW48/2020|EPI_ISL_415613||USA|Washington||2020-03-09 +USA/WA-UW274/2020|EPI_ISL_418059||USA|||2020-03-13 +USA/WA-S108/2020|EPI_ISL_417161||USA|Washington|King_County|2020-02-29 +USA/WA-UW86/2020|EPI_ISL_416442||USA|Washington||2020-03-10 +USA/WA-S99/2020|EPI_ISL_417152||USA|Washington|King_County|2020-02-28 +USA/WA-UW89/2020|EPI_ISL_416445||USA|Washington||2020-03-10 +USA/WA-S21/2020|EPI_ISL_417074||USA|Washington|King_County|2020-03-02 +USA/WA-S61/2020|EPI_ISL_417114||USA|Washington|King_County|2020-03-05 +USA/WA-UW63/2020|EPI_ISL_415591||USA|Washington||2020-03-10 +USA/OR-UW54/2020|EPI_ISL_415619||USA|Oregon||2020-03-09 +USA/WA-NH19/2020|EPI_ISL_418786||USA|Washington||2020-03-13 +USA/WA-S13/2020|EPI_ISL_417066||USA|Washington|King_County|2020-03-03 +USA/WA-S10/2020|EPI_ISL_416465||USA|Washington|King_County|2020-02-29 +Canada/BC_3989992/2020|EPI_ISL_418823||Canada|British_Columbia||2020-03-09 +USA/WA-UW23/2020|EPI_ISL_414592||USA|Washington|Tacoma|2020-03-06 +USA/WA-S45/2020|EPI_ISL_417098||USA|Washington||2020-02-29 +USA/WA6-UW3/2020|EPI_ISL_413457||USA|Washington||2020-02-29 +USA/WA-S38/2020|EPI_ISL_417091||USA|Washington|Snohomish_County|2020-03-04 +USA/WA-NH22/2020|EPI_ISL_418789||USA|Washington||2020-03-13 +USA/WA-S2/2020|EPI_ISL_413456||USA|Washington|King_County|2020-02-20 
+USA/WA-UW84/2020|EPI_ISL_416440||USA|Washington||2020-03-10 +USA/WA-UW51/2020|EPI_ISL_415616||USA|Washington||2020-03-08 +USA/WA-S3/2020|EPI_ISL_413560||USA|Washington||2020-02-28 +USA/WI-76/2020|EPI_ISL_421334||USA|Wisconsin||2020-03-22 +USA/WA-S82/2020|EPI_ISL_417135||USA|Washington|King_County|2020-02-22 +USA/WA-UW262/2020|EPI_ISL_418047||USA|Washington||2020-03-16 +USA/WA-S88/2020|EPI_ISL_417141||USA|Washington|King_County|2020-03-01 +USA/WA-UW100/2020|EPI_ISL_416638||USA|Washington||2020-03-12 +USA/WA-S65/2020|EPI_ISL_417118||USA|Washington|King_County|2020-03-03 +USA/WI-23/2020|EPI_ISL_417507||USA|Wisconsin||2020-03-17 +USA/WA-UW190/2020|EPI_ISL_416728||USA|Washington||2020-03-13 +USA/VA-DCLS-0011/2020|EPI_ISL_419263||USA|Virginia||2020-03-10 +USA/WA-NH11/2020|EPI_ISL_418780||USA|Washington||2020-03-13 +USA/WA-UW128/2020|EPI_ISL_416666||USA|Washington||2020-03-12 +USA/WA-S52/2020|EPI_ISL_417105||USA|Washington|Snohomish_County|2020-03-03 +USA/WA-NH6/2020|EPI_ISL_418775||USA|Washington||2020-03-13 +USA/WA-S118/2020|EPI_ISL_417171||USA|Washington|King_County|2020-03-01 +USA/WA-S51/2020|EPI_ISL_417104||USA|Washington||2020-03-03 +USA/WA-S98/2020|EPI_ISL_417151||USA|Washington|King_County|2020-02-29 +USA/WA-S63/2020|EPI_ISL_417116||USA|Washington|King_County|2020-03-04 +USA/WA-UW147/2020|EPI_ISL_416685||USA|Washington||2020-03-15 +USA/WA-S39/2020|EPI_ISL_417092||USA|Washington|Snohomish_County|2020-03-04 +USA/WA-UW52/2020|EPI_ISL_415617||USA|Washington||2020-03-09 +USA/WA-S29/2020|EPI_ISL_417082||USA|Washington|Grant_County|2020-03-02 +USA/WA-S26/2020|EPI_ISL_417079||USA|Washington|Snohomish_County|2020-03-02 +USA/UPHL-04/2020|EPI_ISL_415542||USA|Utah||2020-03-13 +USA/WA18-UW14/2020|EPI_ISL_413653||USA|Washington||2020-03-05 +USA/WA-UW177/2020|EPI_ISL_416715||USA|Washington||2020-03-15 +USA/WA-S36/2020|EPI_ISL_417089||USA|Washington||2020-03-02 +USA/WA-S84/2020|EPI_ISL_417137||USA|Washington|King_County|2020-02-21 
+USA/WA-S16/2020|EPI_ISL_417069||USA|Washington||2020-03-03 +USA/WA-UW285/2020|EPI_ISL_418070||USA|Washington||2020-03-14 +USA/WA-S93/2020|EPI_ISL_417146||USA|Washington|King_County|2020-02-29 +USA/WA-UW369/2020|EPI_ISL_418936||USA|Washington||2020-03-17 +USA/WA-S46/2020|EPI_ISL_417099||USA|Washington||2020-02-29 +USA/WA-UW118/2020|EPI_ISL_416656||USA|Washington||2020-03-11 +USA/WA-S53/2020|EPI_ISL_417106||USA|Washington|Snohomish_County|2020-03-03 +USA/WA-S71/2020|EPI_ISL_417124||USA|Washington|King_County|2020-03-05 +USA/WA-UW167/2020|EPI_ISL_416705||USA|Washington||2020-03-13 +USA/WA-UW33/2020|EPI_ISL_414620||USA|||2020-03-08 +USA/VA-DCLS-0019/2020|EPI_ISL_420029||USA|Virginia||2020-03-11 +USA/VA-DCLS-0018/2020|EPI_ISL_420028||USA|Virginia||2020-03-11 +USA/WA-UW179/2020|EPI_ISL_416717||USA|Washington||2020-03-15 +USA/WA-NH7/2020|EPI_ISL_418776||USA|Washington||2020-03-13 +USA/WA-S75/2020|EPI_ISL_417128||USA|Washington|King_County|2020-03-05 +USA/WA-UW95/2020|EPI_ISL_416451||USA|Washington||2020-03-10 +USA/WA-S57/2020|EPI_ISL_417110||USA|Washington||2020-03-03 +USA/WA-S117/2020|EPI_ISL_417170||USA|Washington||2020-03-02 +USA/WA-UW298/2020|EPI_ISL_418866||USA|Washington||2020-03-13 +USA/WA-S92/2020|EPI_ISL_417145||USA|Washington|King_County|2020-02-29 +USA/WA-S59/2020|EPI_ISL_417112||USA|Washington|King_County|2020-03-02 +USA/WA-S12/2020|EPI_ISL_417065||USA|Washington||2020-03-03 +USA/WA-UW183/2020|EPI_ISL_416721||USA|Washington||2020-03-13 +USA/WA-S74/2020|EPI_ISL_417127||USA|Washington|King_County|2020-03-05 +USA/WA-S15/2020|EPI_ISL_417068||USA|Washington|King_County|2020-03-02 +USA/WA-S43/2020|EPI_ISL_417096||USA|Washington||2020-02-27 +USA/WA-UW70/2020|EPI_ISL_415598||USA|Washington||2020-03-10 +USA/WA-UW99/2020|EPI_ISL_416637||USA|Washington||2020-03-12 +USA/WA-S66/2020|EPI_ISL_417119||USA|Washington|Snohomish_County|2020-03-06 +USA/WA-UW222/2020|EPI_ISL_417371||USA|Washington||2020-03-13 +USA/WA-UW343/2020|EPI_ISL_418910||USA|Washington||2020-03-16 
+USA/WA-S111/2020|EPI_ISL_417164||USA|Washington|King_County|2020-03-07 +USA/WA-S102/2020|EPI_ISL_417155||USA|Washington|King_County|2020-02-28 +USA/WA-NH14/2020|EPI_ISL_418783||USA|Washington||2020-03-13 +USA/WA-NH21/2020|EPI_ISL_418788||USA|Washington||2020-03-13 +USA/WA-UW119/2020|EPI_ISL_416657||USA|Washington||2020-03-11 +USA/WA-S91/2020|EPI_ISL_417144||USA|Washington|Snohomish_County|2020-03-02 +USA/WA-S103/2020|EPI_ISL_417156||USA|Washington|King_County|2020-02-28 +USA/WA-UW345/2020|EPI_ISL_418912||USA|Washington||2020-03-16 +USA/WA-UW49/2020|EPI_ISL_415614||USA|Washington||2020-03-09 +USA/WA-UW300/2020|EPI_ISL_418868||USA|Washington||2020-03-13 +USA/WA-UW121/2020|EPI_ISL_416659||USA|Washington||2020-03-11 +USA/WA-UW105/2020|EPI_ISL_416643||USA|Washington||2020-03-11 +USA/WA-S72/2020|EPI_ISL_417125||USA|Washington|King_County|2020-03-06 +USA/WA-UW124/2020|EPI_ISL_416662||USA|Washington||2020-03-12 +USA/VA-DCLS-0017/2020|EPI_ISL_419711||Virginia|||2020-03-11 +USA/WA-UW125/2020|EPI_ISL_416663||USA|Washington||2020-03-12 +USA/WA-S69/2020|EPI_ISL_417122||USA|Washington|King_County|2020-03-05 +USA/WA-UW24/2020|EPI_ISL_414593||USA|Washington|Kirkland|2020-03-05 +USA/WA-UW77/2020|EPI_ISL_416433||USA|Washington||2020-03-10 +USA/WA-NH24/2020|EPI_ISL_418791||USA|Washington||2020-03-13 +USA/WA-S5/2020|EPI_ISL_416460||USA|Washington|King_County|2020-02-29 +USA/WA-S100/2020|EPI_ISL_417153||USA|Washington|King_County|2020-02-29 +USA/WA-UW92/2020|EPI_ISL_416448||USA|Washington||2020-03-11 +USA/WA-UW152/2020|EPI_ISL_416690||USA|Washington||2020-03-13 +USA/WA-UW209/2020|EPI_ISL_417358||USA|Washington||2020-03-13 +USA/WA-S105/2020|EPI_ISL_417158||USA|Washington|King_County|2020-02-28 +USA/WA-S44/2020|EPI_ISL_417097||USA|Washington||2020-02-28 +USA/WA-S101/2020|EPI_ISL_417154||USA|Washington|King_County|2020-02-28 +USA/WA-UW67/2020|EPI_ISL_415595||USA|Washington||2020-03-09 +USA/WA-S109/2020|EPI_ISL_417162||USA|Washington|Snohomish_County|2020-03-01 
+USA/WA-UW299/2020|EPI_ISL_418867||USA|Washington||2020-03-13 +USA/UPHL-03/2020|EPI_ISL_415541||USA|Utah||2020-03-13 +USA/WA-S116/2020|EPI_ISL_417169||USA|Washington||2020-03-02 +USA/WA-UW20/2020|EPI_ISL_414368||USA|Washington||2020-03-05 + +#cluster of mutations +Luxembourg/LNS8746229/2020 +USA/UN-UW-1402/2020 +USA/WA-UW-4118/2020 +USA/UN-UW-1486/2020 +Beijing/IVDC-BJ-005/2020 +Ecuador/HGSQ-USFQ-007/2020 +Ecuador/HGSQ-USFQ-010/2020 +USA/WI-GMF-00384/2020 +USA/WA-UW-4130/2020 +USA/WI-GMF-00928/2020 +USA/WA-UW-1572/2020 +USA/ID-UW-4100/2020 +USA/WA-UW-2105/2020 +Scotland/EDB146/2020 + +#ambiguous B mutation T8783C +Australia/VIC721/2020 +Australia/WA04/2020 +USA/IL1/2020 +USA/WA-S25/2020 +USA/WA-S35/2020 +USA/WA-S56/2020 + +#ambiguous B mutation C28144T +USA/WA-S85/2020 +USA/WI-47/2020 + +#reverted B mutation C28144T +India/763/2020 +India/770/2020 +Japan/DP0690/2020 + +#reverted B.1 mutation A23404G? +Scotland/EDB023/2020 + +#3rd base for B.1 mutation C3038T +Australia/NSW22/2020 + +# possible duplicate entry +EPI_ISL_437437 + +# had ambiguous secondary alignment after mapping +Saudi_Arabia/KAUST-Makkah155/2020 + +# outlier on tree +South_Africa/R05475/2020 + +# date before its time +India/NCDC-3175/2020 +Taiwan/CGMH-CGU-22/2020 +Taiwan/CGMH-CGU-23/2020 +Taiwan/CGMH-CGU-24/2020 +Taiwan/CGMH-CGU-25/2020 + +#lots of mutations +Indonesia/EJ-ITD853Sp/2020 +Taiwan/TSGH-20/2020 +South_Africa/R02606/2020 +France/OCC-15/2020 +USA/WA-UW-4749/2020 +India/NCDC-3985/2020 diff --git a/workflows/resources/publish_cog_global_recipes.json b/workflows/resources/publish_cog_global_recipes.json new file mode 100644 index 0000000..a5a63d2 --- /dev/null +++ b/workflows/resources/publish_cog_global_recipes.json @@ -0,0 +1,105 @@ +{ + "alignments": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned", + "metadata_fields": 
["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_time","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + }, + { + "suffix": "all", + "data": "cog", + "fasta": "aligned" + }, + { + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", 
"lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" + } + ], + "metadata": [ + { + "suffix": "public", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" + }, + { + "suffix": "consortium", + "data": "cog_global", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", "published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", "is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", + "mutations": true + }, + { + "suffix": "geography", + "data": "cog_global", + "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], + "where": "cog_id=central_sample_id" + 
}, + { + "suffix": "mutations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "mutations": true + }, + { + "suffix": "constellations", + "data": "cog_global", + "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], + "constellations": true + }, + { + "suffix": "unlinked", + "data": "cog_global", + "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","usher_lineage", "usher_lineages_version", "is_surveillance", "collection_pillar", "is_pillar_2"], + "mutations": true, + "uk_only": true, + "shuffle": true, + "drop_index": "sequence_name" + }, + { + "data": "cog_global", + "suffix": "epidemiology", + "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", "biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" + } + ], + "public": [ + { + "suffix": "all", + "data": "cog", + "fasta": "unaligned" + }, + { + "data": "cog", + "fasta": "trimmed", + "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], + "mutations": true, + "where": "epi_week=edin_epi_week country=adm0 
adm1=adm1_raw" + }, + { + "suffix": "unmasked", + "data": "cog", + "fasta": "aligned" + } + ], + "civet3": [ + { + "suffix": "private", + "fasta": "cog_global", + "metadata_fields": ["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" + }, + { + "suffix": "mutations", + "data": "cog_global", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/workflows/resources/publish_gisaid_recipes.json b/workflows/resources/publish_gisaid_recipes.json new file mode 100644 index 0000000..69c88da --- /dev/null +++ b/workflows/resources/publish_gisaid_recipes.json @@ -0,0 +1,56 @@ +{ + "gisaid": [ + { + "suffix": "all", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" + }, + { + "suffix": "global", + "fasta": "gisaid", + "metadata_fields": 
["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], + "mutations": true, + "exclude_cog": true, + "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" + }, + { + "suffix": "global_mutations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "mutations": true, + "exclude_cog": true + }, + { + "suffix": "global_constellations", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "constellations": true, + "exclude_cog": true + }, + { + "suffix": "global_updown", + "data": "gisaid", + "metadata_fields": ["sequence_name"], + "updown": true, + "exclude_cog": true + } + ], + "civet3": [ + { + "suffix": "private", + "fasta": "gisaid", + "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], + "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" + }, + { + "suffix": "mutations", + "data": "gisaid", + "updown": true, + "metadata_fields": ["sequence_name", "query"], + "where": "query=sequence_name", + "drop_index": "sequence_name" + } + ] +} diff --git a/workflows/resources/publish_readme.txt b/workflows/resources/publish_readme.txt new file mode 100644 index 0000000..ec36438 --- /dev/null +++ b/workflows/resources/publish_readme.txt @@ 
-0,0 +1,36 @@ +# Summary of published datapipe outputs + +### Alignments + +- `cog__all.fa` : all unaligned sequences after deduplication +- `cog__all_alignment.fa` : all aligned sequences after deduplication +- `cog__all_metadata.csv` : all corresponding metadata +- `cog__alignment.fa` : filtered, trimmed alignment with sequences matching those in the corresponding metadata +- `cog__metadata.csv` : corresponding metadata for filtered, trimmed alignment + +### Cog + +- `cog.insertions.tsv` and `cog.deletions.tsv` containing all found insertions and deletions for the UK sequences +- `UTLA_genome_counts_.csv` containing counts of delta sequences by date and UTLA + +### Metadata + +- `cog_global__geography.csv` : metadata containing the following columns `"central_sample_id","sequence_name","sample_date","epi_week","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location"` +- `cog_global__mutations.csv` : metadata containing the following columns `"sequence_name", "sample_date", "lineage","lineages_version"` and additionally columns for specifically typed mutations of interest +- `cog_global__public.csv` : metadata containing the following columns `"sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version"` +- `cog_global__consortium.csv` : metadata containing all columns as in the public metadata, extended with the following columns `"received_date","collection_date","published_date","sequencing_org_code","submission_org_code","submission_user","root_sample_id","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target"` +- `cog__unlinked.csv` : shuffled 
metadata with no ids containing the following columns `"safe_sample_date","epi_week", "location","lineage","lineages_version","is_surveillance", "collection_pillar", "is_pillar_2"` +- `cog_global__epidemiology.csv` : metadata containing the following columns `"sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date","received_date","sequencing_submission_date","sequencing_org_code","root_sample_id","biosample_source_id","country","adm1","adm2","utla","utla_code","outer_postcode","NUTS1","latitude","longitude","location","source_age","source_sex","collection_pillar","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineage_support","lineages_version","scorpio_call","scorpio_support","ambiguity_count"` + +### Public + +- `cog__all.fa` : all unaligned sequences after deduplication +- `cog__unmasked_alignment.fa` : all aligned sequences +- `cog__alignment.fa` : filtered, trimmed alignment with sequences matching those in the corresponding metadata +- `cog__metadata.csv` : corresponding metadata for filtered, trimmed alignment with the following columns `"sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version"` + +### Civet3 +- `cog_global__private_alignment.fa` : masked, trimmed, filtered alignment of COG and GLOBAL sequences +- `cog_global__private_metadata.csv` : corresponding metadata with the following columns `"sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","scorpio_call"` +- `cog_global__mutations.csv` : metadata file produced by gofasta updown list, providing information about nucleotide mutations and ambiguous regions in aligned sequences + diff --git a/workflows/resources/resequencing_omissions.txt b/workflows/resources/resequencing_omissions.txt new file 
mode 100644 index 0000000..cd52f25 --- /dev/null +++ b/workflows/resources/resequencing_omissions.txt @@ -0,0 +1,15 @@ +# resequencing +England/PHEC-20170/2020 + +England/PHEC-1AFD9/2020 +England/PHEC-1AFF7/2020 +England/PHEC-1B002/2020 +England/PHEC-1B011/2020 +England/PHEC-1B020/2020 +England/PHEC-1B03F/2020 +England/PHEC-1B04E/2020 +England/PHEC-1B05D/2020 +England/PHEC-1B06C/2020 +England/PHEC-1B07B/2020 +England/PHEC-1B08A/2020 +England/PHEC-1B099/2020 From 1575311af1a0d9874f90b6028ffec21214a4823c Mon Sep 17 00:00:00 2001 From: whalleyt Date: Mon, 6 Jan 2025 15:18:07 +0000 Subject: [PATCH 3/4] rm old files --- workflows/LICENSE.txt | 674 --------------- workflows/README.md | 63 -- workflows/bin/add_to_uk_metadata.py | 225 ----- ...otate_with_unmapped_genome_completeness.py | 53 -- workflows/bin/cache_pangolin_report.py | 60 -- workflows/bin/geography_cleaning | 1 - workflows/bin/prepare_for_pangolin.py | 131 --- workflows/bin/publish_from_config.py | 244 ------ workflows/bin/remove_duplicates_by_date.py | 85 -- workflows/bin/summarise_genomes_by_utla.py | 90 -- workflows/bin/type_aas_and_dels.py | 118 --- .../uk_label_sourceid_duplicates_to_omit.py | 72 -- ..._remove_duplicates_COGID_by_proportionN.py | 73 -- workflows/config/base.config | 64 -- workflows/environment.yml | 32 - workflows/environment.yml.old | 32 - workflows/future_improvements | 21 - workflows/modules/align_and_variant_call.nf | 560 ------------ workflows/modules/clean_geography.nf | 288 ------- workflows/modules/deduplicate.nf | 208 ----- workflows/modules/filter_and_trim.nf | 242 ------ workflows/modules/pangolin.nf | 342 -------- workflows/modules/preprocess_cog_uk.nf | 372 -------- workflows/modules/preprocess_gisaid.nf | 96 --- workflows/modules/publish_all.nf | 427 ---------- workflows/modules/start.nf | 33 - workflows/nextflow.config | 32 - workflows/resources/AAs.csv | 10 - workflows/resources/MN908947.fa | 429 ---------- workflows/resources/MN908947.gb | 798 ------------------ 
workflows/resources/README | 30 - workflows/resources/WH04.fa | 2 - workflows/resources/date_corrections.csv | 11 - workflows/resources/dels.csv | 2 - workflows/resources/empty_constellations.csv | 1 - workflows/resources/empty_mutations.csv | 1 - workflows/resources/empty_updown.csv | 1 - workflows/resources/gisaid_omissions.txt | 364 -------- .../resources/publish_cog_global_recipes.json | 105 --- .../resources/publish_gisaid_recipes.json | 56 -- workflows/resources/publish_readme.txt | 36 - .../resources/resequencing_omissions.txt | 15 - 42 files changed, 6499 deletions(-) delete mode 100644 workflows/LICENSE.txt delete mode 100644 workflows/README.md delete mode 100755 workflows/bin/add_to_uk_metadata.py delete mode 100755 workflows/bin/annotate_with_unmapped_genome_completeness.py delete mode 100755 workflows/bin/cache_pangolin_report.py delete mode 160000 workflows/bin/geography_cleaning delete mode 100755 workflows/bin/prepare_for_pangolin.py delete mode 100755 workflows/bin/publish_from_config.py delete mode 100755 workflows/bin/remove_duplicates_by_date.py delete mode 100755 workflows/bin/summarise_genomes_by_utla.py delete mode 100755 workflows/bin/type_aas_and_dels.py delete mode 100755 workflows/bin/uk_label_sourceid_duplicates_to_omit.py delete mode 100755 workflows/bin/uk_remove_duplicates_COGID_by_proportionN.py delete mode 100644 workflows/config/base.config delete mode 100644 workflows/environment.yml delete mode 100644 workflows/environment.yml.old delete mode 100644 workflows/future_improvements delete mode 100644 workflows/modules/align_and_variant_call.nf delete mode 100644 workflows/modules/clean_geography.nf delete mode 100644 workflows/modules/deduplicate.nf delete mode 100644 workflows/modules/filter_and_trim.nf delete mode 100644 workflows/modules/pangolin.nf delete mode 100644 workflows/modules/preprocess_cog_uk.nf delete mode 100644 workflows/modules/preprocess_gisaid.nf delete mode 100644 workflows/modules/publish_all.nf delete mode 
100644 workflows/modules/start.nf delete mode 100644 workflows/nextflow.config delete mode 100644 workflows/resources/AAs.csv delete mode 100644 workflows/resources/MN908947.fa delete mode 100644 workflows/resources/MN908947.gb delete mode 100644 workflows/resources/README delete mode 100644 workflows/resources/WH04.fa delete mode 100644 workflows/resources/date_corrections.csv delete mode 100644 workflows/resources/dels.csv delete mode 100644 workflows/resources/empty_constellations.csv delete mode 100644 workflows/resources/empty_mutations.csv delete mode 100644 workflows/resources/empty_updown.csv delete mode 100644 workflows/resources/gisaid_omissions.txt delete mode 100644 workflows/resources/publish_cog_global_recipes.json delete mode 100644 workflows/resources/publish_gisaid_recipes.json delete mode 100644 workflows/resources/publish_readme.txt delete mode 100644 workflows/resources/resequencing_omissions.txt diff --git a/workflows/LICENSE.txt b/workflows/LICENSE.txt deleted file mode 100644 index f288702..0000000 --- a/workflows/LICENSE.txt +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. 
- - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. 
Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. 
- - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. 
Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. 
- - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. 
- - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. 
If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. 
- - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. 
- - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the 
material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. 
If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. 
- - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/workflows/README.md b/workflows/README.md deleted file mode 100644 index e40ef34..0000000 --- a/workflows/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# Datapipe - -Pipeline to process SARS-CoV-2 sequences and metadata, clean up irregularities, align and variant call then publish matched subsets of FASTA sequences and metadata for groups with different access to sensitive data. - -Runs weekly on global sequences downloaded from GISAID. - -Runs daily on COG-UK sequences, and combines with non-UK GISAID sequences. - -### Install and run - git clone --recurse-submodules https://github.com/COG-UK/grapevine_nextflow.git - cd grapevine_nextflow - conda env create -f environment.yml - conda activate grapevine_nextflow - - NXF_VER=20.10.0 nextflow run workflows/process_cog_uk.nf - -### Pipeline Overview - -#### GISAID processing - -1. Parse GISAID dump (`export.json`) and extract FASTA of sequences and associated metadata. - - - Excludes known problematic sequences listed in `gisaid_omissions.txt` - - - Excludes sequences where `covv_host.lower() != 'human'` - - Excludes sequences where malformed (not `YYYY-MM-DD`) or impossible (earlier than `2019-11-30` or later than today) date in `covv_collection_date` - - Reformat FASTA header - - Add `epi-week` and `epi-day` columns to metadata - -2. Run `pangolin` (https://github.com/cov-lineages/pangolin) on all new sequences. If new release of `pangolin` run on all sequences. -3. Calculate the `unmapped_genome_completeness` as the proportion of sequence length which is unambiguous (not `N`) -4. Deduplicate by date, keeping the earliest example -5. Align to the reference (`Wuhan/WH04/2020`) with `minimap2` -6. Variant call using `gofasta` and type specific mutations of interest listed in `AAs.csv` and `dels.csv` -7. Filter out low quality sequences with mapped completeness < 93%, and trim and pad alignment outside of reference coordinates `265:29674` -8. 
Calculate distance to reference and exclude sequences with distance to more than 4.0 epi-week std devs. - -#### COG-UK processing - -1. Parse matched FASTA and metadata TSV output by Elan/Majora - - - Reformats header and unaligns sequences which have already been aligned to the reference - - - Manual date correction for samples listed in `date_corrections.csv` - - Excludes early sequences which have been resequenced as listed in `resequencing_omissions.txt` - - Adds GISAID accession if recently submitted - - - Excludes sequences where malformed (not `YYYY-MM-DD`) or impossible (earlier than `2019-11-30` or later than today) date in `covv_collection_date` - - Add `epi-week` and `epi-day`, `source_id` and `pillar_2` columns to metadata - -2. Run `pangolin` (https://github.com/cov-lineages/pangolin) on all new sequences. If new release of `pangolin` run on all sequences. -3. Calculate the `unmapped_genome_completeness` as the proportion of sequence length which is unambiguous (not `N`) -4. Deduplicate COG-ID by completeness and label samples with duplicate `source_id` -5. Align to the reference (`Wuhan/WH04/2020`) with `minimap2` -6. Variant call using `gofasta` and type specific mutations of interest listed in `AAs.csv` and `dels.csv` -7. Filter out low quality sequences with mapped completeness < 93%, and trim and pad alignment outside of reference coordinates `265:29674` -8. Clean up geographical metadata (https://github.com/COG-UK/geography_cleaning) -9. Combine COG-UK sequences and metadata with non-UK GISAID sequences and metadata -10. Publish subsets of the data as described in `publish_cog_global_recipes.json` - -### What is grapevine? - -`grapevine` (https://github.com/COG-UK/grapevine) was the name of the original pipeline which did all of the above, made phylogenetic trees and more. As the number of sequences has grown the tree building steps take increasingly long to complete. 
As the majority of users only interact with the alignments and cleaned metadata, it was decided that a robust implementation of the alignment and metadata processing steps run daily would be more useful and that is what is provided here. diff --git a/workflows/bin/add_to_uk_metadata.py b/workflows/bin/add_to_uk_metadata.py deleted file mode 100755 index e98f7a4..0000000 --- a/workflows/bin/add_to_uk_metadata.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -import csv -from itertools import chain -from epiweeks import Week,Year -from datetime import datetime - -adm1a_to_country = {"UK-SCT": "Scotland", - "UK-WLS": "Wales", - "UK-ENG": "England", - "UK-NIR": "Northern_Ireland", - "FK": "Falkland_Islands", - "GI": "Gibraltar", - "JE": "Jersey", - "IM": "Isle_of_Man", - "GG": "Guernsey" - } - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - parser.add_argument('--accession-file', dest = 'accession_file', required=False, help='TSV of accession') - parser.add_argument('--updated-date-file', dest = 'updated_date_file', required=False, help='CSV of date corrections') - parser.add_argument('--log-file', dest = 'log_file', required=False, help='Log file') - - args = parser.parse_args() - - return args - -def load_updated_dates(updated_date_file): - date_dict = {} - if updated_date_file: - with open(updated_date_file, 'r', newline = '') as dates_in: - reader = csv.DictReader(dates_in, delimiter=",", quotechar='\"', dialect = "unix") - for row in reader: - date_dict[row["central_sample_id"]] = row["sample_date"] - return date_dict - -def add_sample_date(row, date_dict): - if row["central_sample_id"] in 
date_dict: - row["sample_date"] = date_dict[row["central_sample_id"]] - return - try: - date = datetime.strptime(row["collection_date"], '%Y-%m-%d').date() - row["sample_date"] = row["collection_date"] - except: - try: - date = datetime.strptime(row["received_date"], '%Y-%m-%d').date() - row["sample_date"] = row["received_date"] - except: - row["sample_date"] = "" - -def add_source_id(row): - row["source_id"] = row["biosample_source_id"] - if row["root_biosample_source_id"] not in [None,""]: - row["source_id"] = row["root_biosample_source_id"] - if len(row["source_id"]) < 3: - row["source_id"] = None - -def add_pillar_2(row): - if row['collection_pillar'] in [2,"2"] or row['central_sample_id'][0:4] in ["ALDP", "CAMC", "MILK", "QEUH","RAND"]: - row["is_pillar_2"] = "Y" - else: - row["is_pillar_2"] = "N" - -def add_sequence_name(row): - country = adm1a_to_country[row['adm1']] - id = row['central_sample_id'] - year = str(row['sample_date']).split("-")[0] - name = country + "/" + id + "/" + year - - row["sequence_name"] = name - -def load_accession(accession_file, log_handle): - if not accession_file: - return {} - - accession_dict = {} - - with open(str(accession_file), 'r', newline = '') as acc_in: - reader = csv.DictReader(acc_in, delimiter="\t", quotechar='\"', dialect = "unix") - for row in reader: - central_sample_id = row["central_sample_id"] - run_name = row["run_name"] - gisaid_accession = row["gisaid.accession"] - - if central_sample_id in accession_dict: - if run_name in accession_dict[central_sample_id]: - log_handle.write(f'duplicate central_sample_id * run_name in accession list: {central_sample_id} {run_name}\n') - continue - accession_dict[central_sample_id][run_name] = gisaid_accession - else: - accession_dict[central_sample_id] = {run_name: gisaid_accession} - return accession_dict - -def add_covv_accession_id(row, accession_dict): - acc = "" - if row["central_sample_id"] in accession_dict: - if row["run_name"] in 
accession_dict[row["central_sample_id"]]: - acc = accession_dict[row["central_sample_id"]][row["run_name"]] - - row["covv_accession_id"] = acc - -def date_string_to_epi_week(date_string): - """ - parse a date string in YYYY-MM-DD format and return - cumulative epi week which is cumulative total epidemiological - weeks since 2019-12-22. Week beginning 2019-12-22 is week 0 - """ - try: - date = datetime.strptime(date_string, '%Y-%m-%d').date() - except: - return "" - # this is epi-week: - week = Week.fromdate(date) - if week.year < 2019 or (week.year == 2019 and week.week < 52): - return "" - elif week.year == 2019: - return("0") - else: - cum_epi_week = week.week + len(list(chain(*[[x for x in Year(y).iterweeks()] for y in range(2020, week.year)]))) - return str(cum_epi_week) - -def date_string_to_epi_day(date_string): - """ - parse a date string in YYYY-MM-DD format and return - cumulative epi day which is cumulative total days since 2019-12-22 - """ - try: - date = datetime.strptime(date_string, '%Y-%m-%d').date() - except: - return "" - # this is epi-week week: - week = Week.fromdate(date) - # this is day 1 of epi-week 0: - day_one = datetime.strptime("2019-12-22", '%Y-%m-%d').date() - if week.year < 2019 or (week.year == 2019 and week.week < 52): - return "" - else: - cum_epi_day = (date - day_one).days + 1 - return str(cum_epi_day) - -def date_string_to_safe_date_string(date_string): - """ - parse a date string in YYYY-MM-DD format and return - date corresponding to the start of the epi-week in which it falls. 
- Week beginning 2019-12-22 is week 0 - """ - try: - date = datetime.strptime(date_string, '%Y-%m-%d').date() - except: - return "" - # this is epi-week: - week = Week.fromdate(date) - - if week.year < 2019 or (week.year == 2019 and week.week < 52): - return "" - else: - return week.startdate().strftime('%Y-%m-%d') - -def add_epi_week_and_day(row): - date_str = row["sample_date"] - epi_week = date_string_to_epi_week(date_str) - epi_day = date_string_to_epi_day(date_str) - safe_date = date_string_to_safe_date_string(date_str) - - row["edin_epi_week"] = epi_week - row["edin_epi_day"] = epi_day - row["safe_sample_date"] = safe_date - -def United_Kingdom_to_UK(row): - row["adm0"] = row["adm0"].replace("United Kingdom", "UK") - -def add_uk_columns(row): - row["is_cog_uk"] = "Y" - country = adm1a_to_country[row['adm1']] - if country in ['England', 'Scotland', 'Wales', 'Northern_Ireland']: - row["is_uk"] = "Y" - else: - row["is_uk"] = "N" - -def main(): - args = parse_args() - if args.log_file: - log_handle = open(args.log_file, 'w') - else: - log_handle = sys.stdout - - date_dict = load_updated_dates(args.updated_date_file) - accession_dict = load_accession(args.accession_file, log_handle) - new_columns = ["sample_date", "source_id", "is_pillar_2", "sequence_name", "covv_accession_id", "edin_epi_week", "edin_epi_day", "safe_sample_date", "is_uk", "is_cog_uk", "why_excluded"] - - with open(args.in_metadata, 'r', newline = '') as csv_in, \ - open(args.out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter="\t", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + new_columns, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - try: - add_sample_date(row, date_dict) - add_source_id(row) - add_pillar_2(row) - add_sequence_name(row) - add_covv_accession_id(row, accession_dict) - add_epi_week_and_day(row) - 
United_Kingdom_to_UK(row) - row["why_excluded"] = "" - add_uk_columns(row) - writer.writerow(row) - except: - log_handle.write(f"Error updating metadata for row") - log_handle.write(str(row)) - sys.exit("Could not update metadata for row, check metadata fields") - - - log_handle.close() - -if __name__ == '__main__': - main() diff --git a/workflows/bin/annotate_with_unmapped_genome_completeness.py b/workflows/bin/annotate_with_unmapped_genome_completeness.py deleted file mode 100755 index 3ebeac8..0000000 --- a/workflows/bin/annotate_with_unmapped_genome_completeness.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from Bio import SeqIO -import csv - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='FASTA') - - args = parser.parse_args() - - return args - -def run(in_fasta, in_metadata, out_metadata): - alignment = SeqIO.index(in_fasta, "fasta") - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ["unmapped_genome_completeness"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - id_key = "fasta_header" - if "edin_header" in reader.fieldnames: - id_key = "edin_header" - - for row in reader: - id = row[id_key] - if id in alignment: - seq = str(alignment[id].seq) - if len(seq) == 0: - print(id) - row["unmapped_genome_completeness"] = 0.0 - else: - completeness = 
float(len(seq.replace("N", "")) / len(seq)) - row["unmapped_genome_completeness"] = completeness - writer.writerow(row) - else: - row["unmapped_genome_completeness"] = 0.0 - writer.writerow(row) - -def main(): - args = parse_args() - run(args.in_fasta, args.in_metadata, args.out_metadata) - -if __name__ == '__main__': - main() diff --git a/workflows/bin/cache_pangolin_report.py b/workflows/bin/cache_pangolin_report.py deleted file mode 100755 index a356314..0000000 --- a/workflows/bin/cache_pangolin_report.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import csv -from Bio import SeqIO -import hashlib - - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='Lineage report from pangolin') - parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='Unaligned fasta') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='Hashed lineage report from pangolin') - - args = parser.parse_args() - - return args - -def get_hash_string(record): - seq = str(record.seq).upper().encode() - hash_object = hashlib.md5(seq) - hash_string = hash_object.hexdigest() - return hash_string - -def cache_report(in_fasta, in_metadata, out_metadata): - hashed_seqs = set() - records = SeqIO.index(in_fasta, "fasta") - index_column = "taxon" - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - fieldnames = reader.fieldnames[:] - fieldnames.remove(index_column) - print(fieldnames) - writer = csv.DictWriter(csv_out, fieldnames = ["hash"] + fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - 
print(row) - if row[index_column] not in records: - continue - record = records[row[index_column]] - hash = get_hash_string(record) - if hash not in hashed_seqs: - del row[index_column] - row["hash"] = hash - hashed_seqs.add(hash) - writer.writerow(row) - - -def main(): - args = parse_args() - cache_report(args.in_fasta, args.in_metadata, args.out_metadata) - - -if __name__ == '__main__': - main() diff --git a/workflows/bin/geography_cleaning b/workflows/bin/geography_cleaning deleted file mode 160000 index 416df2f..0000000 --- a/workflows/bin/geography_cleaning +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 416df2f4cb1561de7f16483d17bd2990ef148ec0 diff --git a/workflows/bin/prepare_for_pangolin.py b/workflows/bin/prepare_for_pangolin.py deleted file mode 100755 index 2ffec46..0000000 --- a/workflows/bin/prepare_for_pangolin.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -from Bio import SeqIO -import csv -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description="""Split in fasta and metadata into lineageless for pangolin and those with a lineage""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-fasta', dest = 'in_fasta', required=False, default=None, help='Aligned FASTA') - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='CSV of metadata') - parser.add_argument('--previous-metadata', dest = 'previous_metadata', required=True, help='CSV of from previous run') - parser.add_argument('--out-fasta', dest = 'out_fasta', required=False, default=None, help='FASTA to write out') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV of metadata') - - args = parser.parse_args() - return args - -def prepare_for_pangolin(in_fasta, in_metadata, previous_metadata, out_fasta, out_metadata): - print(in_fasta, in_metadata, previous_metadata, out_fasta, out_metadata) - if in_fasta: - alignment = SeqIO.index(in_fasta, "fasta") - else: - alignment = 
None - - taxon = "taxon" - keys = {"lineage": "lineage", - "lineages_version": "version", - "lineage_conflict": "conflict", - "lineage_ambiguity_score": "ambiguity_score", - "pangolin_version": "pangolin_version", - "pangoLEARN_version": "pangoLEARN_version", - "scorpio_call":"scorpio_call", - "scorpio_support":"scorpio_support", - "scorpio_conflict":"scorpio_conflict", - "usher_lineage":"usher_lineage", - "usher_lineages_version": "usher_lineages_version"} - lineage_dict = {} - - with open(previous_metadata, 'r', newline = '') as lineages_in: - reader = csv.DictReader(lineages_in, delimiter=",", quotechar='\"', dialect = "unix") - - if "fasta_header" in reader.fieldnames: - taxon = "fasta_header" - elif "edin_header" in reader.fieldnames: - taxon = "edin_header" - elif "sequence_name" in reader.fieldnames: - taxon = "sequence_name" - - if "lineages_version" in reader.fieldnames: - keys["lineages_version"] = "lineages_version" - elif "version" in reader.fieldnames: - keys["lineages_version"] = "version" - elif "pangoLEARN_version" in reader.fieldnames: - keys["lineages_version"] = "pangoLEARN_version" - - if "lineage_conflict" in reader.fieldnames: - keys["lineage_conflict"] = "lineage_conflict" - if "lineage_ambiguity_score" in reader.fieldnames: - keys["lineage_ambiguity_score"] = "lineage_ambiguity_score" - - for row in reader: - if row[taxon] in lineage_dict: - print("%s occurs more than once in lineages input file" % row[taxon]) - continue - lineage_dict[row[taxon]] = {} - for key in keys: - value = keys[key] - if value in row: - lineage_dict[row[taxon]][key] = row[value] - - - if out_fasta: - fasta_out = open(out_fasta, 'w') - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - fieldnames = reader.fieldnames - print(fieldnames, len(fieldnames)) - if len(fieldnames) <= 1: - csv_in.close() - csv_in = open(in_metadata, 
'r', newline = '') - reader = csv.DictReader(csv_in, delimiter="\t", quotechar='\"', dialect = "unix") - fieldnames = reader.fieldnames - fieldnames.extend([key for key in keys if key not in fieldnames]) - writer = csv.DictWriter(csv_out, fieldnames = fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - taxon = "taxon" - if "fasta_header" in reader.fieldnames: - taxon = "fasta_header" - elif "edin_header" in reader.fieldnames: - taxon = "edin_header" - elif "sequence_name" in reader.fieldnames: - taxon = "sequence_name" - print(taxon) - print(reader.fieldnames) - - missing_lineage = 0 - - for row in reader: - for key in keys: - if key not in row: - row[key] = None - - fasta_header = row[taxon] - - if fasta_header in lineage_dict: - row.update(lineage_dict[fasta_header]) - elif alignment and fasta_out and fasta_header in alignment: - seqrec = alignment[fasta_header] - fasta_out.write(">" + seqrec.id + "\n") - fasta_out.write(str(seqrec.seq) + "\n") - if not row["lineage"]: - missing_lineage += 1 - writer.writerow(row) - - if out_fasta: - fasta_out.close() - - with open("pango.log", "w") as f: - f.write("Number of sequences missing lineage assignments: %i" %missing_lineage) - -def main(): - args = parse_args() - print(args) - prepare_for_pangolin(args.in_fasta, args.in_metadata, args.previous_metadata, args.out_fasta, args.out_metadata) - -if __name__ == '__main__': - main() diff --git a/workflows/bin/publish_from_config.py b/workflows/bin/publish_from_config.py deleted file mode 100755 index 957fdc4..0000000 --- a/workflows/bin/publish_from_config.py +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import subprocess -import os -import sys -import glob - -class Error (Exception): pass - -def parse_args(): - parser = argparse.ArgumentParser(description="""Create published files from config file""", - formatter_class=argparse.RawTextHelpFormatter) - 
parser.add_argument('--unaligned_fasta', dest = 'unaligned_fasta', required=False, help='Raw FASTA') - parser.add_argument('--aligned_fasta', dest = 'aligned_fasta', required=False, help='Aligned, masked, untrimmed FASTA') - parser.add_argument('--trimmed_fasta', dest = 'trimmed_fasta', required=False, help='Aligned, masked, trimmed and filtered FASTA') - parser.add_argument('--gisaid_fasta', dest = 'global_fasta', required=False, help='GISAID aligned FASTA') - parser.add_argument('--cog_global_fasta', dest = 'cog_global_fasta', required=False, help='COG GISAID aligned FASTA') - - parser.add_argument('--cog_metadata', dest = 'cog_metadata', required=False, help='MASSIVE CSV') - parser.add_argument('--gisaid_metadata', dest = 'global_metadata', required=False, help='MASSIVE CSV') - parser.add_argument('--cog_global_metadata', dest = 'cog_global_metadata', required=False, help='MASSIVE CSV') - - parser.add_argument('--mutations', dest = 'mutations', required=False, help='Mutations CSV') - parser.add_argument('--constellations', dest = 'constellations', required=False, help='Constellations CSV') - parser.add_argument('--updown', dest = 'updown', required=False, help='Updown output CSV') - - parser.add_argument('--recipes', dest = 'recipes', required=True, help='JSON of recipes') - parser.add_argument('--date', dest = 'date', required=True, help='Datestamp for published files') - - args = parser.parse_args() - return args - -#"data": "cog", "gisaid" or "cog_global" -#"fasta": "unaligned", "aligned", "trimmed", "cog_global" or "gisaid" -#"metadata_fields": [] -#"mutations": True or False to add columns from mutations -#"constellations": True or False to add columns from constellations -#"updown": True or False to add columns from updown -#"shuffle": True to shuffle rows of metadata -#"where": free text to be passed to fastafunk fetch --where-column -#"suffix": something to append to file names -#"exclude_uk": True or False to exclude samples from UK -#"uk_only": True or 
False to include only samples from UK from cog_global metadata -#"drop_index": name of index column that should be dropped at the end - -def get_info_from_config(config_dict, outdir, date, fasta_dict, csv_dict, mutations_file, constellations_file, updown_file): - info_dict = {"suffix":None, "data":None, "fasta":None, "metadata_fields":None, - "where": None, "mutations":False, "constellations":False, "updown":False, - "shuffle":False, "drop_index": None, - "exclude_uk":False, "uk_only": False, "exclude_cog":False, "cog_only": False, - "date": date, - "in_fa":None, "in_csv":None, "in_muts":None, "in_con":None, "in_up": None, - "out_fa":"tmp.fa", "intermediate_csv":"tmp.csv", "out_csv":"tmp.csv"} - info_dict.update(config_dict) - - if info_dict["fasta"] in fasta_dict.keys(): - info_dict["in_fa"] = fasta_dict[info_dict["fasta"]] - elif info_dict["data"] == "cog_global": - info_dict["in_fa"] = fasta_dict["cog_global"] - elif info_dict["data"] == "gisaid": - info_dict["in_fa"] = fasta_dict["gisaid"] - elif info_dict["data"] == "cog": - info_dict["in_fa"] = fasta_dict["trimmed"] - else: - sys.exit("Config entries need to specify either fasta in ['unaligned', 'aligned', 'trimmed', 'cog_global', 'gisaid'] or data \ - in ['cog', 'cog_global', 'gisaid']") - - if info_dict["data"] is None: - if info_dict["fasta"] == "cog_global": - info_dict["data"] = "cog_global" - elif info_dict["fasta"] == "gisaid": - info_dict["data"] = "gisaid" - else: - info_dict["data"] = "cog" - - if info_dict["data"] == "cog_global": - info_dict["in_csv"] = csv_dict["cog_global"] - elif info_dict["data"] == "cog": - info_dict["in_csv"] = csv_dict["cog"] - elif info_dict["data"] == "gisaid": - info_dict["in_csv"] = csv_dict["gisaid"] - - info_dict["in_muts"] = mutations_file - info_dict["in_con"] = constellations_file - info_dict["in_up"] = updown_file - - start = "%s/%s_%s" %(outdir, info_dict["data"], info_dict["date"]) - if info_dict["suffix"]: - start += "_%s" %info_dict["suffix"] - csv_end = 
".csv" - - if info_dict["fasta"]: - csv_end = "_metadata.csv" - if info_dict["fasta"]=="aligned" or (info_dict["metadata_fields"] and info_dict["fasta"]!="unaligned"): - info_dict["out_fa"] = "%s_alignment.fa" %start - else: - info_dict["out_fa"] = "%s.fa" %start - - info_dict["out_csv"] = "%s%s" %(start, csv_end) - - if info_dict["out_fa"] != "tmp.fa" and info_dict["in_fa"] is None: - sys.exit("Please provide the appropriate FASTA file") - if info_dict["metadata_fields"] is not None and info_dict["in_csv"] is None: - sys.exit("Please provide the appropriate CSV file") - if info_dict["mutations"] is not None and info_dict["in_muts"] is None: - sys.exit("Please provide the appropriate mutations file") - if info_dict["constellations"] is not None and info_dict["in_con"] is None: - sys.exit("Please provide the appropriate constellations file") - if info_dict["updown"] is not None and info_dict["in_up"] is None: - sys.exit("Please provide the appropriate updown file") - - print(info_dict) - return info_dict - -def syscall(cmd_list, allow_fail=False): - if None in cmd_list: - print('None in list', cmd_list, file=sys.stderr) - raise Error('Error in command. Cannot continue') - command = ' '.join(cmd_list) - print(command) - completed_process = subprocess.run(command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True) - if (not allow_fail) and completed_process.returncode != 0: - print('Error running this command:', command, file=sys.stderr) - print('Return code:', completed_process.returncode, file=sys.stderr) - print('\nOutput from stdout:', completed_process.stdout, sep='\n', file=sys.stderr) - print('\nOutput from stderr:', completed_process.stderr, sep='\n', file=sys.stderr) - raise Error('Error in system call. 
Cannot continue') - print(completed_process.stdout) - return completed_process - -def publish_file(outdir, info_dict): - if info_dict["metadata_fields"] is None: - cmd_list = ["cp", info_dict["in_fa"], info_dict["out_fa"]] - syscall(cmd_list) - return - - if info_dict["exclude_uk"]: - cmd_list = ["fastafunk filter_column --in-metadata", info_dict["in_csv"], - "--out-metadata tmp.no_uk.csv --column is_uk --is_true"] - syscall(cmd_list) - info_dict["in_csv"] = "tmp.no_uk.csv" - - if info_dict["exclude_cog"]: - cmd_list = ["fastafunk filter_column --in-metadata", info_dict["in_csv"], - "--out-metadata tmp.no_cog.csv --column is_cog_uk --is_true"] - syscall(cmd_list) - info_dict["in_csv"] = "tmp.no_cog.csv" - - if info_dict["uk_only"]: - cmd_list = ["fastafunk filter_column --in-metadata", info_dict["in_csv"], - "--out-metadata tmp.uk_only.csv --column is_uk --is_false"] - syscall(cmd_list) - info_dict["in_csv"] = "tmp.uk_only.csv" - - if info_dict["cog_only"]: - cmd_list = ["fastafunk filter_column --in-metadata", info_dict["in_csv"], - "--out-metadata tmp.cog_only.csv --column is_cog_uk --is_false"] - syscall(cmd_list) - info_dict["in_csv"] = "tmp.cog_only.csv" - - if info_dict["shuffle"]: - cmd_list = ["fastafunk shuffle --in-metadata", info_dict["in_csv"], "--out-metadata", "tmp.shuffled.csv"] - syscall(cmd_list) - info_dict["in_csv"] = "tmp.shuffled.csv" - - cmd_list = ["fastafunk fetch --in-fasta", info_dict["in_fa"], "--in-metadata", info_dict["in_csv"], - "--index-column sequence_name --out-fasta", info_dict["out_fa"], - "--out-metadata", info_dict["intermediate_csv"], "--restrict --low-memory"] - - if info_dict["metadata_fields"]: - if "why_excluded" in info_dict["metadata_fields"]: - cmd_list.append("--keep-omit-rows") - cmd_list.append("--filter-column") - cmd_list.extend(info_dict["metadata_fields"]) - - if info_dict["where"]: - cmd_list.append("--where-column %s" %info_dict["where"]) - syscall(cmd_list) - - if info_dict["mutations"]: - cmd_list = 
["fastafunk add_columns --in-metadata", info_dict["intermediate_csv"], - "--in-data", info_dict["in_muts"], "--index-column sequence_name", - "--join-on sequence_name --out-metadata tmp.muts.csv"] - info_dict["intermediate_csv"] = "tmp.muts.csv" - syscall(cmd_list) - - if info_dict["constellations"]: - cmd_list = ["fastafunk add_columns --in-metadata", info_dict["intermediate_csv"], - "--in-data", info_dict["in_con"], "--index-column sequence_name", - "--join-on sequence_name --out-metadata tmp.constellations.csv"] - info_dict["intermediate_csv"] = "tmp.constellations.csv" - syscall(cmd_list) - - if info_dict["updown"]: - cmd_list = ["fastafunk add_columns --in-metadata", info_dict["intermediate_csv"], - "--in-data", info_dict["in_up"], "--index-column sequence_name", - "--join-on query --out-metadata tmp.updown.csv"] - info_dict["intermediate_csv"] = "tmp.updown.csv" - syscall(cmd_list) - - if info_dict["drop_index"]: - cmd_list = ["fastafunk drop_columns --in-metadata", info_dict["intermediate_csv"], - "--columns", info_dict["drop_index"], - "--out-metadata tmp.anon.csv"] - info_dict["intermediate_csv"] = "tmp.anon.csv" - syscall(cmd_list) - - - cmd_list = ["mv", info_dict["intermediate_csv"], info_dict["out_csv"]] - syscall(cmd_list) - - #tmp = glob.glob("tmp.*") - #if len(tmp) > 0: - # cmd_list = ["rm tmp.*"] - # syscall(cmd_list) - -def main(): - args = parse_args() - print(args) - fasta_dict = {"unaligned":args.unaligned_fasta, "aligned":args.aligned_fasta, "trimmed":args.trimmed_fasta, "cog_global": args.cog_global_fasta, "gisaid": args.global_fasta} - print(fasta_dict) - csv_dict = {"cog":args.cog_metadata, "cog_global":args.cog_global_metadata, "gisaid": args.global_metadata} - print(csv_dict) - mutations_file = args.mutations - print(mutations_file) - constellations_file = args.constellations - print(constellations_file) - updown_file = args.updown - print(updown_file) - - recipes = {} - with open(args.recipes, 'r') as f: - recipes = json.load(f) - - for 
outdir in recipes.keys(): - os.makedirs(outdir,exist_ok=True) - for recipe in recipes[outdir]: - info_dict = get_info_from_config(recipe, outdir, args.date, fasta_dict, csv_dict, mutations_file, constellations_file, updown_file) - publish_file(outdir, info_dict) - -if __name__ == '__main__': - main() diff --git a/workflows/bin/remove_duplicates_by_date.py b/workflows/bin/remove_duplicates_by_date.py deleted file mode 100755 index 30786e0..0000000 --- a/workflows/bin/remove_duplicates_by_date.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from Bio import SeqIO -import csv - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - parser.add_argument('--out-fasta', dest = 'out_fasta', required=True, help='FASTA to write out') - parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='FASTA') - - args = parser.parse_args() - - return args - -def run(in_fasta, in_metadata, out_fasta, out_metadata): - dup_dict = {} - tokeep = set() - - with open(in_metadata, 'r', newline = '') as csv_in: - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - - for row in reader: - if row["why_excluded"]: - continue - - fasta_header = row["edin_header"] - id = row["sequence_name"] - epi_day = int(row["edin_epi_day"]) - completeness = float(row["unmapped_genome_completeness"]) - - if id in ["None", "", None]: - tokeep.add(fasta_header) - continue - - if id in dup_dict: - if epi_day < dup_dict[id][0]["epi_day"]: - dup_dict[id].insert(0, {"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}) - else: - dup_dict[id].append({"fasta_header": fasta_header, 
"epi_day": epi_day, "completeness":completeness}) - else: - dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}] - - with open("deduplicated.log", "w") as log: - for k,v in dup_dict.items(): - tokeep.add(v[0]["fasta_header"]) - if len(v) > 1: - for dup in v[1:]: - log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\n" \ - %(k, v[0]["fasta_header"], v[0]["epi_day"], v[0]["completeness"], dup["fasta_header"], \ - dup["epi_day"], dup["completeness"])) - - alignment = SeqIO.index(in_fasta, "fasta") - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out, \ - open(out_fasta, 'w') as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - fasta_header = row["edin_header"] - - if fasta_header in tokeep: - writer.writerow(row) - seqrec = alignment[fasta_header] - fasta_out.write(">" + seqrec.id + "\n") - fasta_out.write(str(seqrec.seq) + "\n") - else: - if not row["why_excluded"]: - row["why_excluded"] = "duplicate sequence_name" - writer.writerow(row) - -def main(): - args = parse_args() - run(args.in_fasta, args.in_metadata, args.out_fasta, args.out_metadata) - -if __name__ == '__main__': - main() diff --git a/workflows/bin/summarise_genomes_by_utla.py b/workflows/bin/summarise_genomes_by_utla.py deleted file mode 100755 index 86b7e24..0000000 --- a/workflows/bin/summarise_genomes_by_utla.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -import csv -from collections import defaultdict -from collections import Counter -import datetime as dt - -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--metadata", action="store") -parser.add_argument("--date", 
action='store') -args = parser.parse_args() - -metadata = args.metadata -file_date = args.date - -def main(metadata, file_date): - - utla_to_region = {} - utla_to_code = {} - - with open(metadata) as f: - data = csv.DictReader(f) - for l in data: - if l['utla'] != "" and "|" not in l['utla']: - utla_to_region[l['utla']] = l['NUTS1'] - utla_to_code[l['utla']] = l['utla_code'] - - utla_delta = defaultdict(list) - utla_other = defaultdict(list) - utla_all = defaultdict(list) - with open(metadata) as f: - data = csv.DictReader(f) - for l in data: - if l['sample_date'] != "": - date = dt.datetime.strptime(l['sample_date'],"%Y-%m-%d").date() - if l['utla'] != "" and "|" not in l['utla']: - if l['scorpio_call'] == "Delta (B.1.617.2-like)": - utla_delta[date].append(l['utla']) - else: - utla_other[date].append(l['utla']) - - utla_all[date].append(l['utla']) - - delta_counts = {} - other_counts = {} - all_counts = {} - - for k,v in utla_delta.items(): - delta_counts[k] = Counter(v) - - for k,v in utla_other.items(): - other_counts[k] = Counter(v) - - for k,v in utla_all.items(): - all_counts[k] = Counter(v) - - fieldnames = ["date", "utla", "utla_code", "NUTS1", "delta_count", "other_count", "total_count"] - with open(f"UTLA_genome_counts_{file_date}.csv", 'w') as fw: - writer = csv.DictWriter(fw, fieldnames=fieldnames) - writer.writeheader() - for date, utla_dict in sorted(all_counts.items()): - for utla, count in utla_dict.items(): - write_dict = {} - write_dict["date"] = date - write_dict["utla"] = utla - write_dict["utla_code"] = utla_to_code[utla] - write_dict["NUTS1"] = utla_to_region[utla] - write_dict["total_count"] = count - if date in delta_counts: - if utla in delta_counts[date]: - write_dict["delta_count"] = delta_counts[date][utla] - else: - write_dict["delta_count"] = 0 - else: - write_dict["delta_count"] = 0 - - if date in other_counts: - if utla in other_counts[date]: - write_dict["other_count"] = other_counts[date][utla] - else: - write_dict["other_count"] 
= 0 - else: - write_dict["other_count"] = 0 - - writer.writerow(write_dict) - - -if __name__ == '__main__': - main(metadata, file_date) \ No newline at end of file diff --git a/workflows/bin/type_aas_and_dels.py b/workflows/bin/type_aas_and_dels.py deleted file mode 100755 index 10a218f..0000000 --- a/workflows/bin/type_aas_and_dels.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from Bio import SeqIO -import csv - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add columns to metadata for specific AAs and dels""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='Aligned FASTA') - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='CSV of metadata to add to') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - parser.add_argument('--reference-fasta', dest = 'reference_fasta', required=True, help='Reference FASTA') - parser.add_argument('--aas', dest = 'aas', required=False, help='CSV of AAs') - parser.add_argument('--dels', dest = 'dels', required=False, help='CSV of deletions') - parser.add_argument('--index-column', dest = 'index_column', required=False, default='sequence_name') - - args = parser.parse_args() - return args - - -def parse_AA_file(file): - """ - input is in the format: - start (1-based) - e.g.: - D614G,1605 - - ls is a list of length-2 tuples with the format (name, position) - position is the 1-based starting position of the codon in Wuhan-Hu-1 coordinates - It has the same number of entries as lines in file - """ - ls = [] - if not file: - return ls - - with open(file, 'r') as f: - for line in f: - l = line.rstrip().split(",") - name, pos = l - ls = ls + [(name, int(pos))] - return(ls) - -def parse_del_file(file, ref_fasta): - """ - input is in the format: - start (1-based), length of deletion - e.g.: - 1605,3 - - ls is a list of 
length-3 tuples with the format (position, length, ref_allele) - It has the same number of entries as lines in file - """ - ls = [] - if not file: - return ls - WuhanHu1 = SeqIO.read(ref_fasta, 'fasta') - - with open(file, 'r') as f: - for line in f: - l = line.rstrip().split(',') - pos, length = l - ref_allele = str(WuhanHu1.seq).upper()[int(pos) - 1: int(pos) - 1 + int(length)] - ls = ls + [(int(pos), int(length), ref_allele)] - - return(ls) - -def type_aas_and_dels(in_fasta, in_aa_file, in_del_file, reference_fasta, in_metadata, out_metadata, index_column): - alignment = SeqIO.index(in_fasta, "fasta") - AAs = parse_AA_file(in_aa_file) - dels = parse_del_file(in_del_file, reference_fasta) - - new_aa_columns = [x[0] for x in AAs] - new_del_columns = ["del_" + str(x[0]) + "_" + str(x[1]) for x in dels] - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + new_aa_columns + new_del_columns, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - id = row[index_column] - seq = alignment[id].seq - - for entry in AAs: - pos = entry[1] - try: - QUERY_allele = seq[pos - 1: pos + 2].translate() - except: - QUERY_allele = 'X' - row[entry[0]] = QUERY_allele - - for entry in dels: - pos = entry[0] - length = entry[1] - ref_allele = entry[2] - column_name = "del_" + str(pos) + "_" + str(length) - - if seq[pos - 1: pos - 1 + length] == '-' * length: - genotype = 'del' - elif seq[pos - 1: pos - 1 + length] == ref_allele: - genotype = 'ref' - else: - genotype = 'X' - - row[column_name] = genotype - - writer.writerow(row) - -def main(): - args = parse_args() - type_aas_and_dels(args.in_fasta, args.aas, args.dels, args.reference_fasta, args.in_metadata, args.out_metadata, args.index_column) - -if __name__ == 
'__main__': - main() \ No newline at end of file diff --git a/workflows/bin/uk_label_sourceid_duplicates_to_omit.py b/workflows/bin/uk_label_sourceid_duplicates_to_omit.py deleted file mode 100755 index d282901..0000000 --- a/workflows/bin/uk_label_sourceid_duplicates_to_omit.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from Bio import SeqIO -import csv - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - - args = parser.parse_args() - - return args - -def run(in_metadata, out_metadata): - dup_dict = {} - tokeep = set() - - with open(in_metadata, 'r', newline = '') as csv_in: - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - - for row in reader: - fasta_header = row["sequence_name"] - id = row["source_id"] - epi_day = int(row["edin_epi_day"]) - completeness = float(row["unmapped_genome_completeness"]) - - if id in ["None", "", None]: - tokeep.add(fasta_header) - continue - - if id in dup_dict: - if epi_day < dup_dict[id][0]["epi_day"]: - dup_dict[id].insert(0, {"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}) - else: - dup_dict[id].append({"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}) - else: - dup_dict[id] = [{"fasta_header": fasta_header, "epi_day": epi_day, "completeness":completeness}] - - with open("deduplicated_by_sourceid.log", "w") as log: - for k,v in dup_dict.items(): - tokeep.add(v[0]["fasta_header"]) - if len(v) > 1: - for dup in v[1:]: - log.write("For id %s, %s epi_day:%s completeness:%s kept, %s epi_day:%s completeness:%s removed as duplicate\n" \ - %(k, v[0]["fasta_header"], 
v[0]["epi_day"], v[0]["completeness"], dup["fasta_header"], \ - dup["epi_day"], dup["completeness"])) - - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ["duplicate"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - row["duplicate"] = None - fasta_header = row["sequence_name"] - if fasta_header not in tokeep: - row["duplicate"] = "True" - writer.writerow(row) - -def main(): - args = parse_args() - run(args.in_metadata, args.out_metadata) - -if __name__ == '__main__': - main() diff --git a/workflows/bin/uk_remove_duplicates_COGID_by_proportionN.py b/workflows/bin/uk_remove_duplicates_COGID_by_proportionN.py deleted file mode 100755 index 185a345..0000000 --- a/workflows/bin/uk_remove_duplicates_COGID_by_proportionN.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from Bio import SeqIO -import csv - -def parse_args(): - parser = argparse.ArgumentParser(description="""Add sample_date, is_pillar_2 and sequence_name columns""", - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--in-metadata', dest = 'in_metadata', required=True, help='TSV from MAJORA') - parser.add_argument('--out-metadata', dest = 'out_metadata', required=True, help='CSV to write out') - parser.add_argument('--out-fasta', dest = 'out_fasta', required=True, help='FASTA to write out') - parser.add_argument('--in-fasta', dest = 'in_fasta', required=True, help='FASTA') - - args = parser.parse_args() - - return args - -def run(in_fasta, in_metadata, out_fasta, out_metadata): - alignment = SeqIO.index(in_fasta, "fasta") - - dup_dict = {} - tokeep = set() - - with open(in_metadata, 'r', newline = '') as csv_in: - reader = csv.DictReader(csv_in, delimiter=",", 
quotechar='\"', dialect = "unix") - - for row in reader: - if row["why_excluded"]: - continue - fasta_header = row["fasta_header"] - id = row["central_sample_id"] - completeness = float(row["unmapped_genome_completeness"]) - - if id in dup_dict: - if completeness > dup_dict[id]["completeness"]: - dup_dict[id] = {"fasta_header": fasta_header, "completeness": completeness} - else: - continue - else: - dup_dict[id] = {"fasta_header": fasta_header, "completeness": completeness} - - for k,v in dup_dict.items(): - tokeep.add(v["fasta_header"]) - - with open(in_metadata, 'r', newline = '') as csv_in, \ - open(out_metadata, 'w', newline = '') as csv_out, \ - open(out_fasta, 'w') as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - fasta_header = row["fasta_header"] - - if fasta_header in tokeep: - writer.writerow(row) - seqrec = alignment[fasta_header] - fasta_out.write(">" + seqrec.id + "\n") - fasta_out.write(str(seqrec.seq) + "\n") - else: - if not row["why_excluded"]: - row["why_excluded"] = "duplicate central_sample_id" - writer.writerow(row) - -def main(): - args = parse_args() - run(args.in_fasta, args.in_metadata, args.out_fasta, args.out_metadata) - -if __name__ == '__main__': - main() diff --git a/workflows/config/base.config b/workflows/config/base.config deleted file mode 100644 index df760b1..0000000 --- a/workflows/config/base.config +++ /dev/null @@ -1,64 +0,0 @@ -// Base parameters used throughout -params { - whoami = "Datapipe" - date = false - publish_dir = "publish" - publish_dev = "publish_dev" - category = "cog" - webhook = false - s3 = false - distance_qc = false - geography = false - cache_pangolin = false - - // new cog-uk files each week - uk_fasta = "test/matched.fa" - uk_metadata = "test/matched.tsv" - 
uk_accessions = "test/accessions.tsv" - uk_unaligned_fasta = "test/matched2.fa" // null param so exists - uk_aligned_fasta = "test/matched3.fa" // null param so exists - uk_mutations = "test/matched2.variants" // null param so exists - uk_constellations = "resources/empty_constellations.csv" // null so exists - uk_pag = "test/uk_pag.tsv" //null param - - // if carrying forward from previous - previous_metadata = "" - - // latest gisaid results output by gisaid pipeline - gisaid_json = "test/gisaid.json" - gisaid_fasta = "test/gisaid.matched.fa" // null so exists - gisaid_metadata = "test/gisaid.matched.csv" // null so exists - gisaid_mutations = "resources/empty_mutations.csv" // null so exists - gisaid_constellations = "resources/empty_constellations.csv" // null so exists - gisaid_updown = "resources/empty_updown.csv" // null so exists - - - // resources files - uk_updated_dates = "resources/date_corrections.csv" - uk_omissions = "resources/resequencing_omissions.txt" - gisaid_omissions = "resources/gisaid_omissions.txt" - reference_fasta = "resources/MN908947.fa" - reference_genbank = "resources/MN908947.gb" - WH04_fasta = "resources/WH04.fa" - aas = "resources/AAs.csv" - dels = "resources/dels.csv" - constellations = "resources/constellations" - mask_file = "resources/mask.txt" - uk_geography = "bin/geography_cleaning/geography_utils/" - publish_cog_global_recipes = "resources/publish_cog_global_recipes.json" - publish_gisaid_recipes = "resources/publish_gisaid_recipes.json" - - - // parameter values set - time_window = false - update_all_lineage_assignments = false - auto_update_pangolin = false - skip_designation_hash = false - add_usher_pangolin = false - min_covg = 93 - trim_start = 265 - trim_end = 29674 - chunk_size = 10000 - constellations = '"Delta (B.1.617.2-like)" "Omicron (B.1.1.529-like)" "Omicron (BA.1-like)" "Omicron (BA.2-like)" "Omicron (BA.3-like)" "Omicron (Unassigned)"' - -} diff --git a/workflows/environment.yml b/workflows/environment.yml 
deleted file mode 100644 index 77f736d..0000000 --- a/workflows/environment.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: datapipe -channels: - - bioconda - - conda-forge - - defaults - - cov-ert -dependencies: - - biopython>=1.74 - - minimap2>=2.17 - - pip=19.3.1 - - python>=3.7 - - snakemake-minimal>=6.4.1,<=6.8.0 - - gofasta<=0.0.4 - - pysam==0.16.0.1 - - usher>=0.3.2 - - coreutils>=8.25 - - nextflow - - s3cmd - - smart_open - - datafunk - - fastafunk - - pip: - - ftfy - - geopandas - - git+https://github.com/cov-lineages/pangolin.git - - git+https://github.com/cov-lineages/pangoLEARN.git - - git+https://github.com/cov-lineages/constellations.git - - git+https://github.com/cov-lineages/scorpio.git - - git+https://github.com/cov-lineages/pango-designation.git - - git+https://github.com/cov-lineages/pangolin-assigment.git - - diff --git a/workflows/environment.yml.old b/workflows/environment.yml.old deleted file mode 100644 index 5dab06f..0000000 --- a/workflows/environment.yml.old +++ /dev/null @@ -1,32 +0,0 @@ -name: datapipe -channels: - - bioconda - - conda-forge - - defaults - - cov-ert -dependencies: - - biopython>=1.74 - - minimap2>=2.17 - - pip=19.3.1 - - python>=3.7 - - snakemake-minimal>=6.4.1,<=6.8.0 - - gofasta<=0.0.4 - - pysam==0.16.0.1 - - usher>=0.3.2 - - coreutils>=8.25 - - nextflow - - s3cmd - - smart_open - - pip: - - ftfy - - geopandas - - git+https://github.com/cov-lineages/pangolin.git - - git+https://github.com/cov-lineages/pangoLEARN.git - - git+https://github.com/cov-ert/datafunk.git - - git+https://github.com/cov-ert/fastafunk.git - - git+https://github.com/cov-lineages/constellations.git - - git+https://github.com/cov-lineages/scorpio.git - - git+https://github.com/cov-lineages/pango-designation.git - - git+https://github.com/cov-lineages/pangolin-assigment.git - - diff --git a/workflows/future_improvements b/workflows/future_improvements deleted file mode 100644 index a1aa214..0000000 --- a/workflows/future_improvements +++ /dev/null @@ 
-1,21 +0,0 @@ -- preprocess_cog_uk takes a file of updated dates: could this be fed back into majora so -no longer needed? -- omissions file: is it really necessary still? -- what is best practice: add lots of inputs, or have global params used by processes and minimal inputs? -- in general, fix inputs/params so cast as a file/path at the right point allowing no file in some cases -- what should desired result be if missing input files e.g. list of aas/dels to search for and add to metadata table. -Set up now to skip that step -- Lots of very similar looking python scripts within processes - these were fastafunks, but were replaced -to speed up. Could instead speed up fastafunk in the same way. -- Used to retain info week to week about which samples were eliminated as duplicates - this is now done denovo each -week which is probably desirable behaviour? -- Used to have min length and min covg thresholds, now have just one lower min_covg threshold because if not tree -building don't need higher covg? -- Changes from before: when publishing use recipes - - remove mutations from consortium metadata, add them to variants metadata - - remove phylogenetics columns, make phylogenetics metadata later -- might want to publish developer info to a directory e.g. 
geography outputs inc new dodgy stuff -- command line help and specify required arguments -- containerize and get rid of conda environment - can be parsed down at the same time as includes things from other -steps of the old pipeline -- add back in resource requirements diff --git a/workflows/modules/align_and_variant_call.nf b/workflows/modules/align_and_variant_call.nf deleted file mode 100644 index d8fe713..0000000 --- a/workflows/modules/align_and_variant_call.nf +++ /dev/null @@ -1,560 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dir = file(params.publish_dir) -publish_dev = file(params.publish_dev) - - -process minimap2_to_reference { - /** - * Minimaps samples to reference - * @input fasta - * @output sam - * @params reference_fasta - */ - - cpus 4 - - input: - path fasta - - output: - path "alignment.sam" - - script: - """ - minimap2 -t ${task.cpus} -a --secondary=no --score-N=0 -x asm20 ${reference_fasta} ${fasta} > alignment.sam - """ -} - -process get_mutations { - /** - * Creates CSV of mutations found in each genome - * @input sam - * @output mutations - * @parms reference_fasta, reference_genbank - */ - - cpus 4 - label 'retry_increasing_mem' - - - input: - path sam - val category - - output: - path "${category}.mutations.csv" - - script: - """ - gofasta sam variants -t ${task.cpus} \ - --samfile ${sam} \ - --reference ${reference_fasta} \ - --genbank ${reference_genbank} \ - --outfile ${category}.mutations.csv - """ -} - -process get_indels { - /** - * Creates TSV of indels found in each genome - * @input sam - * @output insertions, deletions - */ - - publishDir "${publish_dev}/", pattern: "*/*.tsv", mode: 'copy' - publishDir "${publish_dir}/", pattern: "*/*.tsv", mode: 'copy', enabled: { ${category} == 'cog'} - - input: - path sam - val category - - output: - path "${category}/${category}.insertions.tsv", emit: insertions - path "${category}/${category}.deletions.tsv", emit: deletions - - script: - 
""" - mkdir -p ${category} - gofasta sam indels \ - -s ${sam} \ - --threshold 2 \ - --insertions-out "${category}/${category}.insertions.tsv" \ - --deletions-out "${category}/${category}.deletions.tsv" - """ -} - -process alignment { - /** - * Get reference-based alignment - * @input sam - * @output alignment - * @params reference_fasta - */ - - cpus 4 - - input: - path sam - - output: - path "alignment.fasta" - - script: - """ - gofasta sam toMultiAlign -t ${task.cpus} \ - --samfile ${sam} \ - --reference ${reference_fasta} \ - --pad \ - -o alignment.fasta - """ -} - - -process get_snps { - /** - * Call SNPs in each genome - * @input alignment - * @output snps - * @params reference_fasta - */ - - publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' - - input: - path alignment - val category - - output: - path "${category}/${category}.snps.csv" - - script: - """ - mkdir -p ${category} - gofasta snps -r ${reference_fasta} -q ${alignment} -o ${category}/${category}.snps.csv - """ -} - -process get_updown { - /** - * Call SNPs in each genome - * @input alignment - * @output updown list - * @params reference_fasta - */ - - publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' - - input: - path alignment - val category - - output: - path "${category}/${category}.updown.csv" - - script: - """ - mkdir -p ${category} - gofasta updown list -r ${WH04_fasta} -q ${alignment} -o ${category}/${category}.updown.csv - """ -} - -process type_AAs_and_dels { - /** - * Adds a column to metadata table for specific dels and aas looked for - * @input alignment, metadata - * @output metadata_updated - * @params reference_fasta, del_file, aa_file - */ - - input: - path alignment - path metadata - - output: - path "${metadata.baseName}.aas_dels.csv" - - script: - """ - $project_dir/../bin/type_aas_and_dels.py \ - --in-fasta ${alignment} \ - --in-metadata ${metadata} \ - --out-metadata "mutations.tmp.csv" \ - --reference-fasta ${reference_fasta} \ - --aas ${aas} \ - --dels 
${dels} \ - --index-column query - sed "s/query/sequence_name/g" "mutations.tmp.csv" > mutations.tmp2.csv - sed "s/variants/mutations/g" "mutations.tmp2.csv" > "${metadata.baseName}.aas_dels.csv" - - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.aas_dels.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${metadata.baseName}.aas_dels.csv" | wc -l) - exit 1 - fi - """ -} - -process get_nuc_mutations { - /** - * Combines nucleotide mutations into a metadata file which can be merged into the master - * @input snps, dels, ins - * @output metadata - */ - - input: - path snps - path dels - path ins - - output: - path "nuc_mutations.csv" - - script: - """ - #!/usr/bin/env python3 - import csv - - sample_dict = {} - with open("${dels}", 'r', newline = '') as csv_in: - for line in csv_in: - ref_start, length, samples = line.strip().split() - samples = samples.split('|') - var = "del_%s_%s" %(ref_start, length) - for sample in samples: - if sample in sample_dict: - sample_dict[sample].append(var) - else: - sample_dict[sample] = [var] - - with open("${ins}", 'r', newline = '') as csv_in: - for line in csv_in: - ref_start, insertion, samples= line.strip().split() - samples = samples.split('|') - var = "ins_%s_%s" %(ref_start, insertion) - for sample in samples: - if sample in sample_dict: - sample_dict[sample].append(var) - else: - sample_dict[sample] = [var] - - with open("${snps}", 'r', newline = '') as csv_in, \ - open("nuc_mutations.csv", 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = ["sequence_name", "nucleotide_mutations"], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - row["sequence_name"] = row["query"] - row["nucleotide_mutations"] = row["SNPs"] - if row["sequence_name"] in sample_dict: - all_vars = [row["nucleotide_mutations"]] - 
all_vars.extend(sample_dict[row["sequence_name"]]) - row["nucleotide_mutations"] = '|'.join(all_vars) - for key in [k for k in row if k not in ["sequence_name", "nucleotide_mutations"]]: - del row[key] - writer.writerow(row) - """ -} - - -process restrict_metadata { - /** - * restricts only to sequences not excluded - * @input metadata - * @output metadata - */ - - input: - path metadata - - output: - path "${metadata.baseName}.restricts.csv" - - script: - """ - #!/usr/bin/env python3 - import csv - - with open("${metadata}", 'r', newline = '') as csv_in, \ - open("${metadata.baseName}.restricts.csv", 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - if row["why_excluded"] not in [None, "", "None"]: - writer.writerow(row) - """ -} - - -process add_nucleotide_mutations_to_metadata { - /** - * Adds nucleotide mutations to metadata - * @input metadata, nucleotide_mutations - * @output metadata - */ - - memory { 1.GB * task.attempt + metadata.size() * 2.B } - - input: - path metadata - path nucleotide_mutations - - output: - path "${metadata.baseName}.with_nuc_mutations.csv" - - script: - """ - fastafunk add_columns \ - --in-metadata ${metadata} \ - --in-data ${nucleotide_mutations} \ - --index-column sequence_name \ - --join-on sequence_name \ - --new-columns nucleotide_mutations \ - --out-metadata "${metadata.baseName}.with_nuc_mutations.csv" - - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.with_nuc_mutations.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${metadata.baseName}.with_nuc_mutations.csv" | wc -l) - exit 1 - fi - """ -} - -process add_ambiguities_to_metadata { - /** - * Adds nucleotide mutations to metadata - * @input metadata, nucleotide_mutations - * 
@output metadata - */ - - memory { 1.GB * task.attempt + metadata.size() * 2.B } - publishDir "${publish_dev}/", pattern: "*/*.csv", mode: 'copy' - - input: - path metadata - path updown - val category - - output: - path "${category}/${category}_mutations.csv" - - script: - """ - mkdir -p ${category} - fastafunk add_columns \ - --in-metadata ${metadata} \ - --in-data ${updown} \ - --index-column sequence_name \ - --join-on query \ - --new-columns ambiguities \ - --out-metadata "${category}/${category}_mutations.csv" - - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${category}/${category}_mutations.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${category}/${category}_mutations.csv" | wc -l) - exit 1 - fi - """ -} - - -process haplotype_constellations { - /** - * Adds a column to metadata table for each constellation, and a summary column for all found - * @input alignment - * @output haplotype_csv - * @params constellations - */ - - input: - path alignment - - output: - path "${alignment.baseName}.haplotyped.csv" - - script: - """ - scorpio haplotype \ - --input ${alignment} \ - --output "${alignment.baseName}.haplotyped.csv" \ - --output-counts \ - -n ${params.constellations} - - if [[ \$(grep ">" "${alignment}" | wc -l) != \$(tail -n+2 "${alignment.baseName}.haplotyped.csv" | wc -l) ]] - then - echo \$(grep ">" "${alignment}" | wc -l) - echo \$(tail -n+2 "${alignment.baseName}.haplotyped.csv" | wc -l) - exit 1 - fi - """ -} - -process classify_constellations { - /** - * Adds a column to metadata table for each constellation, and a summary column for all found - * @input alignment - * @output classify_csv - * @params constellations - */ - - input: - path alignment - - output: - path "${alignment.baseName}.classified.csv" - - script: - """ - scorpio classify \ - --input ${alignment} \ - --output "${alignment.baseName}.classified.csv" \ - -n ${params.constellations} - - if [[ \$(grep ">" "${alignment}" | wc -l) != \$(tail -n+2 
"${alignment.baseName}.classified.csv" | wc -l) ]] - then - echo \$(grep ">" "${alignment}" | wc -l) - echo \$(tail -n+2 "${alignment.baseName}.classified.csv" | wc -l) - exit 1 - fi - """ -} - -process add_constellations_to_metadata { - /** - * Adds constellations to metadata - * @input metadata, haplotyped, classified - * @output metadata - */ - - publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' - - memory { task.attempt * (classified.size() + haplotyped.size()) * 9.B } - - input: - path haplotyped - path classified - val category - - output: - path "${category}/${category}_constellations.csv" - - script: - """ - mkdir -p ${category} - fastafunk add_columns \ - --in-metadata ${classified} \ - --in-data ${haplotyped} \ - --index-column query \ - --join-on query \ - --out-metadata "constellations.tmp.csv" - sed "s/query/sequence_name/g" "constellations.tmp.csv" > "${category}/${category}_constellations.csv" - - if [[ \$(cat "${haplotyped}" | wc -l) != \$(cat "${category}/${category}_constellations.csv" | wc -l) ]] - then - echo \$(cat "${haplotyped}" | wc -l) - echo \$(cat "${category}/${category}_constellations.csv" | wc -l) - exit 1 - fi - """ -} - - -process announce_summary { - /** - * Summarizes alignment into JSON - * @input fastas - */ - - input: - path fasta - path alignment - - output: - path "announce.json" - - script: - if (params.webhook) - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Finished alignment and variant calling ${params.date}*\\n" >> announce.json - echo "> Number of sequences in FASTA : \$(cat ${fasta} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences in ALIGNMENT : \$(cat ${alignment} | grep '>' | wc -l)\\n" >> announce.json - echo '"}' >> announce.json - - echo 'webhook ${params.webhook}' - - curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} - """ - else - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Finished alignment and 
variant calling ${params.date}*\\n" >> announce.json - echo "> Number of sequences in FASTA : \$(cat ${fasta} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences in ALIGNMENT : \$(cat ${alignment} | grep '>' | wc -l)\\n" >> announce.json - echo '"}' >> announce.json - """ -} - -workflow align_and_variant_call { - take: - in_fasta - in_metadata - category - main: - in_fasta.splitFasta( by: params.chunk_size, file: true ).set{ fasta_chunks } - minimap2_to_reference(fasta_chunks) - alignment(minimap2_to_reference.out) - alignment.out.collectFile(newLine: false).set{ alignment_result } - minimap2_to_reference.out.collectFile(newLine: false, keepHeader: true, skip: 2).set{ mapped_result } - - - get_mutations(mapped_result, category) - get_indels(mapped_result, category) - - get_snps(alignment_result, category) - get_updown(alignment_result, category) - type_AAs_and_dels(alignment_result, get_mutations.out) - get_nuc_mutations(get_snps.out, get_indels.out.deletions, get_indels.out.insertions) - add_nucleotide_mutations_to_metadata(in_metadata, get_nuc_mutations.out) - add_ambiguities_to_metadata(type_AAs_and_dels.out, get_updown.out, category) - - haplotype_constellations(alignment.out) - haplotype_constellations.out.collectFile(newLine: false, keepHeader: true, skip: 1).set{ haplotype_result } - classify_constellations(alignment.out) - classify_constellations.out.collectFile(newLine: false, keepHeader: true, skip: 1).set{ classify_result } - - add_constellations_to_metadata(haplotype_result, classify_result, category) - announce_summary(in_fasta, alignment_result) - emit: - mutations = add_ambiguities_to_metadata.out - constellations = add_constellations_to_metadata.out - fasta = alignment_result - metadata = add_nucleotide_mutations_to_metadata.out - updown = get_updown.out -} - - -aas = file(params.aas) -dels = file(params.dels) -reference_fasta = file(params.reference_fasta) -reference_genbank = file(params.reference_genbank) -WH04_fasta = 
file(params.WH04_fasta) - -workflow { - uk_fasta = Channel.fromPath(params.uk_fasta) - uk_metadata = Channel.fromPath(params.uk_metadata) - category = params.category - - align_and_variant_call(uk_fasta, uk_metadata, category) -} diff --git a/workflows/modules/clean_geography.nf b/workflows/modules/clean_geography.nf deleted file mode 100644 index 41d7939..0000000 --- a/workflows/modules/clean_geography.nf +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dir = file(params.publish_dir) -publish_dev = file(params.publish_dev) - - -process uk_geography { - /** - * Cleans up geography - * @input uk_fasta, uk_metadata - * @output geography_metadata - * @params geography_utils - */ - - memory { 1.GB * task.attempt + uk_fasta.size() * 1.B } - errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - maxRetries = 1 - - publishDir "${publish_dev}/", pattern: "geography/*.csv", mode: 'copy' - publishDir "${publish_dev}/", pattern: "geography/*.txt", mode: 'copy' - - input: - path uk_fasta - path uk_metadata - - output: - path "geography/geography.csv", emit: geography - path "geography/*.csv" - path "geography/*.txt" - - script: - """ - mkdir geography - mkdir geography_tmp - - fastafunk fetch \ - --in-fasta ${uk_fasta} \ - --in-metadata ${uk_metadata} \ - --index-column sequence_name \ - --filter-column central_sample_id sequence_name sample_date edin_epi_week \ - adm0 adm1 adm2 adm2_private \ - --out-fasta geography_tmp/fetch.fa \ - --out-metadata geography_tmp/fetch.csv \ - --restrict - - $project_dir/../bin/geography_cleaning/geography_cleaning.py \ - --metadata geography_tmp/fetch.csv \ - --country-col adm0 \ - --adm1-col adm1 \ - --adm2-col adm2 \ - --outer-postcode-col adm2_private \ - --mapping-utils-dir ${geography_utils} \ - --epiweek-col edin_epi_week \ - --outdir geography - - #rm -rf geography_tmp - """ -} - - -process add_uk_geography_to_metadata { - /** - * Adds UK geography 
to uk metadata - * @input combined_metadata, geography_metadata - * @output metadata - */ - - publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_master.csv"} - memory { 1.GB * task.attempt + uk_metadata.size() * 2.B } - - input: - path uk_metadata - path geography_metadata - - output: - path "cog_geography.csv", emit: metadata - - script: - """ - fastafunk add_columns \ - --in-metadata ${uk_metadata} \ - --in-data ${geography_metadata} \ - --index-column sequence_name \ - --join-on sequence_name \ - --force-overwrite \ - --new-columns adm1 adm1_raw adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ - --out-metadata "cog_geography.csv" - """ -} - - -process gisaid_geography { - /** - * Cleans up geography - * @input gisaid_fasta, gisaid_metadata - * @output geography_metadata - * @params geography_utils - */ - - memory { 1.GB * task.attempt + fasta.size() * 1.B } - errorStrategy { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } - maxRetries = 1 - - publishDir "${publish_dev}/", pattern: "geography/*.csv", mode: 'copy' - publishDir "${publish_dev}/", pattern: "geography/*.txt", mode: 'copy' - - input: - path gisaid_fasta - path gisaid_metadata - - output: - path "geography/geography.csv", emit: geography - path "geography/*.csv" - path "geography/*.txt" - - script: - """ - mkdir geography - mkdir geography_tmp - - fastafunk fetch \ - --in-fasta ${fasta} \ - --in-metadata ${metadata} \ - --index-column sequence_name \ - --filter-column gisaid_accession sequence_name sample_date epi_week \ - adm0 adm1 adm2 adm2_private \ - --where-column gisaid_accession=covv_accession_id epi_week=edin_epi_week adm0=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2\ - --out-fasta geography_tmp/fetch.fa \ - --out-metadata geography_tmp/fetch.csv \ - --restrict - - $project_dir/../bin/geography_cleaning/geography_cleaning.py \ - --metadata geography_tmp/fetch.csv \ - --country-col adm0 \ - --adm1-col adm1 \ - --adm2-col adm2 \ - --outer-postcode-col adm2_private \ - --mapping-utils-dir ${geography_utils} \ - --epiweek-col epi_week \ - --sample-id-col gisaid_accession \ - --outdir geography - - rm -rf geography_tmp - """ -} - - -process add_gisaid_geography_to_metadata { - /** - * Adds GISAID geography to combined metadata - * @input gisaid_metadata, geography_metadata - * @output metadata - */ - - publishDir "${publish_dev}/gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"gisaid_master.csv"}, overwrite: true - memory { 1.GB * task.attempt + combined_metadata.size() * 2.B } - - input: - path gisaid_metadata - path geography_metadata - - output: - path "gisaid_geography.csv", emit: metadata - - script: - """ - fastafunk add_columns \ - --in-metadata ${gisaid_metadata} \ - --in-data ${geography_metadata} \ - --index-column sequence_name \ - --join-on sequence_name \ - --force-overwrite \ - --new-columns edin_admin_0 edin_admin_1 edin_admin_2 adm1 adm1_raw adm2 outer_postcode adm2_raw 
adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ - --where-column edin_admin_0=adm0 edin_admin_1=adm1 edin_admin_2=adm2 \ - --out-metadata "gisaid_geography.csv" - """ -} - - -process make_delta_by_utla_summary { - /** - * Summarizes delta counts by utla - * @input metadata - * @output csv - */ - - publishDir "${publish_dir}/cog", pattern: "*.csv", mode: 'copy', overwrite: false - - input: - path metadata - - output: - path "UTLA_genome_counts_${params.date}.csv" - - script: - """ - $project_dir/../bin/summarise_genomes_by_utla.py \ - --metadata ${metadata} \ - --date ${params.date} - """ -} - - -process drop_anon_id { - /** - * Drops anonymous ID from master metadata csv - * @input metadta - * @output metadata - */ - - input: - path metadata - - output: - path "${metadata.baseName}_anon.csv" - - script: - """ - fastafunk drop_columns --in-metadata ${metadata} --columns anonymous_sample_id --out-metadata ${metadata.baseName}_anon.csv - """ -} - -process publish_master_metadata { - /** - * Publishes master metadata csv for this category - * @input metadata - * @output metadata - */ - - publishDir "${publish_dev}", pattern: "*/*.csv", mode: 'copy' - - input: - path metadata - val category - - output: - path "${category}/${category}_master.csv" - - script: - """ - mkdir -p ${category} - cp ${metadata} ${category}/${category}_master.csv - """ -} - - -geography_utils = file(params.uk_geography) - - -workflow clean_geography_cog_uk { - take: - uk_fasta - uk_metadata - main: - uk_geography(uk_fasta, uk_metadata) - add_uk_geography_to_metadata(uk_metadata,uk_geography.out.geography) - make_delta_by_utla_summary(add_uk_geography_to_metadata.out.metadata) - drop_anon_id(add_uk_geography_to_metadata.out.metadata) - publish_master_metadata(drop_anon_id.out, "cog") - emit: - metadata = add_uk_geography_to_metadata.out.metadata -} - -workflow clean_geography_gisaid { - take: - gisaid_fasta - gisaid_metadata - main: - if 
( params.geography ){ - gisaid_geography(gisaid_fasta, gisaid_metadata) - add_gisaid_geography_to_metadata(gisaid_metadata,gisaid_geography.out.geography) - add_gisaid_geography_to_metadata.out.metadata.set{ new_gisaid_metadata } - } else { - new_gisaid_metadata = gisaid_metadata - } - publish_master_metadata(new_gisaid_metadata, "gisaid") - emit: - metadata = new_gisaid_metadata -} - -workflow { - uk_fasta = Channel.fromPath(params.uk_fasta) - uk_metadata = Channel.fromPath(params.uk_metadata) - clean_geography_cog_uk(uk_fasta, uk_metadata) -} diff --git a/workflows/modules/deduplicate.nf b/workflows/modules/deduplicate.nf deleted file mode 100644 index 7cc7397..0000000 --- a/workflows/modules/deduplicate.nf +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dev = file(params.publish_dev) - - -process annotate_with_unmapped_genome_completeness { - /** - * Adds a column to metadata with proportion of genome which is complete - * @input fasta, metadata - * @output metadata - */ - - input: - path fasta - path metadata - - output: - path "${metadata.baseName}.annotated.csv" - - script: - """ - $project_dir/../bin/annotate_with_unmapped_genome_completeness.py \ - --in-fasta ${fasta} \ - --in-metadata ${metadata} \ - --out-metadata "${metadata.baseName}.annotated.csv" - - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.annotated.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${metadata.baseName}.annotated.csv" | wc -l) - exit 1 - fi - """ -} - -process uk_remove_duplicates_COGID_by_proportionN { - /** - * Where duplicate COGID, keeps the most complete - * @input uk_fasta, uk_metadata - * @output uk_fasta_updated, uk_metadata_updated - */ - - input: - path uk_fasta - path uk_metadata - - output: - path "${uk_fasta.baseName}.deduplicated_by_cogid.fa", emit: uk_fasta_updated - path "${uk_metadata.baseName}.deduplicated_by_cogid.csv", emit: 
uk_metadata_updated - - script: - """ - $project_dir/../bin/uk_remove_duplicates_COGID_by_proportionN.py \ - --in-fasta ${uk_fasta} \ - --in-metadata ${uk_metadata} \ - --out-fasta "${uk_fasta.baseName}.deduplicated_by_cogid.fa" \ - --out-metadata "${uk_metadata.baseName}.deduplicated_by_cogid.csv" - - if [[ \$(cat "${uk_metadata}" | wc -l) != \$(cat "${uk_metadata.baseName}.deduplicated_by_cogid.csv" | wc -l) ]] - then - echo \$(cat "${uk_metadata}" | wc -l) - echo \$(cat "${uk_metadata.baseName}.deduplicated_by_cogid.csv" | wc -l) - exit 1 - fi - """ -} - - -process remove_duplicates_by_date { - /** - * Where duplicate sequence_name, keeps the earliest - * @input fasta, metadata - * @output fasta_updated, metadata_updated - */ - - memory { 1.GB * task.attempt + metadata.size() * 2.B } - - input: - path fasta - path metadata - - output: - path "${fasta.baseName}.deduplicated.fa", emit: fasta_updated - path "${metadata.baseName}.deduplicated.csv", emit: metadata_updated - - script: - """ - $project_dir/../bin/remove_duplicates_by_date.py \ - --in-fasta ${fasta} \ - --in-metadata ${metadata} \ - --out-fasta "${fasta.baseName}.deduplicated.fa" \ - --out-metadata "${metadata.baseName}.deduplicated.csv" - - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.deduplicated.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${metadata.baseName}.deduplicated.csv" | wc -l) - exit 1 - fi - """ -} - - -process unify_headers { - input: - path fasta - path metadata - - output: - path "${fasta.baseName}.UH.fa" - - script: - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import csv - - alignment = SeqIO.index("${fasta}", "fasta") - - with open("${metadata}", 'r', newline = '') as csv_in, \ - open("${fasta.baseName}.UH.fa", "w") as fasta_out: - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - id_key = "fasta_header" - if "edin_header" in reader.fieldnames: - id_key = "edin_header" - for row in 
reader: - if row["why_excluded"]: - print("excluded") - continue - if row[id_key] in alignment: - record = alignment[row[id_key]] - fasta_out.write(">" + row["sequence_name"] + "\\n") - fasta_out.write(str(record.seq) + "\\n") - else: - print(id_key, row[id_key]) - """ -} - - -process uk_label_sourceid_duplicates_to_omit { - /** - * Where duplicate source_id, labels all but the earliest as duplicates - * @input uk_fasta, uk_metadata - * @output uk_fasta_updated, uk_metadata_updated - */ - - publishDir "${publish_dev}/cog_gisaid/", pattern: "*.log", mode: 'copy' - - input: - path uk_metadata - - output: - path "${uk_metadata.baseName}.deduplicated_by_sourceid.csv", emit: uk_metadata_updated - path "deduplicated_by_sourceid.log", emit: deduplicate_log - - script: - """ - $project_dir/../bin/uk_label_sourceid_duplicates_to_omit.py \ - --in-metadata ${uk_metadata} \ - --out-metadata "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" - - if [[ \$(cat "${uk_metadata}" | wc -l) != \$(cat "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" | wc -l) ]] - then - echo \$(cat "${uk_metadata}" | wc -l) - echo \$(cat "${uk_metadata.baseName}.deduplicated_by_sourceid.csv" | wc -l) - exit 1 - fi - """ -} - - -workflow deduplicate_cog_uk { - take: - uk_fasta - uk_metadata - main: - annotate_with_unmapped_genome_completeness(uk_fasta, uk_metadata) - uk_remove_duplicates_COGID_by_proportionN(uk_fasta, annotate_with_unmapped_genome_completeness.out) - unify_headers(uk_remove_duplicates_COGID_by_proportionN.out.uk_fasta_updated, uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated) - uk_label_sourceid_duplicates_to_omit(uk_remove_duplicates_COGID_by_proportionN.out.uk_metadata_updated) - emit: - fasta = unify_headers.out - metadata = uk_label_sourceid_duplicates_to_omit.out.uk_metadata_updated -} - - -workflow deduplicate_gisaid { - take: - gisaid_fasta - gisaid_metadata - main: - annotate_with_unmapped_genome_completeness(gisaid_fasta, gisaid_metadata) - 
remove_duplicates_by_date(gisaid_fasta, annotate_with_unmapped_genome_completeness.out) - unify_headers(remove_duplicates_by_date.out.fasta_updated, remove_duplicates_by_date.out.metadata_updated) - emit: - fasta = unify_headers.out - metadata = remove_duplicates_by_date.out.metadata_updated -} - - -workflow { - uk_fasta = file(params.uk_fasta) - uk_metadata = file(params.uk_metadata) - deduplicate_cog_uk(uk_fasta, uk_metadata) -} diff --git a/workflows/modules/filter_and_trim.nf b/workflows/modules/filter_and_trim.nf deleted file mode 100644 index e454587..0000000 --- a/workflows/modules/filter_and_trim.nf +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dev = file(params.publish_dev) - - -process filter_low_coverage_sequences { - /** - * Keeps only sequences with completeness greater than min_covg threshold - * @input alignment, metadata - * @output alignment_updated, metadata_updated - * @params min_covg - */ - - input: - path alignment - path metadata - - output: - path "${alignment.baseName}.low_covg_filtered.fasta", emit: fasta_updated - path "${metadata.baseName}.low_covg_filtered.csv", emit: metadata_updated - - script: - if (!params.min_covg) - """ - mv "${alignment}" "${alignment.baseName}.low_covg_filtered.fasta" - mv "${metadata}" "${metadata.baseName}.low_covg_filtered.csv" - """ - else - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import csv - - alignment = SeqIO.index("${alignment}", "fasta") - - with open("${metadata}", 'r', newline = '') as csv_in, \ - open("${metadata.baseName}.low_covg_filtered.csv", 'w', newline = '') as csv_out, \ - open("${alignment.baseName}.low_covg_filtered.fasta", 'w') as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row 
in reader: - if row["why_excluded"]: - writer.writerow(row) - continue - id = row["sequence_name"] - if id in alignment: - seq = str(alignment[id].seq) - mapped_completeness = float(len(seq.replace("N", "")) / len(seq)) - if mapped_completeness >= float(${params.min_covg} / 100): - writer.writerow(row) - fasta_out.write(">" + id + "\\n") - fasta_out.write(seq + "\\n") - else: - row["why_excluded"] = "low mapped_completeness" - writer.writerow(row) - """ -} - - -process trim_alignment { - /** - * Trims start and end of alignment - * @input alignment - * @output alignment_updated - * @params trim_start, trim_end - */ - - input: - path alignment - - output: - path "${alignment.baseName}.trimmed.fa" - - script: - if (params.trim_start && params.trim_end) - """ - #!/usr/bin/env python3 - from Bio import SeqIO - - strt = int(${params.trim_start}) - stp = int(${params.trim_end}) - - with open("${alignment}", "r") as fasta_in, \ - open("${alignment.baseName}.trimmed.fa", "w") as fasta_out: - - for record in SeqIO.parse(fasta_in, "fasta"): - seq = str(record.seq).upper() - new_seq = ("N" * strt) + seq[strt:stp] + ("N" * (len(seq) - stp)) - fasta_out.write(">" + record.id + "\\n") - fasta_out.write(new_seq + "\\n") - """ - else - """ - mv "${alignment.baseName}" "${alignment.baseName}.trimmed.fa" - """ -} - - -process distance_QC { - /** - * Outputs number of sequences per country - * @input fasta, metadata - * @output "QC_distances.tsv" - */ - publishDir "${publish_dev}", pattern: "*/*.tsv", mode: 'copy' - - - input: - path fasta - path metadata - val category - - output: - path "${category}/${category}_QC_distances.tsv" - - script: - """ - datafunk distance_to_root \ - --input-fasta ${fasta} \ - --input-metadata ${metadata} - - mkdir -p ${category} - mv distances.tsv "${category}/${category}_QC_distances.tsv" - """ -} - - -process filter_on_distance_to_WH04 { - /** - * Restricts to samples within distance x of WH04 - * @input fasta, metadata, distances - * @output - */ - - 
input: - path fasta - path metadata - path distances - - output: - path "${fasta.baseName}.distance_filtered.fa", emit: fasta - path "${metadata.baseName}.distance_filtered.csv", emit: metadata - - script: - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import csv - - reject = set() - with open("${distances}", 'r', newline = '') as distances_in: - reader = csv.DictReader(distances_in, delimiter="\t", quotechar='\"', dialect = "unix") - for row in reader: - sequence_name = row['sequence_name'] - distance = float(row['distance_stdevs']) - if distance >= 4.0: - reject.add(sequence_name) - - alignment = SeqIO.index("${fasta}", "fasta") - - with open("${metadata}", 'r', newline = '') as csv_in, \ - open("${metadata.baseName}.distance_filtered.csv", 'w', newline = '') as csv_out, \ - open("${fasta.baseName}.distance_filtered.fa", 'w') as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - if row["why_excluded"]: - writer.writerow(row) - continue - id = row["sequence_name"] - if id in reject: - row["why_excluded"] = "distance to WH04 more than 4.0 epi-week std devs" - writer.writerow(row) - continue - if id in alignment: - writer.writerow(row) - seq = str(alignment[id].seq) - fasta_out.write(">" + id + "\\n") - fasta_out.write(seq + "\\n") - """ -} - - -workflow filter_and_trim_gisaid { - take: - gisaid_fasta - gisaid_metadata - main: - filter_low_coverage_sequences(gisaid_fasta, gisaid_metadata) - trim_alignment(filter_low_coverage_sequences.out.fasta_updated) - if ( params.distance_qc ){ - distance_QC(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, "gisaid") - filter_on_distance_to_WH04(gisaid_fasta, gisaid_metadata, distance_QC.out) - ch_fasta = filter_on_distance_to_WH04.out.fasta - ch_metadata = 
filter_on_distance_to_WH04.out.metadata - } else { - ch_fasta = trim_alignment.out - ch_metadata = filter_low_coverage_sequences.out.metadata_updated - } - emit: - fasta = ch_fasta - metadata = ch_metadata -} - - -workflow filter_and_trim_cog_uk { - take: - uk_fasta - uk_metadata - main: - filter_low_coverage_sequences(uk_fasta, uk_metadata) - trim_alignment(filter_low_coverage_sequences.out.fasta_updated) - if ( params.distance_qc ){ - distance_QC(trim_alignment.out, filter_low_coverage_sequences.out.metadata_updated, "cog") - filter_on_distance_to_WH04(uk_fasta, uk_metadata, distance_QC.out) - ch_fasta = filter_on_distance_to_WH04.out.fasta - ch_metadata = filter_on_distance_to_WH04.out.metadata - } else { - ch_fasta = trim_alignment.out - ch_metadata = filter_low_coverage_sequences.out.metadata_updated - } - emit: - fasta = ch_fasta - metadata = ch_metadata -} - -workflow { - uk_fasta = file(params.uk_fasta) - uk_metadata = file(params.uk_metadata) - - filter_and_trim_cog_uk(uk_fasta, - uk_metadata) -} diff --git a/workflows/modules/pangolin.nf b/workflows/modules/pangolin.nf deleted file mode 100644 index a1a814c..0000000 --- a/workflows/modules/pangolin.nf +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dir = file(params.publish_dir) - - -process check_for_pangolin_update { - /** - * Checks if there is a new version of pangolin and sets param flag if there is - */ - output: - env PANGOLIN_UPDATED - - script: - if ( params.auto_update_pangolin ) - """ - PANGO_VERSION=\$(pangolin --all-versions) - echo \$PANGO_VERSION - pangolin --update - sleep 5s - NEW_PANGO_VERSION=\$(pangolin --all-versions) - echo \$NEW_PANGO_VERSION - if [[ "\$PANGO_VERSION" == "\$NEW_PANGO_VERSION" ]]; then - PANGOLIN_UPDATED=false - else - PANGOLIN_UPDATED=true - fi - """ - else - """ - PANGOLIN_UPDATED=false - """ - -} - - -process extract_sequences_for_pangolin { - /** - * If update_all_lineage_assignments flag 
set, or no previous provided, outputs the input files. - * Otherwise, extracts lineageless sequences from FASTA to run pangolin on, and updates - * metadata with previous lineages - * @input fasta, metadata - * @output pangolin_fasta, metadata_with_previous - * @params previous_metadata, update_all_lineage_assignments - */ - memory {task.attempt * 6.GB} - - input: - path fasta - path metadata - env PANGOLIN_UPDATED - - output: - path "${fasta.baseName}.for_pangolin.fa", emit: pangolin_fasta - path "${metadata.baseName}.with_previous.csv", emit: metadata_with_previous - - script: - if (params.update_all_lineage_assignments || !params.previous_metadata ) - """ - mv "${fasta}" "${fasta.baseName}.for_pangolin.fa" - mv "${metadata}" "${metadata.baseName}.with_previous.csv" - """ - else - """ - echo "Pangolin updated: \$PANGOLIN_UPDATED" - if [ \$PANGOLIN_UPDATED == "true" ] - then - mv "${fasta}" "${fasta.baseName}.for_pangolin.fa" - mv "${metadata}" "${metadata.baseName}.with_previous.csv" - else - $project_dir/../bin/prepare_for_pangolin.py \ - --in-fasta ${fasta} \ - --in-metadata ${metadata} \ - --previous-metadata ${params.previous_metadata} \ - --out-fasta "${fasta.baseName}.for_pangolin.fa" \ - --out-metadata "${metadata.baseName}.with_previous.csv" - if [[ \$(cat "${metadata}" | wc -l) != \$(cat "${metadata.baseName}.with_previous.csv" | wc -l) ]] - then - echo \$(cat "${metadata}" | wc -l) - echo \$(cat "${metadata.baseName}.with_previous.csv" | wc -l) - exit 1 - fi - fi - """ -} - -process run_pangolin { - /** - * Runs PANGOLIN on input fasta - * @input fasta - * @output pangolin_fasta - */ - cpus 4 - memory { task.attempt * 8.GB } - - input: - path fasta - - output: - path "pangolin/lineage_report.csv", emit: report - //path "pangolin/sequences.aln.fasta", emit: alignment - - script: - if (params.skip_designation_hash) - """ - pangolin "${fasta}" \ - --outdir pangolin \ - --tempdir pangolin_tmp \ - --alignment \ - --analysis-mode fast \ - 
--skip-designation-hash \ - -t ${task.cpus} - """ - else - """ - pangolin "${fasta}" \ - --outdir pangolin \ - --tempdir pangolin_tmp \ - --alignment \ - --analysis-mode fast \ - -t ${task.cpus} - """ -} - -process run_pangolin_usher { - /** - * Runs PANGOLIN on input fasta - * @input fasta - * @output pangolin_fasta - */ - - cpus 16 - - input: - path fasta - - output: - path "pangolin/usher_lineage_report.csv" - - script: - if (params.skip_designation_hash) - """ - pangolin "${fasta}" \ - --outdir pangolin \ - --tempdir pangolin_tmp \ - --outfile usher_lineage_report.csv \ - --usher \ - -t ${task.cpus} \ - --skip-designation-hash - """ - else - """ - pangolin "${fasta}" \ - --outdir pangolin \ - --tempdir pangolin_tmp \ - --outfile usher_lineage_report.csv \ - --usher -t ${task.cpus} - """ -} - -process add_new_pangolin_lineages_to_metadata { - /** - * Updates metadata with new PANGOLIN lineage assignments - * @input metadata, pangolin_csv - * @output metadata_updated - */ - - memory { task.attempt * metadata.size() * 3.B } - - input: - path metadata - path pangolin_csv - - output: - path "${metadata.baseName}.with_pangolin.csv", emit: metadata - path "pango.log", emit: log - - script: - """ - $project_dir/../bin/prepare_for_pangolin.py \ - --in-metadata ${metadata} \ - --previous-metadata ${pangolin_csv} \ - --out-metadata "${metadata.baseName}.with_pangolin.csv" - """ -} - -process add_pangolin_usher_to_metadata { - /** - * Adds usher pangolin calls to metadata - * @input metadata, usher report - * @output metadata - */ - - input: - path metadata - path usher_report - - output: - path "${metadata.baseName}.with_usher.csv" - - script: - """ - fastafunk add_columns \ - --in-metadata ${metadata} \ - --in-data ${usher_report} \ - --index-column taxon \ - --join-on taxon \ - --new-columns usher_lineage usher_lineages_version \ - --where-column usher_lineage=lineage usher_lineages_version=version \ - --out-metadata "${metadata.baseName}.with_usher.csv" - """ -} - 
-process cache_lineages_report { - /** - * Creates a map from sequence hash to pangolin report calls - * @input metadata - * @output metadata - */ - publishDir "${publish_dir}/pangolin", pattern: "*.cache.csv", mode: 'copy' - - input: - path fasta - path metadata - - output: - path "${metadata.baseName}.cache.csv", emit: metadata - - script: - """ - $project_dir/../bin/cache_pangolin_report.py \ - --in-fasta ${fasta} \ - --in-metadata ${metadata} \ - --out-metadata "${metadata.baseName}.cache.csv" - """ -} - - -process announce_summary { - /** - * Summarizes pangolin into JSON - * @input fastas - */ - - input: - path pango_input - path pango_log - - output: - path "announce.json" - - script: - if (params.webhook) - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Finished running pangolin ${params.date}*\\n" >> announce.json - echo "> Number of sequences input to pangolin for new lineage assignments : \$(cat ${pango_input} | grep '>' | wc -l)\\n" >> announce.json - echo "> \$(cat ${pango_log})\\n" >> announce.json - echo '"}' >> announce.json - - echo 'webhook ${params.webhook}' - - curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} - """ - else - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Finished running pangolin ${params.date}*\\n" >> announce.json - echo "> Number of sequences input to pangolin for new lineage assignments : \$(cat ${pango_input} | grep '>' | wc -l)\\n" >> announce.json - echo "> \$(cat ${pango_log})\\n" >> announce.json - echo '"}' >> announce.json - """ -} - - -process publish_metadata { - /** - * Publishes metadata csv for this category - * @input metadata - * @output metadata - */ - - publishDir "${publish_dir}", pattern: "*/*.csv", mode: 'copy' - - input: - path metadata - val category - - output: - path "${category}/pangolin_master.csv" - - script: - """ - mkdir -p ${category} - cp ${metadata} ${category}/pangolin_master.csv - """ -} - -workflow pangolin { - take: - 
in_fasta - in_metadata - pangolin_updated - main: - extract_sequences_for_pangolin(in_fasta, in_metadata, pangolin_updated) - extract_sequences_for_pangolin.out.pangolin_fasta.splitFasta( by: params.chunk_size, file: true ) - .set{ pangolin_chunks } - run_pangolin(pangolin_chunks) - run_pangolin.out.report.collectFile(newLine: true, keepHeader: true, skip: 1) - .set{ pangolin_result } - if (params.add_usher_pangolin) { - run_pangolin_usher(pangolin_chunks) - run_pangolin_usher.out.collectFile(newLine: true, keepHeader: true, skip: 1) - .set{ pangolin_usher_result } - add_pangolin_usher_to_metadata(pangolin_result, pangolin_usher_result) - post_pangolin_metadata = add_pangolin_usher_to_metadata.out - } else { - post_pangolin_metadata = pangolin_result - } - add_new_pangolin_lineages_to_metadata(extract_sequences_for_pangolin.out.metadata_with_previous, post_pangolin_metadata) - - if (params.cache_pangolin){ - cache_lineages_report(in_fasta, post_pangolin_metadata) - } - - announce_summary(extract_sequences_for_pangolin.out.pangolin_fasta, add_new_pangolin_lineages_to_metadata.out.log) - emit: - metadata = add_new_pangolin_lineages_to_metadata.out.metadata - report = post_pangolin_metadata -} - - -workflow { - uk_fasta = file(params.uk_fasta) - uk_metadata = file(params.uk_metadata) - check_for_pangolin_update() - - pangolin(uk_fasta, uk_metadata, check_for_pangolin_update.out) - publish_metadata(pangolin.out.report, "pangolin") -} diff --git a/workflows/modules/preprocess_cog_uk.nf b/workflows/modules/preprocess_cog_uk.nf deleted file mode 100644 index 9ba6caa..0000000 --- a/workflows/modules/preprocess_cog_uk.nf +++ /dev/null @@ -1,372 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir - - -process uk_strip_header_digits_and_unalign { - /** - * Strips extra header info from FASTA, removed '-' from sequence - * @input uk_fasta - * @output uk_fasta_updated - */ - - input: - path uk_fasta - - output: - path 
"${uk_fasta.baseName}.header_stripped.fasta" - - script: - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import re - def is_iupac(strg, search=re.compile(r'[^ACGTRYSWKMBDHVNacgtryswkmbdhvn-]').search): - return not bool(search(strg)) - - fasta_in = SeqIO.parse("${uk_fasta}", "fasta") - with open("${uk_fasta.baseName}.header_stripped.fasta", 'w') as f: - for record in fasta_in: - seq = str(record.seq).replace('-','') - seq = seq.replace('?','N') - if not is_iupac(seq): - continue - ID = record.description.split("|")[0] - f.write(">" + ID + "\\n") - f.write(seq + "\\n") - """ -} - -process uk_add_published_date_to_metadata { - /** - * Takes the MAJORA TSV of metadata and adds the published_data parameter from - * majora.pag_lookup.tsv - * @input uk_metadata, uk_pag_metadata - * @output uk_metadata_updated_date - */ - - input: - path uk_updated_metadata - path uk_metadata_pag - - output: - path "${uk_updated_metadata.baseName}.pag.csv" - - script: - """ - fastafunk add_columns \ - --in-metadata ${uk_updated_metadata} \ - --in-data ${uk_metadata_pag} \ - --index-column central_sample_id \ - --join-on central_sample_id \ - --force-overwrite \ - --new-columns published_date \ - --out-metadata "${uk_updated_metadata.baseName}.pag.csv" - """ -} - -process uk_anonymise_ids { - /** - If on or after 30th June 2023, replace central ID - for anonymous ID, if they are present. 
- @input uk_metadata - @output uk_metadata_anon - */ - - input: - path uk_metadata - - output: - path "${uk_metadata.baseName}.anon.tsv" - - script: - """ - #!/usr/bin/env python3 - import datetime - import csv - - anon_samp_id_date = datetime.datetime(2023, 6, 30).date() - - with open("${uk_metadata}", 'r', newline = '') as csv_in, open("${uk_metadata.baseName}.anon.tsv", 'w', newline = '') as csv_out: - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix", delimiter="\t") - writer.writeheader() - - for row in reader: - if datetime.datetime.strptime(row["published_date"], "%Y-%m-%d").date() >= anon_samp_id_date: - if row["anonymous_sample_id"]: - row["central_sample_id"] = row["anonymous_sample_id"] - writer.writerow(row) - """ -} - - -process uk_add_columns_to_metadata { - /** - * Takes the MAJORA TSV of metadata and adds/updates columns for sample_date, pillar_2, - * sequence_name, covv_accession_id, edin_epi_week, edin_epi_day and adm0 - * @input uk_metadata - * @output uk_metadata_updated - * @params uk_accessions, uk_updated_dates - */ - - input: - path uk_metadata - path uk_accessions - path uk_updated_dates - - output: - path "${uk_metadata.baseName}.updated.csv" - - script: - """ - $project_dir/../bin/add_to_uk_metadata.py \ - --in-metadata ${uk_metadata} \ - --out-metadata ${uk_metadata.baseName}.updated.csv \ - --accession-file ${uk_accessions} \ - --updated-date-file ${uk_updated_dates} - """ -} - - -process uk_filter_omitted_sequences { - /** - * Takes a FASTA and METADATA and excludes samples specified in an exclusion file - * sequence_name, covv_accession_id, edin_epi_week, edin_epi_day and adm0 - * @input uk_fasta, uk_metadata - * @output uk_fasta_updated, uk_metadata_updated - * @params uk_omissions - */ - input: - path uk_fasta - path uk_metadata - path uk_omissions - - output: - path 
"${uk_fasta.baseName}.omit_filtered.fa", emit: fasta - path "${uk_metadata.baseName}.omit_filtered.csv", emit: metadata - - script: - if ( params.uk_omissions ) - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import csv - - alignment = SeqIO.index("${uk_fasta}", "fasta") - - omissions = set() - with open("${uk_omissions}", "r") as f: - for line in f: - omissions.add(line.rstrip()) - - with open("${uk_metadata}", 'r', newline = '') as csv_in, \ - open("${uk_metadata.baseName}.omit_filtered.csv", 'w', newline = '') as csv_out, \ - open("${uk_fasta.baseName}.omit_filtered.fa", "w") as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - if row["central_sample_id"] in omissions: - row["why_excluded"] = "central_sample_id in omissions_file" - writer.writerow(row) - continue - - if row["fasta_header"] not in alignment: - row["why_excluded"] = "sequences was missing from input or contained non-IUPAC characters" - writer.writerow(row) - continue - - record = alignment[row["fasta_header"]] - writer.writerow(row) - fasta_out.write(">" + record.id + "\\n") - fasta_out.write(str(record.seq) + "\\n") - """ - else - """ - mv "${uk_fasta}" "${uk_fasta.baseName}.omit_filtered.fa" - mv "${uk_metadata}" "${uk_metadata.baseName}.omit_filtered.csv" - """ -} - -process uk_filter_on_sample_date { - /** - * If a time window (in days) is provided, excludes samples from FASTA and - * METADATA files which do not fall within X days of date - * @input uk_fasta, uk_metadata - * @output uk_fasta_update, uk_metadata_updated - * @params time_window, date - */ - - input: - path uk_fasta - path uk_metadata - - output: - path "${uk_fasta.baseName}.date_filtered.fa", emit: fasta - path "${uk_metadata.baseName}.date_filtered.csv", emit: metadata - - script: - if 
( params.time_window && params.date) - """ - #!/usr/bin/env python3 - import datetime - from Bio import SeqIO - import csv - - indexed_fasta = SeqIO.index("${uk_fasta}", "fasta") - - window = datetime.timedelta(int("${params.time_window}")) - todays_date = datetime.datetime.strptime("${params.date}", '%Y-%m-%d').date() - - with open"${uk_metadata}", 'r', newline = '') as csv_in, \ - open("${uk_metadata.baseName}.date_filtered.csv", 'w', newline = '') as csv_out, \ - open("${uk_fasta.baseName}.date_filtered.fa", "w") as fasta_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - try: - date = datetime.datetime.strptime(row["sample_date"], '%Y-%m-%d').date() - except: - row["why_excluded"] = "no sample_date" - writer.writerow(row) - - if (todays_date - window) > date: - row["why_excluded"] = "sample_date older than %s days" %window - writer.writerow(row) - continue - - if row["fasta_header"] not in indexed_fasdta: - row["why_excluded"] = "sequences was missing from input or contained non-IUPAC characters" - writer.writerow(row) - continue - - writer.writerow(row) - - seq_rec = indexed_fasta[row["fasta_header"]] - fasta_out.write(">" + seq_rec.id + "\\n") - fasta_out.write(str(seq_rec.seq) + "\\n") - """ - else - """ - mv "${uk_fasta}" "${uk_fasta.baseName}.date_filtered.fa" - mv "${uk_metadata}" "${uk_metadata.baseName}.date_filtered.csv" - """ -} - - -process add_previous_uk_lineage_to_metadata { - /** - * Adds uk_lineage where previously assigned - * @input metadata - * @output metadata - */ - - memory { 2.GB * task.attempt + metadata.size() * 2.B } - - input: - path metadata - - output: - path "${metadata.baseName}.with_uk_lineage.csv" - - script: - if ( !params.previous_metadata ) - """ - mv ${metadata} 
"${metadata.baseName}.with_uk_lineage.csv" - """ - else - """ - fastafunk add_columns \ - --in-metadata ${metadata} \ - --in-data ${params.previous_metadata} \ - --index-column sequence_name \ - --join-on sequence_name \ - --new-columns uk_lineage \ - --out-metadata "${metadata.baseName}.with_uk_lineage.csv" - """ -} - - -process announce_summary { - /** - * Summarizes preprocess into JSON - * @input fastas - */ - - input: - path original - path strip_header - path filter_omitted_sequences - path filter_on_sample_date - - output: - path "announce.json" - - script: - if (params.webhook) - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Preprocessing COG input ${params.date}*\\n" >> announce.json - echo "> Number of sequences in COG input files : \$(cat ${original} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after header stripped : \$(cat ${strip_header} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after filtering omitted: \$(cat ${filter_omitted_sequences} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after filtering by sample date with time window ${params.time_window}: \$(cat ${filter_on_sample_date} | grep '>' | wc -l)\\n" >> announce.json - echo '"}' >> announce.json - - echo 'webhook ${params.webhook}' - - curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} - """ - else - """ - echo '{"text":"' > announce.json - echo "*${params.whoami}: Preprocessing COG input ${params.date}*\\n" >> announce.json - echo "> Number of sequences in COG input files : \$(cat ${original} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after header stripped : \$(cat ${strip_header} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after filtering omitted: \$(cat ${filter_omitted_sequences} | grep '>' | wc -l)\\n" >> announce.json - echo "> Number of sequences after filtering by sample date with time window 
${params.time_window}: \$(cat ${filter_on_sample_date} | grep '>' | wc -l)\\n" >> announce.json - echo '"}' >> announce.json - """ -} - -uk_updated_dates = file(params.uk_updated_dates) -uk_omissions = file(params.uk_omissions) - -workflow preprocess_cog_uk { - take: - uk_fasta - uk_metadata - uk_accessions - uk_pag - main: - uk_strip_header_digits_and_unalign(uk_fasta) - uk_add_published_date_to_metadata(uk_metadata, uk_pag) - uk_anonymise_ids(uk_add_published_date_to_metadata.out) - uk_add_columns_to_metadata(uk_anonymise_ids.out, uk_accessions, uk_updated_dates) - uk_filter_omitted_sequences(uk_strip_header_digits_and_unalign.out, uk_add_columns_to_metadata.out, uk_omissions) - uk_filter_on_sample_date(uk_filter_omitted_sequences.out.fasta, uk_filter_omitted_sequences.out.metadata) - add_previous_uk_lineage_to_metadata(uk_filter_omitted_sequences.out.metadata) - announce_summary(uk_fasta, uk_strip_header_digits_and_unalign.out, uk_filter_omitted_sequences.out.fasta, uk_filter_on_sample_date.out.fasta) - emit: - fasta = uk_filter_on_sample_date.out.fasta - metadata = add_previous_uk_lineage_to_metadata.out -} - - -workflow { - uk_fasta = file(params.uk_fasta) - uk_metadata = file(params.uk_metadata) - uk_accessions = file(params.uk_accessions) - - preprocess_cog_uk(uk_fasta, - uk_metadata, - uk_accessions) -} diff --git a/workflows/modules/preprocess_gisaid.nf b/workflows/modules/preprocess_gisaid.nf deleted file mode 100644 index f37ac72..0000000 --- a/workflows/modules/preprocess_gisaid.nf +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir - - -process gisaid_process_json { - /** - * Downloads - * @input json - * @output gisaid_fasta, gisaid_metadata - * @params gisaid_omissions - */ - - input: - path json - - output: - path "gisaid.fasta", emit: fasta - path "gisaid.csv", emit: metadata - - script: - """ - datafunk process_gisaid_data \ - --input-json ${json} \ - --input-metadata False \ - 
--exclude-file ${gisaid_omissions} \ - --output-fasta "gisaid.fasta" \ - --output-metadata "gisaid.csv" \ - --exclude-undated - """ -} - - -process gisaid_add_columns_to_metadata { - input: - path gisaid_fasta - path gisaid_metadata - - output: - path "${gisaid_metadata.baseName}.add_metadata.csv" - - script: - """ - #!/usr/bin/env python3 - from Bio import SeqIO - import csv - - alignment = SeqIO.index("${gisaid_fasta}", "fasta") - - with open("${gisaid_metadata}", 'r', newline = '') as csv_in, \ - open("${gisaid_metadata.baseName}.add_metadata.csv", 'w', newline = '') as csv_out: - - reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix") - writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames + ['sequence_name', 'why_excluded'], delimiter=",", quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix") - writer.writeheader() - - for row in reader: - edin_header = row["edin_header"] - new_header = edin_header.split("|")[0] - row['sequence_name'] = new_header - if edin_header not in alignment: - row['why_excluded'] = "filtered during loading from JSON" - elif row["edin_epi_day"] == '': - row['why_excluded'] = "no date" - else: - row['why_excluded'] = "" - writer.writerow(row) - """ -} - - -gisaid_omissions = file(params.gisaid_omissions) - -workflow preprocess_gisaid { - take: - gisaid_json - main: - gisaid_json.splitText( by: params.chunk_size, file: true ).set{ json_chunks } - gisaid_process_json(json_chunks) - gisaid_add_columns_to_metadata(gisaid_process_json.out.fasta, gisaid_process_json.out.metadata) - gisaid_process_json.out.fasta.collectFile(newLine: true).set{ fasta_result } - gisaid_add_columns_to_metadata.out.collectFile(newLine: false, keepHeader: true, skip: 1) - .set{ metadata_result } - emit: - fasta = fasta_result - metadata = metadata_result -} - - -workflow { - gisaid_json = file(params.gisaid_json) - - preprocess_gisaid(gisaid_json) -} diff --git a/workflows/modules/publish_all.nf 
b/workflows/modules/publish_all.nf deleted file mode 100644 index 90e8ee0..0000000 --- a/workflows/modules/publish_all.nf +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -project_dir = projectDir -publish_dir = file(params.publish_dir) -publish_dev = file(params.publish_dev) - - -process combine_cog_gisaid { - /** - * Combines FASTA and METADATA for COG-UK and GISAID - * @input uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata - * @output cog_gisaid_fasta, cog_gisaid_metadata - */ - - publishDir "${publish_dev}/cog_gisaid", pattern: "*.fa", mode: 'copy' - publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_master.csv"} - - input: - path uk_fasta - path uk_metadata - path gisaid_fasta - path gisaid_metadata - - output: - path "cog_gisaid.fa", emit: fasta - path "cog_gisaid.csv", emit: metadata - - script: - """ - fastafunk fetch \ - --in-fasta ${uk_fasta} \ - --in-metadata ${uk_metadata} \ - --index-column sequence_name \ - --filter-column fasta_header covv_accession_id central_sample_id biosample_source_id secondary_identifier root_sample_id source_id \ - sequence_name sample_date safe_sample_date epi_week epi_day collection_date received_date published_date \ - country adm1 adm1_raw adm1_UK adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ - is_uk is_cog_uk \ - submission_org_code submission_user collection_pillar is_pillar_2 is_surveillance is_community is_hcw \ - is_travel_history travel_history \ - lineage lineages_version lineage_conflict lineage_ambiguity_score scorpio_call scorpio_support scorpio_conflict \ - usher_lineage usher_lineages_version \ - source_age source_sex sample_type_collected sample_type_received swab_site \ - ct_n_ct_value ct_n_test_kit ct_n_test_platform ct_n_test_target \ - unmapped_genome_completeness duplicate why_excluded nucleotide_mutations \ - uk_lineage 
microreact_lineage del_lineage del_introduction phylotype \ - --where-column epi_week=edin_epi_week epi_day=edin_epi_day country=adm0 lineage_support=probability lineages_version=pangoLEARN_version adm1_UK=adm1_raw published_date=sequencing_submission_date \ - --out-fasta "intermediate_cog.fa" \ - --out-metadata "intermediate_cog.csv" \ - --restrict --low-memory - - fastafunk fetch \ - --in-fasta ${gisaid_fasta} \ - --in-metadata ${gisaid_metadata} \ - --index-column sequence_name \ - --filter-column fasta_header covv_accession_id central_sample_id biosample_source_id secondary_identifier root_sample_id source_id \ - sequence_name sample_date safe_sample_date epi_week epi_day collection_date received_date published_date \ - country adm1 adm1_raw adm1_UK adm2 outer_postcode adm2_raw adm2_source NUTS1 region latitude longitude location safe_location utla utla_code suggested_adm2_grouping \ - is_uk is_cog_uk \ - submission_org_code submission_user collection_pillar is_pillar_2 is_surveillance is_community is_hcw \ - is_travel_history travel_history \ - lineage lineages_version lineage_conflict lineage_ambiguity_score scorpio_call scorpio_support scorpio_conflict \ - usher_lineage usher_lineages_version \ - source_age source_sex sample_type_collected sample_type_received swab_site \ - ct_n_ct_value ct_n_test_kit ct_n_test_platform ct_n_test_target \ - unmapped_genome_completeness duplicate why_excluded nucleotide_mutations \ - uk_lineage microreact_lineage del_lineage del_introduction phylotype \ - --where-column adm1=edin_admin_1 travel_history=edin_travel published_date=covv_subm_date\ - --out-fasta "intermediate_gisaid.fa" \ - --out-metadata "intermediate_gisaid.csv" \ - --restrict --low-memory - - cat intermediate_cog.fa intermediate_gisaid.fa > cog_gisaid.fa - cat intermediate_cog.csv > cog_gisaid.csv - tail -n+2 intermediate_gisaid.csv >> cog_gisaid.csv - - head -n1 intermediate_cog.csv > head_cog.txt - head -n1 intermediate_gisaid.csv > head_gisaid.txt - cmp 
--silent head_cog.txt head_gisaid.txt || exit 1 - """ -} - - -process combine_mutations { - /** - * Combines FASTA and mutation metadata for COG-UK and GISAID - * @input uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata - * @output cog_gisaid_fasta, cog_gisaid_metadata - */ - - publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_mutations.csv"} - - input: - path uk_mutations - path gisaid_mutations - - output: - path "cog_gisaid_mutations.csv" - - script: - """ - fastafunk merge \ - --in-metadata ${uk_mutations} ${gisaid_mutations} \ - --out-metadata "cog_gisaid_mutations.csv" \ - --index-column "sequence_name" - """ -} - -process combine_constellations { - /** - * Combines FASTA and constellation metadata for COG-UK and GISAID - * @input uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata - * @output cog_gisaid_fasta, cog_gisaid_metadata - */ - - publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_constellations.csv"} - - input: - path uk_constellations - path gisaid_constellations - - output: - path "cog_gisaid_constellations.csv" - - script: - """ - fastafunk merge \ - --in-metadata ${uk_constellations} ${gisaid_constellations} \ - --out-metadata "cog_gisaid_constellations.csv" \ - --index-column "sequence_name" - """ -} - -process combine_updown { - /** - * Combines updown metadata for COG-UK and GISAID - * @input uk_updown gisaid_updown - * @output cog_gisaid_updown - */ - - publishDir "${publish_dev}/cog_gisaid", pattern: "*.csv", mode: 'copy', saveAs: {"cog_gisaid_updown.csv"} - - input: - path uk_updown - path gisaid_updown - - output: - path "cog_gisaid_updown.csv" - - script: - """ - cp ${uk_updown} tmp.csv - tail -n+1 ${gisaid_updown} >> tmp.csv - grep -v ",,,," tmp.csv > "cog_gisaid_updown.csv" - """ -} - - -process split_recipes { - input: - path recipes - - output: - path "*.json" - - script: - """ - #!/usr/bin/env python3 - import json - i = 0 - - with 
open("${recipes}", 'r') as f: - recipes = json.load(f) - - for d in recipes: - for entry in recipes[d]: - new_recipes = {d:[entry]} - with open("%i.json" %i, 'w') as handle: - json.dump(new_recipes,handle) - i += 1 - """ -} - - -process publish_cog_global_recipes { - /** - * Publishes subsets of combined FASTA and METADATA for COG-UK and GISAID - * @input uk_unaligned_fasta, uk_aligned_fasta, uk_trimmed_fasta, combined_fasta, - * uk_metadata, combined_metadata, uk_mutations, combined_mutations - * @params publish_recipes.json - * @output many - */ - - publishDir "${publish_dir}/", pattern: "*/*.*", mode: 'copy', overwrite: false - publishDir "${publish_dir}/", pattern: "README", mode: 'copy', overwrite: false - - memory { 1.GB * task.attempt + combined_metadata.size() * 4.B } - errorStrategy = { 'retry' } - maxRetries 3 - - input: - tuple path(uk_unaligned_fasta),path(uk_aligned_fasta),path(uk_trimmed_fasta),path(combined_fasta),path(uk_metadata),path(combined_metadata),path(combined_mutations),path(combined_constellations),path(combined_updown),path(recipe) - - output: - path "${recipe.baseName}.done.txt", emit: flag - path "README", emit: readme - path "public/cog_${params.date}_all.fa", optional: true, emit: fasta - path "public/cog_${params.date}_metadata.csv", optional: true, emit: metadata - path "public/cog_${params.date}_alignment.fa", optional: true, emit: alignment - path "public/cog_${params.date}_unmasked_alignment.fa", optional: true, emit: unmasked_alignment - path "*/cog_*.*", emit: all - - script: - """ - cp $project_dir/../resources/publish_readme.txt README - - $project_dir/../bin/publish_from_config.py \ - --unaligned_fasta ${uk_unaligned_fasta} \ - --aligned_fasta ${uk_aligned_fasta} \ - --trimmed_fasta ${uk_trimmed_fasta} \ - --cog_global_fasta ${combined_fasta} \ - --cog_metadata ${uk_metadata} \ - --cog_global_metadata ${combined_metadata} \ - --mutations ${combined_mutations} \ - --constellations ${combined_constellations} \ - --updown 
${combined_updown} \ - --recipes ${recipe} \ - --date ${params.date} - touch "${recipe.baseName}.done.txt" - """ -} - -process publish_s3 { - /** - * Publishes public files to s3 - * @input fasta, metadata, aligment, unmasked_alignment - */ - publishDir "${publish_dev}/", pattern: "s3dir", mode: 'copy' - - input: - path fasta - path metadata - path alignment - path unmasked_alignment - - output: - path s3dir - - - script: - """ - mkdir -p s3dir - cp ${fasta} s3dir/cog_all.fasta - cp ${metadata} s3dir/cog_metadata.csv - cp ${alignment} s3dir/cog_alignment.fasta - cp ${unmasked_alignment} s3dir/cog_unmasked_alignment.fasta - """ -} - - -process publish_gisaid_recipes { - /** - * Publishes subsets of combined FASTA and METADATA for COG-UK and GISAID - * @input gisaid_unaligned_fasta, gisaid_aligned_fasta, gisaid_trimmed_fasta, combined_fasta, - * gisaid_metadata, combined_metadata, gisaid_mutations, combined_mutations - * @params publish_recipes.json - * @output many - */ - - publishDir "${publish_dir}/", pattern: "*/*.*", mode: 'copy', overwrite: false - - memory { 1.GB * task.attempt + gisaid_metadata.size() * 8.B } - errorStrategy = { 'retry' } - maxRetries 3 - - input: - tuple path(gisaid_fasta),path(gisaid_metadata),path(gisaid_mutations),path(gisaid_constellations),path(gisaid_updown),path(recipe) - - output: - path "*/gisaid_*.*", emit: all - path "*/gisaid_*_global_alignment.fa", optional: true, emit: fasta - path "*/gisaid_*_global_metadata.csv", optional: true, emit: metadata - path "*/gisaid_*_global_mutations.csv", optional: true, emit: mutations - path "*/gisaid_*_global_constellations.csv", optional: true, emit: constellations - path "*/gisaid_*_global_updown.csv", optional: true, emit: updown - - script: - """ - $project_dir/../bin/publish_from_config.py \ - --recipes ${recipe} \ - --date ${params.date} \ - --gisaid_fasta ${gisaid_fasta} \ - --gisaid_metadata ${gisaid_metadata} \ - --mutations ${gisaid_mutations} \ - --constellations 
${gisaid_constellations} \ - --updown ${gisaid_updown} - """ -} - - -process announce_to_webhook { - input: - file published_files - val name - - script: - if (params.webhook) - """ - echo '{"text":"' > announce.json - echo "*${name} Complete*\\n" >> announce.json - echo "> Dev outputs in : ${publish_dev}\\n" >> announce.json - echo "> Publishable outputs in : ${publish_dir}\\n" >> announce.json - echo '"}' >> announce.json - echo 'webhook ${params.webhook}' - - curl -X POST -H "Content-type: application/json" -d @announce.json ${params.webhook} - """ - else - """ - touch "announce.json" - """ -} - - -geography_utils = file(params.uk_geography) -cog_global_recipes = file(params.publish_cog_global_recipes) -gisaid_recipes = file(params.publish_gisaid_recipes) - - -workflow publish_cog_global { - take: - uk_unaligned_fasta - uk_aligned_fasta - uk_fasta - uk_metadata - uk_mutations - uk_constellations - uk_updown - gisaid_fasta - gisaid_metadata - gisaid_mutations - gisaid_constellations - gisaid_updown - main: - combine_cog_gisaid(uk_fasta, uk_metadata, gisaid_fasta, gisaid_metadata) - combine_mutations(uk_mutations, gisaid_mutations) - combine_constellations(uk_constellations, gisaid_constellations) - combine_updown(uk_updown, gisaid_updown) - split_recipes(cog_global_recipes) - recipe_ch = split_recipes.out.flatten() - uk_unaligned_fasta.combine(uk_aligned_fasta) - .combine(uk_fasta) - .combine(combine_cog_gisaid.out.fasta) - .combine(uk_metadata) - .combine(combine_cog_gisaid.out.metadata) - .combine(combine_mutations.out) - .combine(combine_constellations.out) - .combine(combine_updown.out) - .combine(recipe_ch) - .set{ publish_input_ch } - publish_cog_global_recipes(publish_input_ch) - outputs_ch = publish_cog_global_recipes.out.flag.collect() - announce_to_webhook(outputs_ch, "${params.whoami}") - if ( params.s3 ) - { - publish_s3(publish_cog_global_recipes.out.fasta, publish_cog_global_recipes.out.metadata, publish_cog_global_recipes.out.alignment, 
publish_cog_global_recipes.out.unmasked_alignment) - } -} - - -workflow publish_gisaid { - take: - gisaid_fasta - gisaid_metadata - gisaid_mutations - gisaid_constellations - gisaid_updown - main: - split_recipes(gisaid_recipes) - recipe_ch = split_recipes.out.flatten() - gisaid_fasta.combine(gisaid_metadata) - .combine(gisaid_mutations) - .combine(gisaid_constellations) - .combine(gisaid_updown) - .combine(recipe_ch) - .set{ publish_input_ch } - publish_gisaid_recipes(publish_input_ch) - outputs_ch = publish_gisaid_recipes.out.all.collect() - emit: - fasta = publish_gisaid_recipes.out.fasta - metadata = publish_gisaid_recipes.out.metadata - mutations = publish_gisaid_recipes.out.mutations - constellations = publish_gisaid_recipes.out.constellations - updown = publish_gisaid_recipes.out.updown - published = outputs_ch -} - - -workflow { - uk_unaligned_fasta = Channel.fromPath(params.uk_unaligned_fasta) - uk_aligned_fasta = Channel.fromPath(params.uk_aligned_fasta) - uk_fasta = Channel.fromPath(params.uk_fasta) - uk_metadata = Channel.fromPath(params.uk_metadata) - uk_mutations = Channel.fromPath(params.uk_mutations) - uk_constellations = Channel.fromPath(params.uk_constellations) - uk_updown = Channel.fromPath(params.uk_updown) - - gisaid_fasta = Channel.fromPath(params.gisaid_fasta) - gisaid_metadata = Channel.fromPath(params.gisaid_metadata) - gisaid_mutations = Channel.fromPath(params.gisaid_mutations) - gisaid_constellations = Channel.fromPath(params.gisaid_constellations) - gisaid_updown = Channel.fromPath(params.gisaid_updown) - - publish_all(uk_unaligned_fasta, - uk_aligned_fasta, - uk_fasta, - uk_metadata, - uk_mutations, - uk_constellations, - uk_updown, - gisaid_fasta, - gisaid_metadata, - gisaid_mutations, - gisaid_constellations, - gisaid_updown) -} diff --git a/workflows/modules/start.nf b/workflows/modules/start.nf deleted file mode 100644 index 46931c2..0000000 --- a/workflows/modules/start.nf +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env nextflow - 
-nextflow.enable.dsl = 2 -publish_dev = file(params.publish_dev) - -process get_git_hash { - /** - * Gets git commit - */ - publishDir "${publish_dev}", mode: 'copy', overwrite: true - - input: - path commit_file - - output: - path "${commit_file}" - - script: - """ - echo "\n Git hash \t = \t \$( git rev-parse HEAD) \n\n" >> ${commit_file} - """ -} - -workflow start { - params_file = file("${workDir}/input_params.txt") - params_file << "\n#######################################################################################\n\n" - - printMapClosure = { key, value -> - params_file << "$key = $value\n" - } - params.each(printMapClosure) - get_git_hash(params_file) -} diff --git a/workflows/nextflow.config b/workflows/nextflow.config deleted file mode 100644 index 0909731..0000000 --- a/workflows/nextflow.config +++ /dev/null @@ -1,32 +0,0 @@ -// Global default params, used in configs -workDir = "analysis" - -params { - - // Boilerplate options - help = false - - // cache option makes it a bit easier to set conda or singularity cacheDir - cache = '' - -} - -includeConfig 'config/base.config' - -process { - errorStrategy = { 'retry' } - maxRetries = 5 - - withLabel: retry_increasing_mem { - errorStrategy = { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } - memory = {4.GB * task.attempt} - maxRetries = 5 - } -} - -profiles { - slurm { - process.executor = 'slurm' - process.clusterOptions='--account=lomannj-covid-19-realtime-epidemiology --qos=lomannj --time 600:0 --nodes 1' - } -} diff --git a/workflows/resources/AAs.csv b/workflows/resources/AAs.csv deleted file mode 100644 index 6bab385..0000000 --- a/workflows/resources/AAs.csv +++ /dev/null @@ -1,10 +0,0 @@ -t1001i,3266 -p323l,14407 -a222v,22226 -n439k,22877 -y453f,22919 -e484k,23012 -n501y,23063 -d614g,23402 -p681h,23603 -q27stop,27972 diff --git a/workflows/resources/MN908947.fa b/workflows/resources/MN908947.fa deleted file mode 100644 index e1cfd92..0000000 --- a/workflows/resources/MN908947.fa +++ /dev/null @@ -1,429 +0,0 @@ ->MN908947.3 -ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA -CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC -TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG -TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC -CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC -GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG -CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT -GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC -GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT -TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA -GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG -TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG -CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG -TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG -CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA -ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA 
-CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC -CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA -GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT -ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG -GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG -CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA -CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA -ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA -GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT -TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG -GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG -TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC -GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG -ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG -GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT -AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA -TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT -AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA -GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC -TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT -AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA -GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT -ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA -GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT -GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA -ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC -ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA 
-TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG -AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT -TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA -CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC -AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT -AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA -GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA -CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG -TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT -GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT -TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA -TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT -GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA -AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC -TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA -GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG -ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT -GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT -GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA -TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC -AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA -TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA -CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA -TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT -TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG -AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA -ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC 
-CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA -AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA -ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT -AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG -ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA -TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA -ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA -ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT -GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT -GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT -TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC -ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT -GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG -ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG -TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG -TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC -CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA -CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT -GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC -CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG -TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA -ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA -AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA -TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT -ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG -CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC -AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA 
-TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG -CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA -TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC -TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG -GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT -TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA -TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT -ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC -TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC -TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT -GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG -GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT -GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA -GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA -TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC -AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT -GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT -AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT -AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG -AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT -TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT -ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG -GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT -ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG -AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG -CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT -TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA 
-ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA -ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC -ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC -ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC -CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT -TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT -GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC -TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC -AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA -GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG -GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA -CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC -CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT -ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT -CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG -TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG -CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC -GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC -TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC -TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC -ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT -GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG -ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG -ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG -TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT -ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG -TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT 
-GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA -CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT -TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA -ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG -CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA -TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA -ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC -AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC -TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC -ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA -TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT -AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG -ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT -CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG -TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC -TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG -GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA -GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA -GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC -TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA -AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA -GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA -GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA -TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA -GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG -AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA -TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA 
-ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT -TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG -TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT -GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA -CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT -TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC -TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT -ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT -ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT -GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT -GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG -TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA -AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG -TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA -GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA -CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA -ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT -GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC -CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA -ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC -ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG -ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT -AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA -GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG -TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT -TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA -AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG 
-ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG -ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG -CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT -ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC -AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT -GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC -AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG -ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT -AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC -TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC -TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC -TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG -GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG -TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT -AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA -GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC -AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC -ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC -AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC -ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT -GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA -TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC -AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC -GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA -TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA -CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT -AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG 
-TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT -ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT -GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT -GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA -TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT -GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG -AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT -TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT -AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC -GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT -AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC -TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG -GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT -GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT -AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT -TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA -AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC -ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT -TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC -TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA -GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC -AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA -TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC -TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA -ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT -GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC -TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA 
-CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT -CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA -GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG -TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA -TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA -CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA -GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC -TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT -TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA -TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA -TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT -AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA -AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA -CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT -GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG -TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG -AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC -ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC -CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT -AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT -TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA -ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG -ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA -GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA -AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA -CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA -CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT 
-TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC -CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG -AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA -AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT -AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA -CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA -CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT -ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG -ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA -AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT -ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC -GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA -TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC -GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT -TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA -ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT -AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG -GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG -GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG -GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA -AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG -GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA -CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA -ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA -GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG -TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC -TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC 
-CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT -TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC -GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC -AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA -TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT -TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA -GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA -ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT -GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT -GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG -TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC -ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT -GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG -ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC -TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA -GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT -TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT -TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC -AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC -TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA -GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC -CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA -CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC -TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT -CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG -GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC -CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA 
-ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA -TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC -AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA -TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT -GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC -TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG -ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG -GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA -AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA -CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA -TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG -TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT -ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA -TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA -GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT -TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA -CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC -TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA -GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG -CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC -ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT -ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG -ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA -ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC -GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT -TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT -GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG 
-GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT -AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT -CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA -CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG -GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA -ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC -CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT -TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC -TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT -TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT -GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT -CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA -TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC -CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA -AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT -AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC -ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC -TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT -GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA -GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG -ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG -CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC -TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA -AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC -CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA -GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA -TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT 
-TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT -GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT -ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG -CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA -GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG -TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC -GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA -TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT -GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA -AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG -ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG -TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT -GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC -CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG -TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT -GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA -AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC -ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT -AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA -ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG -TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG -CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC -AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA -ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG -TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC -TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC -TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT 
-TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG -CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT -GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT -TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC -GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT -TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA -AAAAAAAAAAAAA diff --git a/workflows/resources/MN908947.gb b/workflows/resources/MN908947.gb deleted file mode 100644 index 261614d..0000000 --- a/workflows/resources/MN908947.gb +++ /dev/null @@ -1,798 +0,0 @@ -LOCUS MN908947 29903 bp ss-RNA linear VRL 11-FEB-2020 -DEFINITION Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, - complete genome. -ACCESSION MN908947 -VERSION MN908947.3 -KEYWORDS . -SOURCE Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) - ORGANISM Severe acute respiratory syndrome coronavirus 2 - Viruses; Riboviria; Nidovirales; Cornidovirineae; Coronaviridae; - Orthocoronavirinae; Betacoronavirus; Sarbecovirus. -REFERENCE 1 (bases 1 to 29903) - AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., - Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., - Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C. and Zhang,Y.Z. - TITLE A new coronavirus associated with human respiratory disease in - China - JOURNAL Nature (2020) In press - PUBMED 32015508 - REMARK Publication Status: Available-Online prior to print -REFERENCE 2 (bases 1 to 29903) - AUTHORS Wu,F., Zhao,S., Yu,B., Chen,Y.-M., Wang,W., Hu,Y., Song,Z.-G., - Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Yuan,M.L., Zhang,Y.-L., - Dai,F.-H., Liu,Y., Wang,Q.-M., Zheng,J.-J., Xu,L., Holmes,E.C. and - Zhang,Y.-Z. 
- TITLE Direct Submission - JOURNAL Submitted (05-JAN-2020) Shanghai Public Health Clinical Center & - School of Public Health, Fudan University, Shanghai, China -COMMENT On Jan 17, 2020 this sequence version replaced MN908947.2. - - ##Assembly-Data-START## - Assembly Method :: Megahit v. V1.1.3 - Sequencing Technology :: Illumina - ##Assembly-Data-END## -FEATURES Location/Qualifiers - source 1..29903 - /organism="Severe acute respiratory syndrome coronavirus - 2" - /mol_type="genomic RNA" - /isolate="Wuhan-Hu-1" - /host="Homo sapiens" - /db_xref="taxon:2697049" - /country="China" - /collection_date="Dec-2019" - 5'UTR 1..265 - gene 266..21555 - /gene="orf1ab" - CDS join(266..13468,13468..21555) - /gene="orf1ab" - /ribosomal_slippage - /note="pp1ab; translated by -1 ribosomal frameshift" - /codon_start=1 - /product="orf1ab polyprotein" - /protein_id="QHD43415.1" - /translation="MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQ - HLKDGTCGLVEVEKGVLPQLEQPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGE - TLGVLVPHVGEIPVAYRKVLLRKNGNKGAGGHSYGADLKSFDLGDELGTDPYEDFQEN - WNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQ - LDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFP - LNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTG - DFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESG - LKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNL - LEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGN - FKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAA - ITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKL - KPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLV - NKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEII - FLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEK - YCALAPNMMVTNNTFTLKGGAPTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEK - CSAYTVELGTEVNEFACVVADAVIKTLQPVSELLTPLGIDLDEWSMATYYLFDESGEF - KLASHMYCSFYPPDEDEEEGDCEEEEFEPSTQYEYGTEDDYQGKPLEFGATSAALQPE - EEQEEDWLDDDSQQTVGQQDGSEDNQTTTIQTIVEVQPQLEMELTPVVQTIEVNSFSG - 
YLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNKATNNAMQVESD - DYIATNGPLKVGGSCVLSGHNLAKHCLHVVGPNVNKGEDIQLLKSAYENFNQHEVLLA - PLLSAGIFGADPIHSLRVCVDTVRTNVYLAVFDKNLYDKLVSSFLEMKSEKQVEQKIA - EIPKEEVKPFITESKPSVEQRKQDDKKIKACVEEVTTTLEETKFLTENLLLYIDINGN - LHPDSATLVSDIDITFLKKDAPYIVGDVVQEGVLTAVVIPTKKAGGTTEMLAKALRKV - PTDNYITTYPGQGLNGYTVEEAKTVLKKCKSAFYILPSIISNEKQEILGTVSWNLREM - LAHAEETRKLMPVCVETKAIVSTIQRKYKGIKIQEGVVDYGARFYFYTSKTTVASLIN - TLNDLNETLVTMPLGYVTHGLNLEEAARYMRSLKVPATVSVSSPDAVTAYNGYLTSSS - KTPEEHFIETISLAGSYKDWSYSGQSTQLGIEFLKRGDKSVYYTSNPTTFHLDGEVIT - FDNLKTLLSLREVRTIKVFTTVDNINLHTQVVDMSMTYGQQFGPTYLDGADVTKIKPH - NSHEGKTFYVLPNDDTLRVEAFEYYHTTDPSFLGRYMSALNHTKKWKYPQVNGLTSIK - WADNNCYLATALLTLQQIELKFNPPALQDAYYRARAGEAANFCALILAYCNKTVGELG - DVRETMSYLFQHANLDSCKRVLNVVCKTCGQQQTTLKGVEAVMYMGTLSYEQFKKGVQ - IPCTCGKQATKYLVQQESPFVMMSAPPAQYELKHGTFTCASEYTGNYQCGHYKHITSK - ETLYCIDGALLTKSSEYKGPITDVFYKENSYTTTIKPVTYKLDGVVCTEIDPKLDNYY - KKDNSYFTEQPIDLVPNQPYPNASFDNFKFVCDNIKFADDLNQLTGYKKPASRELKVT - FFPDLNGDVVAIDYKHYTPSFKKGAKLLHKPIVWHVNNATNKATYKPNTWCIRCLWST - KPVETSNSFDVLKSEDAQGMDNLACEDLKPVSEEVVENPTIQKDVLECNVKTTEVVGD - IILKPANNSLKITEEVGHTDLMAAYVDNSSLTIKKPNELSRVLGLKTLATHGLAAVNS - VPWDTIANYAKPFLNKVVSTTTNIVTRCLNRVCTNYMPYFFTLLLQLCTFTRSTNSRI - KASMPTTIAKNTVKSVGKFCLEASFNYLKSPNFSKLINIIIWFLLLSVCLGSLIYSTA - ALGVLMSNLGMPSYCTGYREGYLNSTNVTIATYCTGSIPCSVCLSGLDSLDTYPSLET - IQITISSFKWDLTAFGLVAEWFLAYILFTRFFYVLGLAAIMQLFFSYFAVHFISNSWL - MWLIINLVQMAPISAMVRMYIFFASFYYVWKSYVHVVDGCNSSTCMMCYKRNRATRVE - CTTIVNGVRRSFYVYANGGKGFCKLHNWNCVNCDTFCAGSTFISDEVARDLSLQFKRP - INPTDQSSYIVDSVTVKNGSIHLYFDKAGQKTYERHSLSHFVNLDNLRANNTKGSLPI - NVIVFDGKSKCEESSAKSASVYYSQLMCQPILLLDQALVSDVGDSAEVAVKMFDAYVN - TFSSTFNVPMEKLKTLVATAEAELAKNVSLDNVLSTFISAARQGFVDSDVETKDVVEC - LKLSHQSDIEVTGDSCNNYMLTYNKVENMTPRDLGACIDCSARHINAQVAKSHNIALI - WNVKDFMSLSEQLRKQIRSAAKKNNLPFKLTCATTRQVVNVVTTKIALKGGKIVNNWL - KQLIKVTLVFLFVAAIFYLITPVHVMSKHTDFSSEIIGYKAIDGGVTRDIASTDTCFA - NKHADFDTWFSQRGGSYTNDKACPLIAAVITREVGFVVPGLPGTILRTTNGDFLHFLP - 
RVFSAVGNICYTPSKLIEYTDFATSACVLAAECTIFKDASGKPVPYCYDTNVLEGSVA - YESLRPDTRYVLMDGSIIQFPNTYLEGSVRVVTTFDSEYCRHGTCERSEAGVCVSTSG - RWVLNNDYYRSLPGVFCGVDAVNLLTNMFTPLIQPIGALDISASIVAGGIVAIVVTCL - AYYFMRFRRAFGEYSHVVAFNTLLFLMSFTVLCLTPVYSFLPGVYSVIYLYLTFYLTN - DVSFLAHIQWMVMFTPLVPFWITIAYIICISTKHFYWFFSNYLKRRVVFNGVSFSTFE - EAALCTFLLNKEMYLKLRSDVLLPLTQYNRYLALYNKYKYFSGAMDTTSYREAACCHL - AKALNDFSNSGSDVLYQPPQTSITSAVLQSGFRKMAFPSGKVEGCMVQVTCGTTTLNG - LWLDDVVYCPRHVICTSEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVL - KLKVDTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIKGSFLNGSC - GSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTTITVN - VLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAV - LDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQSAVKRTIKGTHHW - LLLTILTSLLVLVQSTQWSLFFFLYENAFLPFAMGIIAMSAFAMMFVKHKHAFLCLFL - LPSLATVAYFNMVYMPASWVMRIMTWLDMVDTSLSGFKLKDCVMYASAVVLLILMTAR - TVYDDGARRVWTLMNVLTLVYKVYYGNALDQAISMWALIISVTSNYSGVVTTVMFLAR - GIVFMCVEYCPIFFITGNTLQCIMLVYCFLGYFCTCYFGLFCLLNRYFRLTLGVYDYL - VSTQEFRYMNSQGLLPPKNSIDAFKLNIKLLGVGGKPCIKVATVQSKMSDVKCTSVVL - LSVLQQLRVESSSKLWAQCVQLHNDILLAKDTTEAFEKMVSLLSVLLSMQGAVDINKL - CEEMLDNRATLQAIASEFSSLPSYAAFATAQEAYEQAVANGDSEVVLKKLKKSLNVAK - SEFDRDAAMQRKLEKMADQAMTQMYKQARSEDKRAKVTSAMQTMLFTMLRKLDNDALN - NIINNARDGCVPLNIIPLTTAAKLMVVIPDYNTYKNTCDGTTFTYASALWEIQQVVDA - DSKIVQLSEISMDNSPNLAWPLIVTALRANSAVKLQNNELSPVALRQMSCAAGTTQTA - CTDDNALAYYNTTKGGRFVLALLSDLQDLKWARFPKSDGTGTIYTELEPPCRFVTDTP - KGPKVKYLYFIKGLNNLNRGMVLGSLAATVRLQAGNATEVPANSTVLSFCAFAVDAAK - AYKDYLASGGQPITNCVKMLCTHTGTGQAITVTPEANMDQESFGGASCCLYCRCHIDH - PNPKGFCDLKGKYVQIPTTCANDPVGFTLKNTVCTVCGMWKGYGCSCDQLREPMLQSA - DAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFLKTNCCRFQEKD - EDDNLIDSYFVVKRHTFSNYQHEETIYNLLKDCPAVAKHDFFKFRIDGDMVPHISRQR - LTKYTMADLVYALRHFDEGNCDTLKEILVTYNCCDDDYFNKKDWYDFVENPDILRVYA - NLGERVRQALLKTVQFCDAMRNAGIVGVLTLDNQDLNGNWYDFGDFIQTTPGSGVPVV - DSYYSLLMPILTLTRALTAESHVDTDLTKPYIKWDLLKYDFTEERLKLFDRYFKYWDQ - TYHPNCVNCLDDRCILHCANFNVLFSTVFPPTSFGPLVRKIFVDGVPFVVSTGYHFRE - 
LGVVHNQDVNLHSSRLSFKELLVYAADPAMHAASGNLLLDKRTTCFSVAALTNNVAFQ - TVKPGNFNKDFYDFAVSKGFFKEGSSVELKHFFFAQDGNAAISDYDYYRYNLPTMCDI - RQLLFVVEVVDKYFDCYDGGCINANQVIVNNLDKSAGFPFNKWGKARLYYDSMSYEDQ - DALFAYTKRNVIPTITQMNLKYAISAKNRARTVAGVSICSTMTNRQFHQKLLKSIAAT - RGATVVIGTSKFYGGWHNMLKTVYSDVENPHLMGWDYPKCDRAMPNMLRIMASLVLAR - KHTTCCSLSHRFYRLANECAQVLSEMVMCGGSLYVKPGGTSSGDATTAYANSVFNICQ - AVTANVNALLSTDGNKIADKYVRNLQHRLYECLYRNRDVDTDFVNEFYAYLRKHFSMM - ILSDDAVVCFNSTYASQGLVASIKNFKSVLYYQNNVFMSEAKCWTETDLTKGPHEFCS - QHTMLVKQGDDYVYLPYPDPSRILGAGCFVDDIVKTDGTLMIERFVSLAIDAYPLTKH - PNQEYADVFHLYLQYIRKLHDELTGHMLDMYSVMLTNDNTSRYWEPEFYEAMYTPHTV - LQAVGACVLCNSQTSLRCGACIRRPFLCCKCCYDHVISTSHKLVLSVNPYVCNAPGCD - VTDVTQLYLGGMSYYCKSHKPPISFPLCANGQVFGLYKNTCVGSDNVTDFNAIATCDW - TNAGDYILANTCTERLKLFAAETLKATEETFKLSYGIATVREVLSDRELHLSWEVGKP - RPPLNRNYVFTGYRVTKNSKVQIGEYTFEKGDYGDAVVYRGTTTYKLNVGDYFVLTSH - TVMPLSAPTLVPQEHYVRITGLYPTLNISDEFSSNVANYQKVGMQKYSTLQGPPGTGK - SHFAIGLALYYPSARIVYTACSHAAVDALCEKALKYLPIDKCSRIIPARARVECFDKF - KVNSTLEQYVFCTVNALPETTADIVVFDEISMATNYDLSVVNARLRAKHYVYIGDPAQ - LPAPRTLLTKGTLEPEYFNSVCRLMKTIGPDMFLGTCRRCPAEIVDTVSALVYDNKLK - AHKDKSAQCFKMFYKGVITHDVSSAINRPQIGVVREFLTRNPAWRKAVFISPYNSQNA - VASKILGLPTQTVDSSQGSEYDYVIFTQTTETAHSCNVNRFNVAITRAKVGILCIMSD - RDLYDKLQFTSLEIPRRNVATLQAENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKT - EGLCVDIPGIPKDMTYRRLISMMGFKMNYQVNGYPNMFITREEAIRHVRAWIGFDVEG - CHATREAVGTNLPLQLGFSTGVNLVAVPTGYVDTPNNTDFSRVSAKPPPGDQFKHLIP - LMYKGLPWNVVRIKIVQMLSDTLKNLSDRVVFVLWAHGFELTSMKYFVKIGPERTCCL - CDRRATCFSTASDTYACWHHSIGFDYVYNPFMIDVQQWGFTGNLQSNHDLYCQVHGNA - HVASCDAIMTRCLAVHECFVKRVDWTIEYPIIGDELKINAACRKVQHMVVKAALLADK - FPVLHDIGNPKAIKCVPQADVEWKFYDAQPCSDKAYKIEELFYSYATHSDKFTDGVCL - FWNCNVDRYPANSIVCRFDTRVLSNLNLPGCDGGSLYVNKHAFHTPAFDKSAFVNLKQ - LPFFYYSDSPCESHGKQVVSDIDYVPLKSATCITRCNLGGAVCRHHANEYRLYLDAYN - MMISAGFSLWVYKQFDTYNLWNTFTRLQSLENVAFNVVNKGHFDGQQGEVPVSIINNT - VYTKVDGVDVELFENKTTLPVNVAFELWAKRNIKPVPEVKILNNLGVDIAANTVIWDY - KRDAPAHISTIGVCSMTDIAKKPTETICAPLTVFFDGRVDGQVDLFRNARNGVLITEG - 
SVKGLQPSVGPKQASLNGVTLIGEAVKTQFNYYKKVDGVVQQLPETYFTQSRNLQEFK - PRSQMEIDFLELAMDEFIERYKLEGYAFEHIVYGDFSHSQLGGLHLLIGLAKRFKESP - FELEDFIPMDSTVKNYFITDAQTGSSKCVCSVIDLLLDDFVEIIKSQDLSVVSKVVKV - TIDYTEISFMLWCKDGHVETFYPKLQSSQAWQPGVAMPNLYKMQRMLLEKCDLQNYGD - SATLPKGIMMNVAKYTQLCQYLNTLTLAVPYNMRVIHFGAGSDKGVAPGTAVLRQWLP - TGTLLVDSDLNDFVSDADSTLIGDCATVHTANKWDLIISDMYDPKTKNVTKENDSKEG - FFTYICGFIQQKLALGGSVAIKITEHSWNADLYKLMGHFAWWTAFVTNVNASSSEAFL - IGCNYLGKPREQIDGYVMHANYIFWRNTNPIQLSSYSLFDMSKFPLKLRGTAVMSLKE - GQINDMILSLLSKGRLIIRENNRVVISSDVLVNN" - gene 21563..25384 - /gene="S" - CDS 21563..25384 - /gene="S" - /note="structural protein" - /codon_start=1 - /product="surface glycoprotein" - /protein_id="QHD43416.1" - /translation="MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFR - SSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIR - GWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVY - SSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQ - GFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFL - LKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITN - LCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCF - TNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYN - YLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPY - RVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFG - RDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAI - HADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPR - RARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTM - YICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFG - GFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFN - GLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQN - VLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGA - ISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMS - ECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAH - FPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELD - SFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELG - 
KYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSE - PVLKGVKLHYT" - gene 25393..26220 - /gene="ORF3a" - CDS 25393..26220 - /gene="ORF3a" - /codon_start=1 - /product="ORF3a protein" - /protein_id="QHD43417.1" - /translation="MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFG - WLIVGVALLAVFQSASKIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLE - APFLYLYALVYFLQSINFVRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPY - NSVTSSIVITSGDGTTSPISEHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQ - LSTDTGVEHVTFFIYNKIVDEPEEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL" - gene 26245..26472 - /gene="E" - CDS 26245..26472 - /gene="E" - /note="structural protein; E protein" - /codon_start=1 - /product="envelope protein" - /protein_id="QHD43418.1" - /translation="MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCC - NIVNVSLVKPSFYVYSRVKNLNSSRVPDLLV" - gene 26523..27191 - /gene="M" - CDS 26523..27191 - /gene="M" - /note="structural protein" - /codon_start=1 - /product="membrane glycoprotein" - /protein_id="QHD43419.1" - /translation="MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNR - FLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRL - FARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCD - IKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIA - LLVQ" - gene 27202..27387 - /gene="ORF6" - CDS 27202..27387 - /gene="ORF6" - /codon_start=1 - /product="ORF6 protein" - /protein_id="QHD43420.1" - /translation="MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL - TENKYSQLDEEQPMEID" - gene 27394..27759 - /gene="ORF7a" - CDS 27394..27759 - /gene="ORF7a" - /codon_start=1 - /product="ORF7a protein" - /protein_id="QHD43421.1" - /translation="MKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS - PFHPLADNKFALTCFSTQFAFACPDGVKHVYQLRARSVSPKLFIRQEEVQELYSPIFL - IVAAIVFITLCFTLKRKTE" - gene 27894..28259 - /gene="ORF8" - CDS 27894..28259 - /gene="ORF8" - /codon_start=1 - /product="ORF8 protein" - /protein_id="QHD43422.1" - /translation="MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSK - WYIRVGARKSAPLIELCVDEAGSKSPIQYIDIGNYTVSCLPFTINCQEPKLGSLVVRC - 
SFYEDFLEYHDVRVVLDFI" - gene 28274..29533 - /gene="N" - CDS 28274..29533 - /gene="N" - /note="structural protein" - /codon_start=1 - /product="nucleocapsid phosphoprotein" - /protein_id="QHD43423.2" - /translation="MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQG - LPNNTASWFTALTQHGKEDLKFPRGQGVPINTNSSPDDQIGYYRRATRRIRGGDGKMK - DLSPRWYFYYLGTGPEAGLPYGANKDGIIWVATEGALNTPKDHIGTRNPANNAAIVLQ - LPQGTTLPKGFYAEGSRGGSQASSRSSSRSRNSSRNSTPGSSRGTSPARMAGNGGDAA - LALLLLDRLNQLESKMSGKGQQQQGQTVTKKSAAEASKKPRQKRTATKAYNVTQAFGR - RGPEQTQGNFGDQELIRQGTDYKHWPQIAQFAPSASAFFGMSRIGMEVTPSGTWLTYT - GAIKLDDKDPNFKDQVILLNKHIDAYKTFPPTEPKKDKKKKADETQALPQRQKKQQTV - TLLPAADLDDFSKQLQQSMSSADSTQA" - gene 29558..29674 - /gene="ORF10" - CDS 29558..29674 - /gene="ORF10" - /codon_start=1 - /product="ORF10 protein" - /protein_id="QHI42199.1" - /translation="MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT" - 3'UTR 29675..29903 -ORIGIN - 1 attaaaggtt tataccttcc caggtaacaa accaaccaac tttcgatctc ttgtagatct - 61 gttctctaaa cgaactttaa aatctgtgtg gctgtcactc ggctgcatgc ttagtgcact - 121 cacgcagtat aattaataac taattactgt cgttgacagg acacgagtaa ctcgtctatc - 181 ttctgcaggc tgcttacggt ttcgtccgtg ttgcagccga tcatcagcac atctaggttt - 241 cgtccgggtg tgaccgaaag gtaagatgga gagccttgtc cctggtttca acgagaaaac - 301 acacgtccaa ctcagtttgc ctgttttaca ggttcgcgac gtgctcgtac gtggctttgg - 361 agactccgtg gaggaggtct tatcagaggc acgtcaacat cttaaagatg gcacttgtgg - 421 cttagtagaa gttgaaaaag gcgttttgcc tcaacttgaa cagccctatg tgttcatcaa - 481 acgttcggat gctcgaactg cacctcatgg tcatgttatg gttgagctgg tagcagaact - 541 cgaaggcatt cagtacggtc gtagtggtga gacacttggt gtccttgtcc ctcatgtggg - 601 cgaaatacca gtggcttacc gcaaggttct tcttcgtaag aacggtaata aaggagctgg - 661 tggccatagt tacggcgccg atctaaagtc atttgactta ggcgacgagc ttggcactga - 721 tccttatgaa gattttcaag aaaactggaa cactaaacat agcagtggtg ttacccgtga - 781 actcatgcgt gagcttaacg gaggggcata cactcgctat gtcgataaca acttctgtgg - 841 ccctgatggc taccctcttg agtgcattaa agaccttcta gcacgtgctg gtaaagcttc - 901 atgcactttg tccgaacaac tggactttat 
tgacactaag aggggtgtat actgctgccg - 961 tgaacatgag catgaaattg cttggtacac ggaacgttct gaaaagagct atgaattgca - 1021 gacacctttt gaaattaaat tggcaaagaa atttgacacc ttcaatgggg aatgtccaaa - 1081 ttttgtattt cccttaaatt ccataatcaa gactattcaa ccaagggttg aaaagaaaaa - 1141 gcttgatggc tttatgggta gaattcgatc tgtctatcca gttgcgtcac caaatgaatg - 1201 caaccaaatg tgcctttcaa ctctcatgaa gtgtgatcat tgtggtgaaa cttcatggca - 1261 gacgggcgat tttgttaaag ccacttgcga attttgtggc actgagaatt tgactaaaga - 1321 aggtgccact acttgtggtt acttacccca aaatgctgtt gttaaaattt attgtccagc - 1381 atgtcacaat tcagaagtag gacctgagca tagtcttgcc gaataccata atgaatctgg - 1441 cttgaaaacc attcttcgta agggtggtcg cactattgcc tttggaggct gtgtgttctc - 1501 ttatgttggt tgccataaca agtgtgccta ttgggttcca cgtgctagcg ctaacatagg - 1561 ttgtaaccat acaggtgttg ttggagaagg ttccgaaggt cttaatgaca accttcttga - 1621 aatactccaa aaagagaaag tcaacatcaa tattgttggt gactttaaac ttaatgaaga - 1681 gatcgccatt attttggcat ctttttctgc ttccacaagt gcttttgtgg aaactgtgaa - 1741 aggtttggat tataaagcat tcaaacaaat tgttgaatcc tgtggtaatt ttaaagttac - 1801 aaaaggaaaa gctaaaaaag gtgcctggaa tattggtgaa cagaaatcaa tactgagtcc - 1861 tctttatgca tttgcatcag aggctgctcg tgttgtacga tcaattttct cccgcactct - 1921 tgaaactgct caaaattctg tgcgtgtttt acagaaggcc gctataacaa tactagatgg - 1981 aatttcacag tattcactga gactcattga tgctatgatg ttcacatctg atttggctac - 2041 taacaatcta gttgtaatgg cctacattac aggtggtgtt gttcagttga cttcgcagtg - 2101 gctaactaac atctttggca ctgtttatga aaaactcaaa cccgtccttg attggcttga - 2161 agagaagttt aaggaaggtg tagagtttct tagagacggt tgggaaattg ttaaatttat - 2221 ctcaacctgt gcttgtgaaa ttgtcggtgg acaaattgtc acctgtgcaa aggaaattaa - 2281 ggagagtgtt cagacattct ttaagcttgt aaataaattt ttggctttgt gtgctgactc - 2341 tatcattatt ggtggagcta aacttaaagc cttgaattta ggtgaaacat ttgtcacgca - 2401 ctcaaaggga ttgtacagaa agtgtgttaa atccagagaa gaaactggcc tactcatgcc - 2461 tctaaaagcc ccaaaagaaa ttatcttctt agagggagaa acacttccca cagaagtgtt - 2521 aacagaggaa gttgtcttga aaactggtga tttacaacca ttagaacaac 
ctactagtga - 2581 agctgttgaa gctccattgg ttggtacacc agtttgtatt aacgggctta tgttgctcga - 2641 aatcaaagac acagaaaagt actgtgccct tgcacctaat atgatggtaa caaacaatac - 2701 cttcacactc aaaggcggtg caccaacaaa ggttactttt ggtgatgaca ctgtgataga - 2761 agtgcaaggt tacaagagtg tgaatatcac ttttgaactt gatgaaagga ttgataaagt - 2821 acttaatgag aagtgctctg cctatacagt tgaactcggt acagaagtaa atgagttcgc - 2881 ctgtgttgtg gcagatgctg tcataaaaac tttgcaacca gtatctgaat tacttacacc - 2941 actgggcatt gatttagatg agtggagtat ggctacatac tacttatttg atgagtctgg - 3001 tgagtttaaa ttggcttcac atatgtattg ttctttctac cctccagatg aggatgaaga - 3061 agaaggtgat tgtgaagaag aagagtttga gccatcaact caatatgagt atggtactga - 3121 agatgattac caaggtaaac ctttggaatt tggtgccact tctgctgctc ttcaacctga - 3181 agaagagcaa gaagaagatt ggttagatga tgatagtcaa caaactgttg gtcaacaaga - 3241 cggcagtgag gacaatcaga caactactat tcaaacaatt gttgaggttc aacctcaatt - 3301 agagatggaa cttacaccag ttgttcagac tattgaagtg aatagtttta gtggttattt - 3361 aaaacttact gacaatgtat acattaaaaa tgcagacatt gtggaagaag ctaaaaaggt - 3421 aaaaccaaca gtggttgtta atgcagccaa tgtttacctt aaacatggag gaggtgttgc - 3481 aggagcctta aataaggcta ctaacaatgc catgcaagtt gaatctgatg attacatagc - 3541 tactaatgga ccacttaaag tgggtggtag ttgtgtttta agcggacaca atcttgctaa - 3601 acactgtctt catgttgtcg gcccaaatgt taacaaaggt gaagacattc aacttcttaa - 3661 gagtgcttat gaaaatttta atcagcacga agttctactt gcaccattat tatcagctgg - 3721 tatttttggt gctgacccta tacattcttt aagagtttgt gtagatactg ttcgcacaaa - 3781 tgtctactta gctgtctttg ataaaaatct ctatgacaaa cttgtttcaa gctttttgga - 3841 aatgaagagt gaaaagcaag ttgaacaaaa gatcgctgag attcctaaag aggaagttaa - 3901 gccatttata actgaaagta aaccttcagt tgaacagaga aaacaagatg ataagaaaat - 3961 caaagcttgt gttgaagaag ttacaacaac tctggaagaa actaagttcc tcacagaaaa - 4021 cttgttactt tatattgaca ttaatggcaa tcttcatcca gattctgcca ctcttgttag - 4081 tgacattgac atcactttct taaagaaaga tgctccatat atagtgggtg atgttgttca - 4141 agagggtgtt ttaactgctg tggttatacc tactaaaaag gctggtggca ctactgaaat - 4201 gctagcgaaa 
gctttgagaa aagtgccaac agacaattat ataaccactt acccgggtca - 4261 gggtttaaat ggttacactg tagaggaggc aaagacagtg cttaaaaagt gtaaaagtgc - 4321 cttttacatt ctaccatcta ttatctctaa tgagaagcaa gaaattcttg gaactgtttc - 4381 ttggaatttg cgagaaatgc ttgcacatgc agaagaaaca cgcaaattaa tgcctgtctg - 4441 tgtggaaact aaagccatag tttcaactat acagcgtaaa tataagggta ttaaaataca - 4501 agagggtgtg gttgattatg gtgctagatt ttacttttac accagtaaaa caactgtagc - 4561 gtcacttatc aacacactta acgatctaaa tgaaactctt gttacaatgc cacttggcta - 4621 tgtaacacat ggcttaaatt tggaagaagc tgctcggtat atgagatctc tcaaagtgcc - 4681 agctacagtt tctgtttctt cacctgatgc tgttacagcg tataatggtt atcttacttc - 4741 ttcttctaaa acacctgaag aacattttat tgaaaccatc tcacttgctg gttcctataa - 4801 agattggtcc tattctggac aatctacaca actaggtata gaatttctta agagaggtga - 4861 taaaagtgta tattacacta gtaatcctac cacattccac ctagatggtg aagttatcac - 4921 ctttgacaat cttaagacac ttctttcttt gagagaagtg aggactatta aggtgtttac - 4981 aacagtagac aacattaacc tccacacgca agttgtggac atgtcaatga catatggaca - 5041 acagtttggt ccaacttatt tggatggagc tgatgttact aaaataaaac ctcataattc - 5101 acatgaaggt aaaacatttt atgttttacc taatgatgac actctacgtg ttgaggcttt - 5161 tgagtactac cacacaactg atcctagttt tctgggtagg tacatgtcag cattaaatca - 5221 cactaaaaag tggaaatacc cacaagttaa tggtttaact tctattaaat gggcagataa - 5281 caactgttat cttgccactg cattgttaac actccaacaa atagagttga agtttaatcc - 5341 acctgctcta caagatgctt attacagagc aagggctggt gaagctgcta acttttgtgc - 5401 acttatctta gcctactgta ataagacagt aggtgagtta ggtgatgtta gagaaacaat - 5461 gagttacttg tttcaacatg ccaatttaga ttcttgcaaa agagtcttga acgtggtgtg - 5521 taaaacttgt ggacaacagc agacaaccct taagggtgta gaagctgtta tgtacatggg - 5581 cacactttct tatgaacaat ttaagaaagg tgttcagata ccttgtacgt gtggtaaaca - 5641 agctacaaaa tatctagtac aacaggagtc accttttgtt atgatgtcag caccacctgc - 5701 tcagtatgaa cttaagcatg gtacatttac ttgtgctagt gagtacactg gtaattacca - 5761 gtgtggtcac tataaacata taacttctaa agaaactttg tattgcatag acggtgcttt - 5821 acttacaaag tcctcagaat acaaaggtcc 
tattacggat gttttctaca aagaaaacag - 5881 ttacacaaca accataaaac cagttactta taaattggat ggtgttgttt gtacagaaat - 5941 tgaccctaag ttggacaatt attataagaa agacaattct tatttcacag agcaaccaat - 6001 tgatcttgta ccaaaccaac catatccaaa cgcaagcttc gataatttta agtttgtatg - 6061 tgataatatc aaatttgctg atgatttaaa ccagttaact ggttataaga aacctgcttc - 6121 aagagagctt aaagttacat ttttccctga cttaaatggt gatgtggtgg ctattgatta - 6181 taaacactac acaccctctt ttaagaaagg agctaaattg ttacataaac ctattgtttg - 6241 gcatgttaac aatgcaacta ataaagccac gtataaacca aatacctggt gtatacgttg - 6301 tctttggagc acaaaaccag ttgaaacatc aaattcgttt gatgtactga agtcagagga - 6361 cgcgcaggga atggataatc ttgcctgcga agatctaaaa ccagtctctg aagaagtagt - 6421 ggaaaatcct accatacaga aagacgttct tgagtgtaat gtgaaaacta ccgaagttgt - 6481 aggagacatt atacttaaac cagcaaataa tagtttaaaa attacagaag aggttggcca - 6541 cacagatcta atggctgctt atgtagacaa ttctagtctt actattaaga aacctaatga - 6601 attatctaga gtattaggtt tgaaaaccct tgctactcat ggtttagctg ctgttaatag - 6661 tgtcccttgg gatactatag ctaattatgc taagcctttt cttaacaaag ttgttagtac - 6721 aactactaac atagttacac ggtgtttaaa ccgtgtttgt actaattata tgccttattt - 6781 ctttacttta ttgctacaat tgtgtacttt tactagaagt acaaattcta gaattaaagc - 6841 atctatgccg actactatag caaagaatac tgttaagagt gtcggtaaat tttgtctaga - 6901 ggcttcattt aattatttga agtcacctaa tttttctaaa ctgataaata ttataatttg - 6961 gtttttacta ttaagtgttt gcctaggttc tttaatctac tcaaccgctg ctttaggtgt - 7021 tttaatgtct aatttaggca tgccttctta ctgtactggt tacagagaag gctatttgaa - 7081 ctctactaat gtcactattg caacctactg tactggttct ataccttgta gtgtttgtct - 7141 tagtggttta gattctttag acacctatcc ttctttagaa actatacaaa ttaccatttc - 7201 atcttttaaa tgggatttaa ctgcttttgg cttagttgca gagtggtttt tggcatatat - 7261 tcttttcact aggtttttct atgtacttgg attggctgca atcatgcaat tgtttttcag - 7321 ctattttgca gtacatttta ttagtaattc ttggcttatg tggttaataa ttaatcttgt - 7381 acaaatggcc ccgatttcag ctatggttag aatgtacatc ttctttgcat cattttatta - 7441 tgtatggaaa agttatgtgc atgttgtaga cggttgtaat tcatcaactt 
gtatgatgtg - 7501 ttacaaacgt aatagagcaa caagagtcga atgtacaact attgttaatg gtgttagaag - 7561 gtccttttat gtctatgcta atggaggtaa aggcttttgc aaactacaca attggaattg - 7621 tgttaattgt gatacattct gtgctggtag tacatttatt agtgatgaag ttgcgagaga - 7681 cttgtcacta cagtttaaaa gaccaataaa tcctactgac cagtcttctt acatcgttga - 7741 tagtgttaca gtgaagaatg gttccatcca tctttacttt gataaagctg gtcaaaagac - 7801 ttatgaaaga cattctctct ctcattttgt taacttagac aacctgagag ctaataacac - 7861 taaaggttca ttgcctatta atgttatagt ttttgatggt aaatcaaaat gtgaagaatc - 7921 atctgcaaaa tcagcgtctg tttactacag tcagcttatg tgtcaaccta tactgttact - 7981 agatcaggca ttagtgtctg atgttggtga tagtgcggaa gttgcagtta aaatgtttga - 8041 tgcttacgtt aatacgtttt catcaacttt taacgtacca atggaaaaac tcaaaacact - 8101 agttgcaact gcagaagctg aacttgcaaa gaatgtgtcc ttagacaatg tcttatctac - 8161 ttttatttca gcagctcggc aagggtttgt tgattcagat gtagaaacta aagatgttgt - 8221 tgaatgtctt aaattgtcac atcaatctga catagaagtt actggcgata gttgtaataa - 8281 ctatatgctc acctataaca aagttgaaaa catgacaccc cgtgaccttg gtgcttgtat - 8341 tgactgtagt gcgcgtcata ttaatgcgca ggtagcaaaa agtcacaaca ttgctttgat - 8401 atggaacgtt aaagatttca tgtcattgtc tgaacaacta cgaaaacaaa tacgtagtgc - 8461 tgctaaaaag aataacttac cttttaagtt gacatgtgca actactagac aagttgttaa - 8521 tgttgtaaca acaaagatag cacttaaggg tggtaaaatt gttaataatt ggttgaagca - 8581 gttaattaaa gttacacttg tgttcctttt tgttgctgct attttctatt taataacacc - 8641 tgttcatgtc atgtctaaac atactgactt ttcaagtgaa atcataggat acaaggctat - 8701 tgatggtggt gtcactcgtg acatagcatc tacagatact tgttttgcta acaaacatgc - 8761 tgattttgac acatggttta gccagcgtgg tggtagttat actaatgaca aagcttgccc - 8821 attgattgct gcagtcataa caagagaagt gggttttgtc gtgcctggtt tgcctggcac - 8881 gatattacgc acaactaatg gtgacttttt gcatttctta cctagagttt ttagtgcagt - 8941 tggtaacatc tgttacacac catcaaaact tatagagtac actgactttg caacatcagc - 9001 ttgtgttttg gctgctgaat gtacaatttt taaagatgct tctggtaagc cagtaccata - 9061 ttgttatgat accaatgtac tagaaggttc tgttgcttat gaaagtttac gccctgacac - 9121 acgttatgtg 
ctcatggatg gctctattat tcaatttcct aacacctacc ttgaaggttc - 9181 tgttagagtg gtaacaactt ttgattctga gtactgtagg cacggcactt gtgaaagatc - 9241 agaagctggt gtttgtgtat ctactagtgg tagatgggta cttaacaatg attattacag - 9301 atctttacca ggagttttct gtggtgtaga tgctgtaaat ttacttacta atatgtttac - 9361 accactaatt caacctattg gtgctttgga catatcagca tctatagtag ctggtggtat - 9421 tgtagctatc gtagtaacat gccttgccta ctattttatg aggtttagaa gagcttttgg - 9481 tgaatacagt catgtagttg cctttaatac tttactattc cttatgtcat tcactgtact - 9541 ctgtttaaca ccagtttact cattcttacc tggtgtttat tctgttattt acttgtactt - 9601 gacattttat cttactaatg atgtttcttt tttagcacat attcagtgga tggttatgtt - 9661 cacaccttta gtacctttct ggataacaat tgcttatatc atttgtattt ccacaaagca - 9721 tttctattgg ttctttagta attacctaaa gagacgtgta gtctttaatg gtgtttcctt - 9781 tagtactttt gaagaagctg cgctgtgcac ctttttgtta aataaagaaa tgtatctaaa - 9841 gttgcgtagt gatgtgctat tacctcttac gcaatataat agatacttag ctctttataa - 9901 taagtacaag tattttagtg gagcaatgga tacaactagc tacagagaag ctgcttgttg - 9961 tcatctcgca aaggctctca atgacttcag taactcaggt tctgatgttc tttaccaacc - 10021 accacaaacc tctatcacct cagctgtttt gcagagtggt tttagaaaaa tggcattccc - 10081 atctggtaaa gttgagggtt gtatggtaca agtaacttgt ggtacaacta cacttaacgg - 10141 tctttggctt gatgacgtag tttactgtcc aagacatgtg atctgcacct ctgaagacat - 10201 gcttaaccct aattatgaag atttactcat tcgtaagtct aatcataatt tcttggtaca - 10261 ggctggtaat gttcaactca gggttattgg acattctatg caaaattgtg tacttaagct - 10321 taaggttgat acagccaatc ctaagacacc taagtataag tttgttcgca ttcaaccagg - 10381 acagactttt tcagtgttag cttgttacaa tggttcacca tctggtgttt accaatgtgc - 10441 tatgaggccc aatttcacta ttaagggttc attccttaat ggttcatgtg gtagtgttgg - 10501 ttttaacata gattatgact gtgtctcttt ttgttacatg caccatatgg aattaccaac - 10561 tggagttcat gctggcacag acttagaagg taacttttat ggaccttttg ttgacaggca - 10621 aacagcacaa gcagctggta cggacacaac tattacagtt aatgttttag cttggttgta - 10681 cgctgctgtt ataaatggag acaggtggtt tctcaatcga tttaccacaa ctcttaatga - 10741 ctttaacctt gtggctatga 
agtacaatta tgaacctcta acacaagacc atgttgacat - 10801 actaggacct ctttctgctc aaactggaat tgccgtttta gatatgtgtg cttcattaaa - 10861 agaattactg caaaatggta tgaatggacg taccatattg ggtagtgctt tattagaaga - 10921 tgaatttaca ccttttgatg ttgttagaca atgctcaggt gttactttcc aaagtgcagt - 10981 gaaaagaaca atcaagggta cacaccactg gttgttactc acaattttga cttcactttt - 11041 agttttagtc cagagtactc aatggtcttt gttctttttt ttgtatgaaa atgccttttt - 11101 accttttgct atgggtatta ttgctatgtc tgcttttgca atgatgtttg tcaaacataa - 11161 gcatgcattt ctctgtttgt ttttgttacc ttctcttgcc actgtagctt attttaatat - 11221 ggtctatatg cctgctagtt gggtgatgcg tattatgaca tggttggata tggttgatac - 11281 tagtttgtct ggttttaagc taaaagactg tgttatgtat gcatcagctg tagtgttact - 11341 aatccttatg acagcaagaa ctgtgtatga tgatggtgct aggagagtgt ggacacttat - 11401 gaatgtcttg acactcgttt ataaagttta ttatggtaat gctttagatc aagccatttc - 11461 catgtgggct cttataatct ctgttacttc taactactca ggtgtagtta caactgtcat - 11521 gtttttggcc agaggtattg tttttatgtg tgttgagtat tgccctattt tcttcataac - 11581 tggtaataca cttcagtgta taatgctagt ttattgtttc ttaggctatt tttgtacttg - 11641 ttactttggc ctcttttgtt tactcaaccg ctactttaga ctgactcttg gtgtttatga - 11701 ttacttagtt tctacacagg agtttagata tatgaattca cagggactac tcccacccaa - 11761 gaatagcata gatgccttca aactcaacat taaattgttg ggtgttggtg gcaaaccttg - 11821 tatcaaagta gccactgtac agtctaaaat gtcagatgta aagtgcacat cagtagtctt - 11881 actctcagtt ttgcaacaac tcagagtaga atcatcatct aaattgtggg ctcaatgtgt - 11941 ccagttacac aatgacattc tcttagctaa agatactact gaagcctttg aaaaaatggt - 12001 ttcactactt tctgttttgc tttccatgca gggtgctgta gacataaaca agctttgtga - 12061 agaaatgctg gacaacaggg caaccttaca agctatagcc tcagagttta gttcccttcc - 12121 atcatatgca gcttttgcta ctgctcaaga agcttatgag caggctgttg ctaatggtga - 12181 ttctgaagtt gttcttaaaa agttgaagaa gtctttgaat gtggctaaat ctgaatttga - 12241 ccgtgatgca gccatgcaac gtaagttgga aaagatggct gatcaagcta tgacccaaat - 12301 gtataaacag gctagatctg aggacaagag ggcaaaagtt actagtgcta tgcagacaat - 12361 gcttttcact atgcttagaa 
agttggataa tgatgcactc aacaacatta tcaacaatgc - 12421 aagagatggt tgtgttccct tgaacataat acctcttaca acagcagcca aactaatggt - 12481 tgtcatacca gactataaca catataaaaa tacgtgtgat ggtacaacat ttacttatgc - 12541 atcagcattg tgggaaatcc aacaggttgt agatgcagat agtaaaattg ttcaacttag - 12601 tgaaattagt atggacaatt cacctaattt agcatggcct cttattgtaa cagctttaag - 12661 ggccaattct gctgtcaaat tacagaataa tgagcttagt cctgttgcac tacgacagat - 12721 gtcttgtgct gccggtacta cacaaactgc ttgcactgat gacaatgcgt tagcttacta - 12781 caacacaaca aagggaggta ggtttgtact tgcactgtta tccgatttac aggatttgaa - 12841 atgggctaga ttccctaaga gtgatggaac tggtactatc tatacagaac tggaaccacc - 12901 ttgtaggttt gttacagaca cacctaaagg tcctaaagtg aagtatttat actttattaa - 12961 aggattaaac aacctaaata gaggtatggt acttggtagt ttagctgcca cagtacgtct - 13021 acaagctggt aatgcaacag aagtgcctgc caattcaact gtattatctt tctgtgcttt - 13081 tgctgtagat gctgctaaag cttacaaaga ttatctagct agtgggggac aaccaatcac - 13141 taattgtgtt aagatgttgt gtacacacac tggtactggt caggcaataa cagttacacc - 13201 ggaagccaat atggatcaag aatcctttgg tggtgcatcg tgttgtctgt actgccgttg - 13261 ccacatagat catccaaatc ctaaaggatt ttgtgactta aaaggtaagt atgtacaaat - 13321 acctacaact tgtgctaatg accctgtggg ttttacactt aaaaacacag tctgtaccgt - 13381 ctgcggtatg tggaaaggtt atggctgtag ttgtgatcaa ctccgcgaac ccatgcttca - 13441 gtcagctgat gcacaatcgt ttttaaacgg gtttgcggtg taagtgcagc ccgtcttaca - 13501 ccgtgcggca caggcactag tactgatgtc gtatacaggg cttttgacat ctacaatgat - 13561 aaagtagctg gttttgctaa attcctaaaa actaattgtt gtcgcttcca agaaaaggac - 13621 gaagatgaca atttaattga ttcttacttt gtagttaaga gacacacttt ctctaactac - 13681 caacatgaag aaacaattta taatttactt aaggattgtc cagctgttgc taaacatgac - 13741 ttctttaagt ttagaataga cggtgacatg gtaccacata tatcacgtca acgtcttact - 13801 aaatacacaa tggcagacct cgtctatgct ttaaggcatt ttgatgaagg taattgtgac - 13861 acattaaaag aaatacttgt cacatacaat tgttgtgatg atgattattt caataaaaag - 13921 gactggtatg attttgtaga aaacccagat atattacgcg tatacgccaa cttaggtgaa - 13981 cgtgtacgcc aagctttgtt 
aaaaacagta caattctgtg atgccatgcg aaatgctggt - 14041 attgttggtg tactgacatt agataatcaa gatctcaatg gtaactggta tgatttcggt - 14101 gatttcatac aaaccacgcc aggtagtgga gttcctgttg tagattctta ttattcattg - 14161 ttaatgccta tattaacctt gaccagggct ttaactgcag agtcacatgt tgacactgac - 14221 ttaacaaagc cttacattaa gtgggatttg ttaaaatatg acttcacgga agagaggtta - 14281 aaactctttg accgttattt taaatattgg gatcagacat accacccaaa ttgtgttaac - 14341 tgtttggatg acagatgcat tctgcattgt gcaaacttta atgttttatt ctctacagtg - 14401 ttcccaccta caagttttgg accactagtg agaaaaatat ttgttgatgg tgttccattt - 14461 gtagtttcaa ctggatacca cttcagagag ctaggtgttg tacataatca ggatgtaaac - 14521 ttacatagct ctagacttag ttttaaggaa ttacttgtgt atgctgctga ccctgctatg - 14581 cacgctgctt ctggtaatct attactagat aaacgcacta cgtgcttttc agtagctgca - 14641 cttactaaca atgttgcttt tcaaactgtc aaacccggta attttaacaa agacttctat - 14701 gactttgctg tgtctaaggg tttctttaag gaaggaagtt ctgttgaatt aaaacacttc - 14761 ttctttgctc aggatggtaa tgctgctatc agcgattatg actactatcg ttataatcta - 14821 ccaacaatgt gtgatatcag acaactacta tttgtagttg aagttgttga taagtacttt - 14881 gattgttacg atggtggctg tattaatgct aaccaagtca tcgtcaacaa cctagacaaa - 14941 tcagctggtt ttccatttaa taaatggggt aaggctagac tttattatga ttcaatgagt - 15001 tatgaggatc aagatgcact tttcgcatat acaaaacgta atgtcatccc tactataact - 15061 caaatgaatc ttaagtatgc cattagtgca aagaatagag ctcgcaccgt agctggtgtc - 15121 tctatctgta gtactatgac caatagacag tttcatcaaa aattattgaa atcaatagcc - 15181 gccactagag gagctactgt agtaattgga acaagcaaat tctatggtgg ttggcacaac - 15241 atgttaaaaa ctgtttatag tgatgtagaa aaccctcacc ttatgggttg ggattatcct - 15301 aaatgtgata gagccatgcc taacatgctt agaattatgg cctcacttgt tcttgctcgc - 15361 aaacatacaa cgtgttgtag cttgtcacac cgtttctata gattagctaa tgagtgtgct - 15421 caagtattga gtgaaatggt catgtgtggc ggttcactat atgttaaacc aggtggaacc - 15481 tcatcaggag atgccacaac tgcttatgct aatagtgttt ttaacatttg tcaagctgtc - 15541 acggccaatg ttaatgcact tttatctact gatggtaaca aaattgccga taagtatgtc - 15601 cgcaatttac aacacagact 
ttatgagtgt ctctatagaa atagagatgt tgacacagac - 15661 tttgtgaatg agttttacgc atatttgcgt aaacatttct caatgatgat actctctgac - 15721 gatgctgttg tgtgtttcaa tagcacttat gcatctcaag gtctagtggc tagcataaag - 15781 aactttaagt cagttcttta ttatcaaaac aatgttttta tgtctgaagc aaaatgttgg - 15841 actgagactg accttactaa aggacctcat gaattttgct ctcaacatac aatgctagtt - 15901 aaacagggtg atgattatgt gtaccttcct tacccagatc catcaagaat cctaggggcc - 15961 ggctgttttg tagatgatat cgtaaaaaca gatggtacac ttatgattga acggttcgtg - 16021 tctttagcta tagatgctta cccacttact aaacatccta atcaggagta tgctgatgtc - 16081 tttcatttgt acttacaata cataagaaag ctacatgatg agttaacagg acacatgtta - 16141 gacatgtatt ctgttatgct tactaatgat aacacttcaa ggtattggga acctgagttt - 16201 tatgaggcta tgtacacacc gcatacagtc ttacaggctg ttggggcttg tgttctttgc - 16261 aattcacaga cttcattaag atgtggtgct tgcatacgta gaccattctt atgttgtaaa - 16321 tgctgttacg accatgtcat atcaacatca cataaattag tcttgtctgt taatccgtat - 16381 gtttgcaatg ctccaggttg tgatgtcaca gatgtgactc aactttactt aggaggtatg - 16441 agctattatt gtaaatcaca taaaccaccc attagttttc cattgtgtgc taatggacaa - 16501 gtttttggtt tatataaaaa tacatgtgtt ggtagcgata atgttactga ctttaatgca - 16561 attgcaacat gtgactggac aaatgctggt gattacattt tagctaacac ctgtactgaa - 16621 agactcaagc tttttgcagc agaaacgctc aaagctactg aggagacatt taaactgtct - 16681 tatggtattg ctactgtacg tgaagtgctg tctgacagag aattacatct ttcatgggaa - 16741 gttggtaaac ctagaccacc acttaaccga aattatgtct ttactggtta tcgtgtaact - 16801 aaaaacagta aagtacaaat aggagagtac acctttgaaa aaggtgacta tggtgatgct - 16861 gttgtttacc gaggtacaac aacttacaaa ttaaatgttg gtgattattt tgtgctgaca - 16921 tcacatacag taatgccatt aagtgcacct acactagtgc cacaagagca ctatgttaga - 16981 attactggct tatacccaac actcaatatc tcagatgagt tttctagcaa tgttgcaaat - 17041 tatcaaaagg ttggtatgca aaagtattct acactccagg gaccacctgg tactggtaag - 17101 agtcattttg ctattggcct agctctctac tacccttctg ctcgcatagt gtatacagct - 17161 tgctctcatg ccgctgttga tgcactatgt gagaaggcat taaaatattt gcctatagat - 17221 aaatgtagta gaattatacc 
tgcacgtgct cgtgtagagt gttttgataa attcaaagtg - 17281 aattcaacat tagaacagta tgtcttttgt actgtaaatg cattgcctga gacgacagca - 17341 gatatagttg tctttgatga aatttcaatg gccacaaatt atgatttgag tgttgtcaat - 17401 gccagattac gtgctaagca ctatgtgtac attggcgacc ctgctcaatt acctgcacca - 17461 cgcacattgc taactaaggg cacactagaa ccagaatatt tcaattcagt gtgtagactt - 17521 atgaaaacta taggtccaga catgttcctc ggaacttgtc ggcgttgtcc tgctgaaatt - 17581 gttgacactg tgagtgcttt ggtttatgat aataagctta aagcacataa agacaaatca - 17641 gctcaatgct ttaaaatgtt ttataagggt gttatcacgc atgatgtttc atctgcaatt - 17701 aacaggccac aaataggcgt ggtaagagaa ttccttacac gtaaccctgc ttggagaaaa - 17761 gctgtcttta tttcacctta taattcacag aatgctgtag cctcaaagat tttgggacta - 17821 ccaactcaaa ctgttgattc atcacagggc tcagaatatg actatgtcat attcactcaa - 17881 accactgaaa cagctcactc ttgtaatgta aacagattta atgttgctat taccagagca - 17941 aaagtaggca tactttgcat aatgtctgat agagaccttt atgacaagtt gcaatttaca - 18001 agtcttgaaa ttccacgtag gaatgtggca actttacaag ctgaaaatgt aacaggactc - 18061 tttaaagatt gtagtaaggt aatcactggg ttacatccta cacaggcacc tacacacctc - 18121 agtgttgaca ctaaattcaa aactgaaggt ttatgtgttg acatacctgg catacctaag - 18181 gacatgacct atagaagact catctctatg atgggtttta aaatgaatta tcaagttaat - 18241 ggttacccta acatgtttat cacccgcgaa gaagctataa gacatgtacg tgcatggatt - 18301 ggcttcgatg tcgaggggtg tcatgctact agagaagctg ttggtaccaa tttaccttta - 18361 cagctaggtt tttctacagg tgttaaccta gttgctgtac ctacaggtta tgttgataca - 18421 cctaataata cagatttttc cagagttagt gctaaaccac cgcctggaga tcaatttaaa - 18481 cacctcatac cacttatgta caaaggactt ccttggaatg tagtgcgtat aaagattgta - 18541 caaatgttaa gtgacacact taaaaatctc tctgacagag tcgtatttgt cttatgggca - 18601 catggctttg agttgacatc tatgaagtat tttgtgaaaa taggacctga gcgcacctgt - 18661 tgtctatgtg atagacgtgc cacatgcttt tccactgctt cagacactta tgcctgttgg - 18721 catcattcta ttggatttga ttacgtctat aatccgttta tgattgatgt tcaacaatgg - 18781 ggttttacag gtaacctaca aagcaaccat gatctgtatt gtcaagtcca tggtaatgca - 18841 catgtagcta gttgtgatgc 
aatcatgact aggtgtctag ctgtccacga gtgctttgtt - 18901 aagcgtgttg actggactat tgaatatcct ataattggtg atgaactgaa gattaatgcg - 18961 gcttgtagaa aggttcaaca catggttgtt aaagctgcat tattagcaga caaattccca - 19021 gttcttcacg acattggtaa ccctaaagct attaagtgtg tacctcaagc tgatgtagaa - 19081 tggaagttct atgatgcaca gccttgtagt gacaaagctt ataaaataga agaattattc - 19141 tattcttatg ccacacattc tgacaaattc acagatggtg tatgcctatt ttggaattgc - 19201 aatgtcgata gatatcctgc taattccatt gtttgtagat ttgacactag agtgctatct - 19261 aaccttaact tgcctggttg tgatggtggc agtttgtatg taaataaaca tgcattccac - 19321 acaccagctt ttgataaaag tgcttttgtt aatttaaaac aattaccatt tttctattac - 19381 tctgacagtc catgtgagtc tcatggaaaa caagtagtgt cagatataga ttatgtacca - 19441 ctaaagtctg ctacgtgtat aacacgttgc aatttaggtg gtgctgtctg tagacatcat - 19501 gctaatgagt acagattgta tctcgatgct tataacatga tgatctcagc tggctttagc - 19561 ttgtgggttt acaaacaatt tgatacttat aacctctgga acacttttac aagacttcag - 19621 agtttagaaa atgtggcttt taatgttgta aataagggac actttgatgg acaacagggt - 19681 gaagtaccag tttctatcat taataacact gtttacacaa aagttgatgg tgttgatgta - 19741 gaattgtttg aaaataaaac aacattacct gttaatgtag catttgagct ttgggctaag - 19801 cgcaacatta aaccagtacc agaggtgaaa atactcaata atttgggtgt ggacattgct - 19861 gctaatactg tgatctggga ctacaaaaga gatgctccag cacatatatc tactattggt - 19921 gtttgttcta tgactgacat agccaagaaa ccaactgaaa cgatttgtgc accactcact - 19981 gtcttttttg atggtagagt tgatggtcaa gtagacttat ttagaaatgc ccgtaatggt - 20041 gttcttatta cagaaggtag tgttaaaggt ttacaaccat ctgtaggtcc caaacaagct - 20101 agtcttaatg gagtcacatt aattggagaa gccgtaaaaa cacagttcaa ttattataag - 20161 aaagttgatg gtgttgtcca acaattacct gaaacttact ttactcagag tagaaattta - 20221 caagaattta aacccaggag tcaaatggaa attgatttct tagaattagc tatggatgaa - 20281 ttcattgaac ggtataaatt agaaggctat gccttcgaac atatcgttta tggagatttt - 20341 agtcatagtc agttaggtgg tttacatcta ctgattggac tagctaaacg ttttaaggaa - 20401 tcaccttttg aattagaaga ttttattcct atggacagta cagttaaaaa ctatttcata - 20461 acagatgcgc aaacaggttc 
atctaagtgt gtgtgttctg ttattgattt attacttgat - 20521 gattttgttg aaataataaa atcccaagat ttatctgtag tttctaaggt tgtcaaagtg - 20581 actattgact atacagaaat ttcatttatg ctttggtgta aagatggcca tgtagaaaca - 20641 ttttacccaa aattacaatc tagtcaagcg tggcaaccgg gtgttgctat gcctaatctt - 20701 tacaaaatgc aaagaatgct attagaaaag tgtgaccttc aaaattatgg tgatagtgca - 20761 acattaccta aaggcataat gatgaatgtc gcaaaatata ctcaactgtg tcaatattta - 20821 aacacattaa cattagctgt accctataat atgagagtta tacattttgg tgctggttct - 20881 gataaaggag ttgcaccagg tacagctgtt ttaagacagt ggttgcctac gggtacgctg - 20941 cttgtcgatt cagatcttaa tgactttgtc tctgatgcag attcaacttt gattggtgat - 21001 tgtgcaactg tacatacagc taataaatgg gatctcatta ttagtgatat gtacgaccct - 21061 aagactaaaa atgttacaaa agaaaatgac tctaaagagg gttttttcac ttacatttgt - 21121 gggtttatac aacaaaagct agctcttgga ggttccgtgg ctataaagat aacagaacat - 21181 tcttggaatg ctgatcttta taagctcatg ggacacttcg catggtggac agcctttgtt - 21241 actaatgtga atgcgtcatc atctgaagca tttttaattg gatgtaatta tcttggcaaa - 21301 ccacgcgaac aaatagatgg ttatgtcatg catgcaaatt acatattttg gaggaataca - 21361 aatccaattc agttgtcttc ctattcttta tttgacatga gtaaatttcc ccttaaatta - 21421 aggggtactg ctgttatgtc tttaaaagaa ggtcaaatca atgatatgat tttatctctt - 21481 cttagtaaag gtagacttat aattagagaa aacaacagag ttgttatttc tagtgatgtt - 21541 cttgttaaca actaaacgaa caatgtttgt ttttcttgtt ttattgccac tagtctctag - 21601 tcagtgtgtt aatcttacaa ccagaactca attaccccct gcatacacta attctttcac - 21661 acgtggtgtt tattaccctg acaaagtttt cagatcctca gttttacatt caactcagga - 21721 cttgttctta cctttctttt ccaatgttac ttggttccat gctatacatg tctctgggac - 21781 caatggtact aagaggtttg ataaccctgt cctaccattt aatgatggtg tttattttgc - 21841 ttccactgag aagtctaaca taataagagg ctggattttt ggtactactt tagattcgaa - 21901 gacccagtcc ctacttattg ttaataacgc tactaatgtt gttattaaag tctgtgaatt - 21961 tcaattttgt aatgatccat ttttgggtgt ttattaccac aaaaacaaca aaagttggat - 22021 ggaaagtgag ttcagagttt attctagtgc gaataattgc acttttgaat atgtctctca - 22081 gccttttctt atggaccttg 
aaggaaaaca gggtaatttc aaaaatctta gggaatttgt - 22141 gtttaagaat attgatggtt attttaaaat atattctaag cacacgccta ttaatttagt - 22201 gcgtgatctc cctcagggtt tttcggcttt agaaccattg gtagatttgc caataggtat - 22261 taacatcact aggtttcaaa ctttacttgc tttacataga agttatttga ctcctggtga - 22321 ttcttcttca ggttggacag ctggtgctgc agcttattat gtgggttatc ttcaacctag - 22381 gacttttcta ttaaaatata atgaaaatgg aaccattaca gatgctgtag actgtgcact - 22441 tgaccctctc tcagaaacaa agtgtacgtt gaaatccttc actgtagaaa aaggaatcta - 22501 tcaaacttct aactttagag tccaaccaac agaatctatt gttagatttc ctaatattac - 22561 aaacttgtgc ccttttggtg aagtttttaa cgccaccaga tttgcatctg tttatgcttg - 22621 gaacaggaag agaatcagca actgtgttgc tgattattct gtcctatata attccgcatc - 22681 attttccact tttaagtgtt atggagtgtc tcctactaaa ttaaatgatc tctgctttac - 22741 taatgtctat gcagattcat ttgtaattag aggtgatgaa gtcagacaaa tcgctccagg - 22801 gcaaactgga aagattgctg attataatta taaattacca gatgatttta caggctgcgt - 22861 tatagcttgg aattctaaca atcttgattc taaggttggt ggtaattata attacctgta - 22921 tagattgttt aggaagtcta atctcaaacc ttttgagaga gatatttcaa ctgaaatcta - 22981 tcaggccggt agcacacctt gtaatggtgt tgaaggtttt aattgttact ttcctttaca - 23041 atcatatggt ttccaaccca ctaatggtgt tggttaccaa ccatacagag tagtagtact - 23101 ttcttttgaa cttctacatg caccagcaac tgtttgtgga cctaaaaagt ctactaattt - 23161 ggttaaaaac aaatgtgtca atttcaactt caatggttta acaggcacag gtgttcttac - 23221 tgagtctaac aaaaagtttc tgcctttcca acaatttggc agagacattg ctgacactac - 23281 tgatgctgtc cgtgatccac agacacttga gattcttgac attacaccat gttcttttgg - 23341 tggtgtcagt gttataacac caggaacaaa tacttctaac caggttgctg ttctttatca - 23401 ggatgttaac tgcacagaag tccctgttgc tattcatgca gatcaactta ctcctacttg - 23461 gcgtgtttat tctacaggtt ctaatgtttt tcaaacacgt gcaggctgtt taataggggc - 23521 tgaacatgtc aacaactcat atgagtgtga catacccatt ggtgcaggta tatgcgctag - 23581 ttatcagact cagactaatt ctcctcggcg ggcacgtagt gtagctagtc aatccatcat - 23641 tgcctacact atgtcacttg gtgcagaaaa ttcagttgct tactctaata actctattgc - 23701 catacccaca aattttacta 
ttagtgttac cacagaaatt ctaccagtgt ctatgaccaa - 23761 gacatcagta gattgtacaa tgtacatttg tggtgattca actgaatgca gcaatctttt - 23821 gttgcaatat ggcagttttt gtacacaatt aaaccgtgct ttaactggaa tagctgttga - 23881 acaagacaaa aacacccaag aagtttttgc acaagtcaaa caaatttaca aaacaccacc - 23941 aattaaagat tttggtggtt ttaatttttc acaaatatta ccagatccat caaaaccaag - 24001 caagaggtca tttattgaag atctactttt caacaaagtg acacttgcag atgctggctt - 24061 catcaaacaa tatggtgatt gccttggtga tattgctgct agagacctca tttgtgcaca - 24121 aaagtttaac ggccttactg ttttgccacc tttgctcaca gatgaaatga ttgctcaata - 24181 cacttctgca ctgttagcgg gtacaatcac ttctggttgg acctttggtg caggtgctgc - 24241 attacaaata ccatttgcta tgcaaatggc ttataggttt aatggtattg gagttacaca - 24301 gaatgttctc tatgagaacc aaaaattgat tgccaaccaa tttaatagtg ctattggcaa - 24361 aattcaagac tcactttctt ccacagcaag tgcacttgga aaacttcaag atgtggtcaa - 24421 ccaaaatgca caagctttaa acacgcttgt taaacaactt agctccaatt ttggtgcaat - 24481 ttcaagtgtt ttaaatgata tcctttcacg tcttgacaaa gttgaggctg aagtgcaaat - 24541 tgataggttg atcacaggca gacttcaaag tttgcagaca tatgtgactc aacaattaat - 24601 tagagctgca gaaatcagag cttctgctaa tcttgctgct actaaaatgt cagagtgtgt - 24661 acttggacaa tcaaaaagag ttgatttttg tggaaagggc tatcatctta tgtccttccc - 24721 tcagtcagca cctcatggtg tagtcttctt gcatgtgact tatgtccctg cacaagaaaa - 24781 gaacttcaca actgctcctg ccatttgtca tgatggaaaa gcacactttc ctcgtgaagg - 24841 tgtctttgtt tcaaatggca cacactggtt tgtaacacaa aggaattttt atgaaccaca - 24901 aatcattact acagacaaca catttgtgtc tggtaactgt gatgttgtaa taggaattgt - 24961 caacaacaca gtttatgatc ctttgcaacc tgaattagac tcattcaagg aggagttaga - 25021 taaatatttt aagaatcata catcaccaga tgttgattta ggtgacatct ctggcattaa - 25081 tgcttcagtt gtaaacattc aaaaagaaat tgaccgcctc aatgaggttg ccaagaattt - 25141 aaatgaatct ctcatcgatc tccaagaact tggaaagtat gagcagtata taaaatggcc - 25201 atggtacatt tggctaggtt ttatagctgg cttgattgcc atagtaatgg tgacaattat - 25261 gctttgctgt atgaccagtt gctgtagttg tctcaagggc tgttgttctt gtggatcctg - 25321 ctgcaaattt gatgaagacg 
actctgagcc agtgctcaaa ggagtcaaat tacattacac - 25381 ataaacgaac ttatggattt gtttatgaga atcttcacaa ttggaactgt aactttgaag - 25441 caaggtgaaa tcaaggatgc tactccttca gattttgttc gcgctactgc aacgataccg - 25501 atacaagcct cactcccttt cggatggctt attgttggcg ttgcacttct tgctgttttt - 25561 cagagcgctt ccaaaatcat aaccctcaaa aagagatggc aactagcact ctccaagggt - 25621 gttcactttg tttgcaactt gctgttgttg tttgtaacag tttactcaca ccttttgctc - 25681 gttgctgctg gccttgaagc cccttttctc tatctttatg ctttagtcta cttcttgcag - 25741 agtataaact ttgtaagaat aataatgagg ctttggcttt gctggaaatg ccgttccaaa - 25801 aacccattac tttatgatgc caactatttt ctttgctggc atactaattg ttacgactat - 25861 tgtatacctt acaatagtgt aacttcttca attgtcatta cttcaggtga tggcacaaca - 25921 agtcctattt ctgaacatga ctaccagatt ggtggttata ctgaaaaatg ggaatctgga - 25981 gtaaaagact gtgttgtatt acacagttac ttcacttcag actattacca gctgtactca - 26041 actcaattga gtacagacac tggtgttgaa catgttacct tcttcatcta caataaaatt - 26101 gttgatgagc ctgaagaaca tgtccaaatt cacacaatcg acggttcatc cggagttgtt - 26161 aatccagtaa tggaaccaat ttatgatgaa ccgacgacga ctactagcgt gcctttgtaa - 26221 gcacaagctg atgagtacga acttatgtac tcattcgttt cggaagagac aggtacgtta - 26281 atagttaata gcgtacttct ttttcttgct ttcgtggtat tcttgctagt tacactagcc - 26341 atccttactg cgcttcgatt gtgtgcgtac tgctgcaata ttgttaacgt gagtcttgta - 26401 aaaccttctt tttacgttta ctctcgtgtt aaaaatctga attcttctag agttcctgat - 26461 cttctggtct aaacgaacta aatattatat tagtttttct gtttggaact ttaattttag - 26521 ccatggcaga ttccaacggt actattaccg ttgaagagct taaaaagctc cttgaacaat - 26581 ggaacctagt aataggtttc ctattcctta catggatttg tcttctacaa tttgcctatg - 26641 ccaacaggaa taggtttttg tatataatta agttaatttt cctctggctg ttatggccag - 26701 taactttagc ttgttttgtg cttgctgctg tttacagaat aaattggatc accggtggaa - 26761 ttgctatcgc aatggcttgt cttgtaggct tgatgtggct cagctacttc attgcttctt - 26821 tcagactgtt tgcgcgtacg cgttccatgt ggtcattcaa tccagaaact aacattcttc - 26881 tcaacgtgcc actccatggc actattctga ccagaccgct tctagaaagt gaactcgtaa - 26941 tcggagctgt gatccttcgt 
ggacatcttc gtattgctgg acaccatcta ggacgctgtg - 27001 acatcaagga cctgcctaaa gaaatcactg ttgctacatc acgaacgctt tcttattaca - 27061 aattgggagc ttcgcagcgt gtagcaggtg actcaggttt tgctgcatac agtcgctaca - 27121 ggattggcaa ctataaatta aacacagacc attccagtag cagtgacaat attgctttgc - 27181 ttgtacagta agtgacaaca gatgtttcat ctcgttgact ttcaggttac tatagcagag - 27241 atattactaa ttattatgag gacttttaaa gtttccattt ggaatcttga ttacatcata - 27301 aacctcataa ttaaaaattt atctaagtca ctaactgaga ataaatattc tcaattagat - 27361 gaagagcaac caatggagat tgattaaacg aacatgaaaa ttattctttt cttggcactg - 27421 ataacactcg ctacttgtga gctttatcac taccaagagt gtgttagagg tacaacagta - 27481 cttttaaaag aaccttgctc ttctggaaca tacgagggca attcaccatt tcatcctcta - 27541 gctgataaca aatttgcact gacttgcttt agcactcaat ttgcttttgc ttgtcctgac - 27601 ggcgtaaaac acgtctatca gttacgtgcc agatcagttt cacctaaact gttcatcaga - 27661 caagaggaag ttcaagaact ttactctcca atttttctta ttgttgcggc aatagtgttt - 27721 ataacacttt gcttcacact caaaagaaag acagaatgat tgaactttca ttaattgact - 27781 tctatttgtg ctttttagcc tttctgctat tccttgtttt aattatgctt attatctttt - 27841 ggttctcact tgaactgcaa gatcataatg aaacttgtca cgcctaaacg aacatgaaat - 27901 ttcttgtttt cttaggaatc atcacaactg tagctgcatt tcaccaagaa tgtagtttac - 27961 agtcatgtac tcaacatcaa ccatatgtag ttgatgaccc gtgtcctatt cacttctatt - 28021 ctaaatggta tattagagta ggagctagaa aatcagcacc tttaattgaa ttgtgcgtgg - 28081 atgaggctgg ttctaaatca cccattcagt acatcgatat cggtaattat acagtttcct - 28141 gtttaccttt tacaattaat tgccaggaac ctaaattggg tagtcttgta gtgcgttgtt - 28201 cgttctatga agacttttta gagtatcatg acgttcgtgt tgttttagat ttcatctaaa - 28261 cgaacaaact aaaatgtctg ataatggacc ccaaaatcag cgaaatgcac cccgcattac - 28321 gtttggtgga ccctcagatt caactggcag taaccagaat ggagaacgca gtggggcgcg - 28381 atcaaaacaa cgtcggcccc aaggtttacc caataatact gcgtcttggt tcaccgctct - 28441 cactcaacat ggcaaggaag accttaaatt ccctcgagga caaggcgttc caattaacac - 28501 caatagcagt ccagatgacc aaattggcta ctaccgaaga gctaccagac gaattcgtgg - 28561 tggtgacggt aaaatgaaag 
atctcagtcc aagatggtat ttctactacc taggaactgg - 28621 gccagaagct ggacttccct atggtgctaa caaagacggc atcatatggg ttgcaactga - 28681 gggagccttg aatacaccaa aagatcacat tggcacccgc aatcctgcta acaatgctgc - 28741 aatcgtgcta caacttcctc aaggaacaac attgccaaaa ggcttctacg cagaagggag - 28801 cagaggcggc agtcaagcct cttctcgttc ctcatcacgt agtcgcaaca gttcaagaaa - 28861 ttcaactcca ggcagcagta ggggaacttc tcctgctaga atggctggca atggcggtga - 28921 tgctgctctt gctttgctgc tgcttgacag attgaaccag cttgagagca aaatgtctgg - 28981 taaaggccaa caacaacaag gccaaactgt cactaagaaa tctgctgctg aggcttctaa - 29041 gaagcctcgg caaaaacgta ctgccactaa agcatacaat gtaacacaag ctttcggcag - 29101 acgtggtcca gaacaaaccc aaggaaattt tggggaccag gaactaatca gacaaggaac - 29161 tgattacaaa cattggccgc aaattgcaca atttgccccc agcgcttcag cgttcttcgg - 29221 aatgtcgcgc attggcatgg aagtcacacc ttcgggaacg tggttgacct acacaggtgc - 29281 catcaaattg gatgacaaag atccaaattt caaagatcaa gtcattttgc tgaataagca - 29341 tattgacgca tacaaaacat tcccaccaac agagcctaaa aaggacaaaa agaagaaggc - 29401 tgatgaaact caagccttac cgcagagaca gaagaaacag caaactgtga ctcttcttcc - 29461 tgctgcagat ttggatgatt tctccaaaca attgcaacaa tccatgagca gtgctgactc - 29521 aactcaggcc taaactcatg cagaccacac aaggcagatg ggctatataa acgttttcgc - 29581 ttttccgttt acgatatata gtctactctt gtgcagaatg aattctcgta actacatagc - 29641 acaagtagat gtagttaact ttaatctcac atagcaatct ttaatcagtg tgtaacatta - 29701 gggaggactt gaaagagcca ccacattttc accgaggcca cgcggagtac gatcgagtgt - 29761 acagtgaaca atgctaggga gagctgccta tatggaagag ccctaatgtg taaaattaat - 29821 tttagtagtg ctatccccat gtgattttaa tagcttctta ggagaatgac aaaaaaaaaa - 29881 aaaaaaaaaa aaaaaaaaaa aaa -// - diff --git a/workflows/resources/README b/workflows/resources/README deleted file mode 100644 index c26158f..0000000 --- a/workflows/resources/README +++ /dev/null @@ -1,30 +0,0 @@ -Descriptions of the included resources: - -- MN908947.fa -- MN908947.gb - These are the reference genome fasta and genbank files, used to place genomes in reference coordinates 
for alignment/trimming steps - and for typing variants specified relative to the reference. - -- date_corrections.csv - central_sample_id,sample_date - These overwrite the dates in MAJORA - -- resequencing_omissions.txt - These IDs have been resequenced and should be omitted - -- AAs.csv - AAmutation, nuc position in reference - Specific mutations to look for -- dels.csv - nuc position in reference, length - Specific deletions to look for - -- publish_recipes.json - A file of recipes to make the billions of output files. - Parsed by bin/publish_from_config.py - Each key in the dict represents an outdir - For that outdir, a list of outfiles is given, each represented by a dict - Outfile dicts can include fields: suffix, data, fasta, metadata_fields, where, and mutations - Either data in ["cog", "cog_global"] or fasta in ["unaligned", "aligned", "trimmed", "cog_global"] must be specified - Mutations is a bool, metadata_fields a list and all others are strings - diff --git a/workflows/resources/WH04.fa b/workflows/resources/WH04.fa deleted file mode 100644 index 406a27c..0000000 --- a/workflows/resources/WH04.fa +++ /dev/null @@ -1,2 +0,0 @@ ->Wuhan/WH04/2020 
-NNNNNNNNNNNNNNNNTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTG
AGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTTAAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTAAACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGA
AACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACAACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAATTTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAA
TTGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGTGATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTACAACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCTATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGACAACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCT
GATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGATATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGTCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTCCTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGG
TTCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTACGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCCACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAAGAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGG
TTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATCTATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCAGTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGT
TAAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGTATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTCTCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGCATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACA
CTTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTGTTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACTAAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGAAATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTAATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTAC
AAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAG
TTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCA
CAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAA
GCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTAT
TACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACC
CGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTCACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAANNNNNNNNNNNNNNNN diff --git a/workflows/resources/date_corrections.csv 
b/workflows/resources/date_corrections.csv deleted file mode 100644 index 3d68ea2..0000000 --- a/workflows/resources/date_corrections.csv +++ /dev/null @@ -1,11 +0,0 @@ -central_sample_id,sample_date -PHEC-20161,2020-03-04 -PHEC-2018F,2020-03-04 -PHEC-139C3,2020-03-21 -PHEC-153F0,2020-03-25 -PHEC-204C2,2020-03-05 -PHEC-20170,2020-03-04 -PHEC-16043,2020-03-27 -PHEC-14D5D,2020-03-24 -PHEC-14D4E,2020-03-24 -PHEC-13A57,2020-03-21 diff --git a/workflows/resources/dels.csv b/workflows/resources/dels.csv deleted file mode 100644 index 521b179..0000000 --- a/workflows/resources/dels.csv +++ /dev/null @@ -1,2 +0,0 @@ -1605,3 -21765,6 diff --git a/workflows/resources/empty_constellations.csv b/workflows/resources/empty_constellations.csv deleted file mode 100644 index 3c31ff9..0000000 --- a/workflows/resources/empty_constellations.csv +++ /dev/null @@ -1 +0,0 @@ -sequence_name diff --git a/workflows/resources/empty_mutations.csv b/workflows/resources/empty_mutations.csv deleted file mode 100644 index 3c31ff9..0000000 --- a/workflows/resources/empty_mutations.csv +++ /dev/null @@ -1 +0,0 @@ -sequence_name diff --git a/workflows/resources/empty_updown.csv b/workflows/resources/empty_updown.csv deleted file mode 100644 index 0e6ee7b..0000000 --- a/workflows/resources/empty_updown.csv +++ /dev/null @@ -1 +0,0 @@ -query diff --git a/workflows/resources/gisaid_omissions.txt b/workflows/resources/gisaid_omissions.txt deleted file mode 100644 index 03b90b0..0000000 --- a/workflows/resources/gisaid_omissions.txt +++ /dev/null @@ -1,364 +0,0 @@ -# duplicate of Guangdong/20SF012/2020 / EPI_ISL_403932 as described here: -# http://virological.org/t/phylogenetic-analysis-of-23-ncov-2019-genomes-2020-01-23/335/5 -HKU-SZ-002a_2020 - -# erroneous collection date? 
(day before submission) -Hefei/2/2020|EPI_ISL_412026|China|Anhui|Hefei|2020-02-23 - -# too many errors -Russia/StPetersburg-3524/2020|EPI_ISL_415710|2020-03-15 -Guangzhou/GZMU0047/2020|EPI_ISL_414690|2020-02-25 -Henan/IVDC-HeN-002/2020|EPI_ISL_408487|2020-01-20 -Shandong/LY001/2020|EPI_ISL_414934|2020-01-21 -Shandong/LY002/2020|EPI_ISL_414935|2020-01-21 -Shanghai/IVDC-SH-001/2020|EPI_ISL_408483|2020-01-20 -Shenzhen/SZTH-001/2020|EPI_ISL_406592|2020-01-13 -Shenzhen/SZTH-004/2020|EPI_ISL_406595|2020-01-16 -USA/WA-UW53/2020|EPI_ISL_415618|2020-03-09 -USA/WA-UW65/2020|EPI_ISL_415593|2020-03-10 -Wuhan/HBCDC-HB-04/2019|EPI_ISL_412900|2019-12-30 -Wuhan/WH02/2019|EPI_ISL_406799|2019-12-31 -Belgium/MTR-03026/2020|EPI_ISL_416476|2020-03-02 -Malaysia/MKAK-CL-2020-5049/2020|EPI_ISL_416884|2020-02-20 -Malaysia/188407/2020|EPI_ISL_417918|2020-03-18 -Malaysia/190300/2020|EPI_ISL_417920||Malaysia|Kuala_Lumpur||2020-03-22 -Turkey/6224-Ankara1034/2020|EPI_ISL_417413|2020-03-17 -USA/WI-GMF-00237/2020|EPI_ISL_418189|2020-03-23 -Italy/UniMI02/2020|EPI_ISL_417446|2020-02-24 -Malaysia/186197/2020|EPI_ISL_417919||Malaysia|Kuala_Lumpur||2020-03-14 -USA/MN4-MDH4/2020|EPI_ISL_417189||USA|Minnesota||2020-03-09 -USA/WI-GMF-00227/2020|EPI_ISL_418186||USA|Wisconsin|La_Crosse_County|2020-03-23 -USA/WI-GMF-00049/2020|EPI_ISL_418185||USA|Wisconsin|La_Crosse_County|2020-03-18 -Canada/ON_PHL4232/2020|EPI_ISL_418374||Canada|Ontario||2020-03-11 -Senegal/640/2020|EPI_ISL_420079||Senegal|Mbour||2020-03-20 -Senegal/328/2020|EPI_ISL_420071||Senegal|Mbour||2020-03-17 -USA/WA-UW306/2020|EPI_ISL_418874||USA|Washington||2020-03-23 -Ecuador/HGSQ-USFQ-007/2020 -Ecuador/HGSQ-USFQ-010/2020 -USA/WI-GMF-00928/2020 -USA/WI-GMF-00384/2020 -USA/WA-UW-1821/2020 -USA/WA-UW45/2020 -India/1135/2020 - -# too many singleton mutations -Hong_Kong/case49_VM20002508/2020|EPI_ISL_414567|2020-02-10 -Shenzhen/SZTH-004/2020|EPI_ISL_406595|2020-01-16 -Belgium/VLM-03011/2020|EPI_ISL_415153|2020-03-03 
-Belgium/BA-02291/2020|EPI_ISL_415159|2020-02-29 -Belgium/MTR-03021/2020|EPI_ISL_416467|2020-03-02 -Belgium/BA-02291/2020|EPI_ISL_415159|2020-02-29 -Hong_Kong/case49_VM20002508/2020|EPI_ISL_414567|2020-02-10 -USA/WA-UW156/2020|EPI_ISL_416694|2020-03-13 -USA/WA-UW132/2020|EPI_ISL_416670|2020-03-12 -Australia/VIC12/2020|EPI_ISL_416518|2020-03-16 -USA/CA-CDPH-UC28/2020|EPI_ISL_417332|2020-03-14 -USA/WA-UW306/2020|EPI_ISL_418874||USA|Washington||2020-03-23 - -# big indels -USA/WA-UW134/2020|EPI_ISL_416672|2020-03-10 -USA/WA-UW182/2020|EPI_ISL_416720|2020-03-13 -USA/WA-UW281/2020|EPI_ISL_418066|2020-03-14 - -Singapore/14Clin/2020|EPI_ISL_418998|2020-02-14 -Singapore/19/2020|EPI_ISL_419001|2020-03-02 -Singapore/22/2020|EPI_ISL_420099|2020-03-02 -Singapore/23/2020|EPI_ISL_420100|2020-03-02 -Singapore/18/2020|EPI_ISL_418999|2020-03-01 -Singapore/30/2020|EPI_ISL_420107|2020-03-09 -Singapore/12Clin/2020|EPI_ISL_418995|2020-02-28 -Singapore/16/2020|EPI_ISL_418997|2020-02-06 -Singapore/15/2020|EPI_ISL_418996|2020-01-27 -Singapore/21/2020|EPI_ISL_419000|2020-02-13 - -# ORF8 reversion? -Guangdong/GDSZ202013-P0014/2020|EPI_ISL_413865|2020-02-05 -Spain/Andalucia201373/2020|EPI_ISL_418244|2020-03-02 -USA/NY-NYUMC40/2020|EPI_ISL_419701|2020-03-18 - -# replicate sequencing -USA/WA1-A12/2020|EPI_ISL_407214|2020-01-19 -USA/WA1-F6/2020|EPI_ISL_407215|2020-01-19 -USA/WA1-A12/2020|EPI_ISL_407214|2020-01-25 -USA/WA1-F6/2020|EPI_ISL_407215|2020-01-25 -Italy/INMI1-cs/2020|EPI_ISL_410546|2020-01-31 -France/IDF0372-isl/2020|EPI_ISL_410720|2020-01-23 -France/IDF0515-isl/2020|EPI_ISL_410984|2020-01-29 -France/IDF0386/2020|EPI_ISL_411220|2020-01-28 -HKU-SZ-002a|MN938384|2020-01-10 - -# 3x replicate sequencing according to Verity but missing from GISAID now anyway - fixed?: -EPI_ISL_415695 -EPI_ISL_415694 -EPI_ISL_415693 - -# Large deletion in ORF8 - causes misplacement in A/B clade. Remove for now. Give lineage designation and force grouping if more appear? 
-Singapore/12/2020|EPI_ISL_414378|2020-02-17 -Singapore/13/2020|EPI_ISL_414379|2020-02-18 -Singapore/14/2020|EPI_ISL_414380|2020-02-19 -Taiwan/CGMH-CGU-02/2020|EPI_ISL_417518|2020-02-04 - -# date missing -# commented these out as we should remove using separate filter. -#South_Korea/SNU01/2020 -#Wuhan/HBCDC-HB-04/2019|EPI_ISL_412900|China|Hubei|Wuhan|2019-12-30 -#Netherlands/Coevorden_1363618/2020|Netherlands|Coevorden||2020 -#Netherlands/Tilburg_/2020|Netherlands|Tilburg||2020 -#China/WF0001/2020|EPI_ISL_413691|China|Shandong||2020-01 -#China/WF0002/2020|EPI_ISL_413692|China|Shandong||2020-01 -#China/WF0003/2020|EPI_ISL_413693|China|Shandong||2020-01 -#China/WF0004/2020|EPI_ISL_413694|China|Shandong||2020-01 -#China/WF0012/2020|EPI_ISL_413697|China|Shandong||2020-02 -#China/WF0014/2020|EPI_ISL_413711|China|Shandong||2020-02 -#China/WF0015/2020|EPI_ISL_413729|China|Shandong||2020-02 -#China/WF0016/2020|EPI_ISL_413746|China|Shandong||2020-02 -#China/WF0018/2020|EPI_ISL_413748|China|Shandong||2020-02 -#China/WF0019/2020|EPI_ISL_413749|China|Shandong||2020-02 -#China/WF0020/2020|EPI_ISL_413750|China|Shandong||2020-02 -#China/WF0021/2020|EPI_ISL_413751|China|Shandong||2020-02 -#China/WF0023/2020|EPI_ISL_413752|China|Shandong||2020-02 -#China/WF0024/2020|EPI_ISL_413753|China|Shandong||2020-02 -#China/WF0026/2020|EPI_ISL_413761|China|Shandong||2020-02 -#China/WF0028/2020|EPI_ISL_413791|China|Shandong||2020-02 -#China/WF0029/2020|EPI_ISL_413809|China|Shandong||2020-02 -#Japan/TKYE6182/2020|EPI_ISL_414511|2020-01 -#Netherlands/NoordBrabant_33/2020|EPI_ISL_414542|2020-03 -#Netherlands/NoordBrabant_34/2020|EPI_ISL_414543|2020-03 -#Canada/ON_PHL2223/2020|EPI_ISL_418381|2020 -#Canada/ON_PHL2259/2020|EPI_ISL_418344|2020 -#Canada/ON_PHL2273/2020|EPI_ISL_418383|2020 -#Canada/ON_PHL2294/2020|EPI_ISL_418384|2020 -#Canada/ON_PHL3476/2020|EPI_ISL_418380|2020 -#Canada/ON_PHL5930/2020|EPI_ISL_418382|2020 -#Netherlands/Limburg_7/2020|EPI_ISL_415464|2020 
-#Netherlands/NA_4/2020|EPI_ISL_415493|2020 -#Netherlands/NA_5/2020|EPI_ISL_415494|2020 -#Netherlands/NoordBrabant_41/2020|EPI_ISL_415499|2020 -#Netherlands/NoordBrabant_51/2020|EPI_ISL_415507|2020 -#Netherlands/NoordBrabant_53/2020|EPI_ISL_415509|2020 -#Netherlands/NoordBrabant_61/2020|EPI_ISL_415517|2020 -#Netherlands/NoordBrabant_62/2020|EPI_ISL_415518|2020 -#Netherlands/NoordBrabant_63/2020|EPI_ISL_415519|2020 -#Netherlands/NoordBrabant_64/2020|EPI_ISL_415520|2020 -#Netherlands/NoordBrabant_65/2020|EPI_ISL_415521|2020 -#Netherlands/NoordBrabant_66/2020|EPI_ISL_415522|2020 -#Netherlands/NoordBrabant_67/2020|EPI_ISL_415523|2020 -#Spain/Cataluna201396/2020|EPI_ISL_418250|2020 -#USA/CA-CDPH-UC1/2020|EPI_ISL_413557|2020 -#Czech_Republic/ChVir1630/2020|EPI_ISL_416742|2020-02 -#Czech_Republic/ChVir1912/2020|EPI_ISL_416743|2020-03 -#Lithuania/ChVir1632/2020|EPI_ISL_416741|2020-02 - -# withdrawn (yes, not in gisaid dump 2020-03-24)? -China/Spain-cluster-case2/2020|EPI_ISL_415046|2020-03-11 -China/Spain-cluster-case3/2020|EPI_ISL_415047|2020-03-11 -China/Spain-cluster-case1/2020|EPI_ISL_415045|2020-03-11 - -# environmental samples -Wuhan/IVDC-HB-envF13-20/2020|EPI_ISL_408514|2020-01-01 -Wuhan/IVDC-HB-envF13-21/2020|EPI_ISL_408515|2020-01-01 - -# pangolin/bat -pangolin/Guangdong/P2S/2019|EPI_ISL_410544|2019 -pangolin/Guangxi/P1E/2017|EPI_ISL_410539|2017 -pangolin/Guangxi/P4L/2017|EPI_ISL_410538|2017 -pangolin/Guangxi/P3B/2017|EPI_ISL_410543|2017 -pangolin/Guangxi/P2V/2017|EPI_ISL_410542|2017 -pangolin/Guangxi/P5E/2017|EPI_ISL_410541|2017 -pangolin/Guangxi/P5L/2017|EPI_ISL_410540|2017 -pangolin/Guangdong/1/2019|EPI_ISL_410721|2019 -bat/Yunnan/RaTG13/2013|EPI_ISL_402131|2013-07-24 -pangolin/China/MP789/2019|EPI_ISL_412860|2019-03-19 - -# washington state oversampling -USA/WA-NH20/2020|EPI_ISL_418787||USA|Washington||2020-03-13 -USA/WA-S79/2020|EPI_ISL_417132||USA|Washington|King_County|2020-03-05 -USA/WA4-UW2/2020|EPI_ISL_413455||USA|Washington||2020-02-28 
-USA/WA-S122/2020|EPI_ISL_417175||USA|Washington|Grant_County|2020-03-02 -USA/WA-UW150/2020|EPI_ISL_416688||USA|Washington||2020-03-14 -USA/WA-UW376/2020|EPI_ISL_418943||USA|Washington||2020-03-17 -USA/WA-UW18/2020|EPI_ISL_414366||USA|Washington||2020-03-05 -USA/UNKNOWN-UW276/2020|EPI_ISL_418061||USA|||2020-03-13 -USA/WA-UW364/2020|EPI_ISL_418931||USA|Washington||2020-03-17 -USA/WA-UW103/2020|EPI_ISL_416641||USA|Washington||2020-03-11 -USA/WA-S24/2020|EPI_ISL_417077||USA|Washington|Snohomish_County|2020-03-02 -USA/WA-S6/2020|EPI_ISL_416461||USA|Washington|King_County|2020-02-29 -USA/WA-NH23/2020|EPI_ISL_418790||USA|Washington||2020-03-13 -USA/WA-UW48/2020|EPI_ISL_415613||USA|Washington||2020-03-09 -USA/WA-UW274/2020|EPI_ISL_418059||USA|||2020-03-13 -USA/WA-S108/2020|EPI_ISL_417161||USA|Washington|King_County|2020-02-29 -USA/WA-UW86/2020|EPI_ISL_416442||USA|Washington||2020-03-10 -USA/WA-S99/2020|EPI_ISL_417152||USA|Washington|King_County|2020-02-28 -USA/WA-UW89/2020|EPI_ISL_416445||USA|Washington||2020-03-10 -USA/WA-S21/2020|EPI_ISL_417074||USA|Washington|King_County|2020-03-02 -USA/WA-S61/2020|EPI_ISL_417114||USA|Washington|King_County|2020-03-05 -USA/WA-UW63/2020|EPI_ISL_415591||USA|Washington||2020-03-10 -USA/OR-UW54/2020|EPI_ISL_415619||USA|Oregon||2020-03-09 -USA/WA-NH19/2020|EPI_ISL_418786||USA|Washington||2020-03-13 -USA/WA-S13/2020|EPI_ISL_417066||USA|Washington|King_County|2020-03-03 -USA/WA-S10/2020|EPI_ISL_416465||USA|Washington|King_County|2020-02-29 -Canada/BC_3989992/2020|EPI_ISL_418823||Canada|British_Columbia||2020-03-09 -USA/WA-UW23/2020|EPI_ISL_414592||USA|Washington|Tacoma|2020-03-06 -USA/WA-S45/2020|EPI_ISL_417098||USA|Washington||2020-02-29 -USA/WA6-UW3/2020|EPI_ISL_413457||USA|Washington||2020-02-29 -USA/WA-S38/2020|EPI_ISL_417091||USA|Washington|Snohomish_County|2020-03-04 -USA/WA-NH22/2020|EPI_ISL_418789||USA|Washington||2020-03-13 -USA/WA-S2/2020|EPI_ISL_413456||USA|Washington|King_County|2020-02-20 
-USA/WA-UW84/2020|EPI_ISL_416440||USA|Washington||2020-03-10 -USA/WA-UW51/2020|EPI_ISL_415616||USA|Washington||2020-03-08 -USA/WA-S3/2020|EPI_ISL_413560||USA|Washington||2020-02-28 -USA/WI-76/2020|EPI_ISL_421334||USA|Wisconsin||2020-03-22 -USA/WA-S82/2020|EPI_ISL_417135||USA|Washington|King_County|2020-02-22 -USA/WA-UW262/2020|EPI_ISL_418047||USA|Washington||2020-03-16 -USA/WA-S88/2020|EPI_ISL_417141||USA|Washington|King_County|2020-03-01 -USA/WA-UW100/2020|EPI_ISL_416638||USA|Washington||2020-03-12 -USA/WA-S65/2020|EPI_ISL_417118||USA|Washington|King_County|2020-03-03 -USA/WI-23/2020|EPI_ISL_417507||USA|Wisconsin||2020-03-17 -USA/WA-UW190/2020|EPI_ISL_416728||USA|Washington||2020-03-13 -USA/VA-DCLS-0011/2020|EPI_ISL_419263||USA|Virginia||2020-03-10 -USA/WA-NH11/2020|EPI_ISL_418780||USA|Washington||2020-03-13 -USA/WA-UW128/2020|EPI_ISL_416666||USA|Washington||2020-03-12 -USA/WA-S52/2020|EPI_ISL_417105||USA|Washington|Snohomish_County|2020-03-03 -USA/WA-NH6/2020|EPI_ISL_418775||USA|Washington||2020-03-13 -USA/WA-S118/2020|EPI_ISL_417171||USA|Washington|King_County|2020-03-01 -USA/WA-S51/2020|EPI_ISL_417104||USA|Washington||2020-03-03 -USA/WA-S98/2020|EPI_ISL_417151||USA|Washington|King_County|2020-02-29 -USA/WA-S63/2020|EPI_ISL_417116||USA|Washington|King_County|2020-03-04 -USA/WA-UW147/2020|EPI_ISL_416685||USA|Washington||2020-03-15 -USA/WA-S39/2020|EPI_ISL_417092||USA|Washington|Snohomish_County|2020-03-04 -USA/WA-UW52/2020|EPI_ISL_415617||USA|Washington||2020-03-09 -USA/WA-S29/2020|EPI_ISL_417082||USA|Washington|Grant_County|2020-03-02 -USA/WA-S26/2020|EPI_ISL_417079||USA|Washington|Snohomish_County|2020-03-02 -USA/UPHL-04/2020|EPI_ISL_415542||USA|Utah||2020-03-13 -USA/WA18-UW14/2020|EPI_ISL_413653||USA|Washington||2020-03-05 -USA/WA-UW177/2020|EPI_ISL_416715||USA|Washington||2020-03-15 -USA/WA-S36/2020|EPI_ISL_417089||USA|Washington||2020-03-02 -USA/WA-S84/2020|EPI_ISL_417137||USA|Washington|King_County|2020-02-21 
-USA/WA-S16/2020|EPI_ISL_417069||USA|Washington||2020-03-03 -USA/WA-UW285/2020|EPI_ISL_418070||USA|Washington||2020-03-14 -USA/WA-S93/2020|EPI_ISL_417146||USA|Washington|King_County|2020-02-29 -USA/WA-UW369/2020|EPI_ISL_418936||USA|Washington||2020-03-17 -USA/WA-S46/2020|EPI_ISL_417099||USA|Washington||2020-02-29 -USA/WA-UW118/2020|EPI_ISL_416656||USA|Washington||2020-03-11 -USA/WA-S53/2020|EPI_ISL_417106||USA|Washington|Snohomish_County|2020-03-03 -USA/WA-S71/2020|EPI_ISL_417124||USA|Washington|King_County|2020-03-05 -USA/WA-UW167/2020|EPI_ISL_416705||USA|Washington||2020-03-13 -USA/WA-UW33/2020|EPI_ISL_414620||USA|||2020-03-08 -USA/VA-DCLS-0019/2020|EPI_ISL_420029||USA|Virginia||2020-03-11 -USA/VA-DCLS-0018/2020|EPI_ISL_420028||USA|Virginia||2020-03-11 -USA/WA-UW179/2020|EPI_ISL_416717||USA|Washington||2020-03-15 -USA/WA-NH7/2020|EPI_ISL_418776||USA|Washington||2020-03-13 -USA/WA-S75/2020|EPI_ISL_417128||USA|Washington|King_County|2020-03-05 -USA/WA-UW95/2020|EPI_ISL_416451||USA|Washington||2020-03-10 -USA/WA-S57/2020|EPI_ISL_417110||USA|Washington||2020-03-03 -USA/WA-S117/2020|EPI_ISL_417170||USA|Washington||2020-03-02 -USA/WA-UW298/2020|EPI_ISL_418866||USA|Washington||2020-03-13 -USA/WA-S92/2020|EPI_ISL_417145||USA|Washington|King_County|2020-02-29 -USA/WA-S59/2020|EPI_ISL_417112||USA|Washington|King_County|2020-03-02 -USA/WA-S12/2020|EPI_ISL_417065||USA|Washington||2020-03-03 -USA/WA-UW183/2020|EPI_ISL_416721||USA|Washington||2020-03-13 -USA/WA-S74/2020|EPI_ISL_417127||USA|Washington|King_County|2020-03-05 -USA/WA-S15/2020|EPI_ISL_417068||USA|Washington|King_County|2020-03-02 -USA/WA-S43/2020|EPI_ISL_417096||USA|Washington||2020-02-27 -USA/WA-UW70/2020|EPI_ISL_415598||USA|Washington||2020-03-10 -USA/WA-UW99/2020|EPI_ISL_416637||USA|Washington||2020-03-12 -USA/WA-S66/2020|EPI_ISL_417119||USA|Washington|Snohomish_County|2020-03-06 -USA/WA-UW222/2020|EPI_ISL_417371||USA|Washington||2020-03-13 -USA/WA-UW343/2020|EPI_ISL_418910||USA|Washington||2020-03-16 
-USA/WA-S111/2020|EPI_ISL_417164||USA|Washington|King_County|2020-03-07 -USA/WA-S102/2020|EPI_ISL_417155||USA|Washington|King_County|2020-02-28 -USA/WA-NH14/2020|EPI_ISL_418783||USA|Washington||2020-03-13 -USA/WA-NH21/2020|EPI_ISL_418788||USA|Washington||2020-03-13 -USA/WA-UW119/2020|EPI_ISL_416657||USA|Washington||2020-03-11 -USA/WA-S91/2020|EPI_ISL_417144||USA|Washington|Snohomish_County|2020-03-02 -USA/WA-S103/2020|EPI_ISL_417156||USA|Washington|King_County|2020-02-28 -USA/WA-UW345/2020|EPI_ISL_418912||USA|Washington||2020-03-16 -USA/WA-UW49/2020|EPI_ISL_415614||USA|Washington||2020-03-09 -USA/WA-UW300/2020|EPI_ISL_418868||USA|Washington||2020-03-13 -USA/WA-UW121/2020|EPI_ISL_416659||USA|Washington||2020-03-11 -USA/WA-UW105/2020|EPI_ISL_416643||USA|Washington||2020-03-11 -USA/WA-S72/2020|EPI_ISL_417125||USA|Washington|King_County|2020-03-06 -USA/WA-UW124/2020|EPI_ISL_416662||USA|Washington||2020-03-12 -USA/VA-DCLS-0017/2020|EPI_ISL_419711||Virginia|||2020-03-11 -USA/WA-UW125/2020|EPI_ISL_416663||USA|Washington||2020-03-12 -USA/WA-S69/2020|EPI_ISL_417122||USA|Washington|King_County|2020-03-05 -USA/WA-UW24/2020|EPI_ISL_414593||USA|Washington|Kirkland|2020-03-05 -USA/WA-UW77/2020|EPI_ISL_416433||USA|Washington||2020-03-10 -USA/WA-NH24/2020|EPI_ISL_418791||USA|Washington||2020-03-13 -USA/WA-S5/2020|EPI_ISL_416460||USA|Washington|King_County|2020-02-29 -USA/WA-S100/2020|EPI_ISL_417153||USA|Washington|King_County|2020-02-29 -USA/WA-UW92/2020|EPI_ISL_416448||USA|Washington||2020-03-11 -USA/WA-UW152/2020|EPI_ISL_416690||USA|Washington||2020-03-13 -USA/WA-UW209/2020|EPI_ISL_417358||USA|Washington||2020-03-13 -USA/WA-S105/2020|EPI_ISL_417158||USA|Washington|King_County|2020-02-28 -USA/WA-S44/2020|EPI_ISL_417097||USA|Washington||2020-02-28 -USA/WA-S101/2020|EPI_ISL_417154||USA|Washington|King_County|2020-02-28 -USA/WA-UW67/2020|EPI_ISL_415595||USA|Washington||2020-03-09 -USA/WA-S109/2020|EPI_ISL_417162||USA|Washington|Snohomish_County|2020-03-01 
-USA/WA-UW299/2020|EPI_ISL_418867||USA|Washington||2020-03-13 -USA/UPHL-03/2020|EPI_ISL_415541||USA|Utah||2020-03-13 -USA/WA-S116/2020|EPI_ISL_417169||USA|Washington||2020-03-02 -USA/WA-UW20/2020|EPI_ISL_414368||USA|Washington||2020-03-05 - -#cluster of mutations -Luxembourg/LNS8746229/2020 -USA/UN-UW-1402/2020 -USA/WA-UW-4118/2020 -USA/UN-UW-1486/2020 -Beijing/IVDC-BJ-005/2020 -Ecuador/HGSQ-USFQ-007/2020 -Ecuador/HGSQ-USFQ-010/2020 -USA/WI-GMF-00384/2020 -USA/WA-UW-4130/2020 -USA/WI-GMF-00928/2020 -USA/WA-UW-1572/2020 -USA/ID-UW-4100/2020 -USA/WA-UW-2105/2020 -Scotland/EDB146/2020 - -#ambiguous B mutation T8783C -Australia/VIC721/2020 -Australia/WA04/2020 -USA/IL1/2020 -USA/WA-S25/2020 -USA/WA-S35/2020 -USA/WA-S56/2020 - -#ambiguous B mutation C28144T -USA/WA-S85/2020 -USA/WI-47/2020 - -#reverted B mutation C28144T -India/763/2020 -India/770/2020 -Japan/DP0690/2020 - -#reverted B.1 mutation A23404G? -Scotland/EDB023/2020 - -#3rd base for B.1 mutation C3038T -Australia/NSW22/2020 - -# possible duplicate entry -EPI_ISL_437437 - -# had ambiguous secondary alignment after mapping -Saudi_Arabia/KAUST-Makkah155/2020 - -# outlier on tree -South_Africa/R05475/2020 - -# date before its time -India/NCDC-3175/2020 -Taiwan/CGMH-CGU-22/2020 -Taiwan/CGMH-CGU-23/2020 -Taiwan/CGMH-CGU-24/2020 -Taiwan/CGMH-CGU-25/2020 - -#lots of mutations -Indonesia/EJ-ITD853Sp/2020 -Taiwan/TSGH-20/2020 -South_Africa/R02606/2020 -France/OCC-15/2020 -USA/WA-UW-4749/2020 -India/NCDC-3985/2020 diff --git a/workflows/resources/publish_cog_global_recipes.json b/workflows/resources/publish_cog_global_recipes.json deleted file mode 100644 index a5a63d2..0000000 --- a/workflows/resources/publish_cog_global_recipes.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "alignments": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned", - "metadata_fields": 
["country","adm1","adm2","outer_postcode","biosample_source_id","source_id","central_sample_id","collected_by","collection_date","end_time","flowcell_id","flowcell_type","instrument_make","instrument_model","is_surveillance","layout_insert_length","layout_read_length","library_adaptor_barcode","library_layout_config","library_name","library_primers","library_protocol","library_selection","library_seq_kit","library_seq_protocol","library_source","library_strategy","meta.artic.primers","meta.artic.protocol","meta.epi.cluster","meta.investigation.cluster","meta.investigation.name","meta.investigation.site","metric.ct.1.ct_value","metric.ct.1.test_kit","metric.ct.1.test_platform","metric.ct.1.test_target","metric.ct.2.ct_value","metric.ct.2.test_kit","metric.ct.2.test_platform","metric.ct.2.test_target","metric.ct.max_ct","metric.ct.min_ct","metric.ct.num_tests","published_as","received_date","root_sample_id","run_group","run_name","sample_type_collected","sample_type_received","sequencing_org","sequencing_org_code","sequencing_submission_date","sequencing_uuid","source_age","source_sex","start_time","submission_org","submission_org_code","submission_user","swab_site","header","sequence_name","unmapped_genome_completeness","cov_id","sample_date","why_excluded","epi_week", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - }, - { - "suffix": "all", - "data": "cog", - "fasta": "aligned" - }, - { - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "source_id","sample_date", "epi_week", "country", "adm1", "adm2", "outer_postcode", "is_surveillance", "is_community", "is_hcw", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", 
"lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "epi_week=edin_epi_week country=adm0 outer_postcode=adm2_private adm1=adm1_raw adm2=adm2_raw" - } - ], - "metadata": [ - { - "suffix": "public", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id adm1=adm1_UK" - }, - { - "suffix": "consortium", - "data": "cog_global", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","source_id","sample_date","received_date", "collection_date", "published_date","epi_week","sequencing_org_code","submission_org_code","submission_user","root_sample_id","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target","collection_pillar", "is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id", - "mutations": true - }, - { - "suffix": "geography", - "data": "cog_global", - "metadata_fields": ["cog_id","sequence_name","sample_date","epi_week","country","adm1","adm2","utla", "utla_code","outer_postcode","adm1_raw","adm2_raw","adm2_source","suggested_adm2_grouping","NUTS1","region","latitude","longitude","location"], - "where": "cog_id=central_sample_id" - 
}, - { - "suffix": "mutations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], - "mutations": true - }, - { - "suffix": "constellations", - "data": "cog_global", - "metadata_fields": ["sequence_name", "sample_date", "lineage","lineages_version", "usher_lineage", "usher_lineages_version"], - "constellations": true - }, - { - "suffix": "unlinked", - "data": "cog_global", - "metadata_fields": ["sequence_name", "safe_sample_date", "epi_week", "safe_location","lineage","lineages_version","usher_lineage", "usher_lineages_version", "is_surveillance", "collection_pillar", "is_pillar_2"], - "mutations": true, - "uk_only": true, - "shuffle": true, - "drop_index": "sequence_name" - }, - { - "data": "cog_global", - "suffix": "epidemiology", - "metadata_fields": ["sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date", "received_date", "sequencing_submission_date", "sequencing_org_code", "root_sample_id", "biosample_source_id", "country", "adm1", "adm2", "utla", "utla_code", "outer_postcode", "NUTS1", "latitude", "longitude", "location", "source_age", "source_sex", "collection_pillar", "is_pillar_2", "is_surveillance", "is_travel_history", "travel_history", "lineage", "lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id" - } - ], - "public": [ - { - "suffix": "all", - "data": "cog", - "fasta": "unaligned" - }, - { - "data": "cog", - "fasta": "trimmed", - "metadata_fields": ["sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version","usher_lineage", "usher_lineages_version", "lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict"], - "mutations": true, - "where": "epi_week=edin_epi_week country=adm0 
adm1=adm1_raw" - }, - { - "suffix": "unmasked", - "data": "cog", - "fasta": "aligned" - } - ], - "civet3": [ - { - "suffix": "private", - "fasta": "cog_global", - "metadata_fields": ["sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], - "where": "gisaid_id=covv_accession_id cog_id=central_sample_id epi_week=edin_epi_week adm1=adm1_UK" - }, - { - "suffix": "mutations", - "data": "cog_global", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} diff --git a/workflows/resources/publish_gisaid_recipes.json b/workflows/resources/publish_gisaid_recipes.json deleted file mode 100644 index 69c88da..0000000 --- a/workflows/resources/publish_gisaid_recipes.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "gisaid": [ - { - "suffix": "all", - "fasta": "gisaid", - "metadata_fields": ["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week country=edin_admin_0" - }, - { - "suffix": "global", - "fasta": "gisaid", - "metadata_fields": 
["sequence_name","country","edin_admin_1","edin_admin_2","edin_travel","edin_date_stamp","sample_date","safe_sample_date","epi_week","epi_day","lineage","lineages_version","lineage_conflict","lineage_ambiguity_score","scorpio_call","scorpio_support","scorpio_conflict","usher_lineage", "usher_lineages_version","covv_accession_id","covv_virus_name","covv_location","covv_add_host_info","covv_assembly_method","covv_gender","covv_host","covv_passage","covv_patient_age","covv_seq_technology","covv_specimen","covv_subm_date","is_uk","is_cog_uk","why_excluded","nucleotide_mutations"], - "mutations": true, - "exclude_cog": true, - "where": "sample_date=covv_collection_date epi_week=edin_epi_week epi_day=edin_epi_day country=edin_admin_0" - }, - { - "suffix": "global_mutations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "mutations": true, - "exclude_cog": true - }, - { - "suffix": "global_constellations", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "constellations": true, - "exclude_cog": true - }, - { - "suffix": "global_updown", - "data": "gisaid", - "metadata_fields": ["sequence_name"], - "updown": true, - "exclude_cog": true - } - ], - "civet3": [ - { - "suffix": "private", - "fasta": "gisaid", - "metadata_fields": ["sequence_name","gisaid_id","sample_date","epi_week","country","adm1","adm2","travel_history","lineage","lineages_version","usher_lineage", "usher_lineages_version", "scorpio_call"], - "where": "gisaid_id=covv_accession_id epi_week=edin_epi_week country=edin_admin_0 adm1=edin_admin_1 adm2=edin_admin_2 travel_history=edin_travel" - }, - { - "suffix": "mutations", - "data": "gisaid", - "updown": true, - "metadata_fields": ["sequence_name", "query"], - "where": "query=sequence_name", - "drop_index": "sequence_name" - } - ] -} diff --git a/workflows/resources/publish_readme.txt b/workflows/resources/publish_readme.txt deleted file mode 100644 index ec36438..0000000 --- a/workflows/resources/publish_readme.txt +++ /dev/null @@ 
-1,36 +0,0 @@ -# Summary of published datapipe outputs - -### Alignments - -- `cog__all.fa` : all unaligned sequences after deduplication -- `cog__all_alignment.fa` : all aligned sequences after deduplication -- `cog__all_metadata.csv` : all corresponding metadata -- `cog__alignment.fa` : filtered, trimmed alignment with sequences matching those in the corresponding metadata -- `cog__metadata.csv` : corresponding metadata for filtered, trimmed alignment - -### Cog - -- `cog.insertions.tsv` and `cog.deletions.tsv` containing all found insertions and deletions for the UK sequences -- `UTLA_genome_counts_.csv` containing counts of delta sequences by date and UTLA - -### Metadata - -- `cog_global__geography.csv` : metadata containing the following columns `"central_sample_id","sequence_name","sample_date","epi_week","country","adm1","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location"` -- `cog_global__mutations.csv` : metadata containing the following columns `"sequence_name", "sample_date", "lineage","lineages_version"` and additionally columns for specifically typed mutations of interest -- `cog_global__public.csv` : metadata containing the following columns `"sequence_name","cog_id","gisaid_id","sample_date","epi_week","country","adm1","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineages_version"` -- `cog_global__consortium.csv` : metadata containing all columns as in the public metadata, extended with the following columns `"received_date","collection_date","published_date","sequencing_org_code","submission_org_code","submission_user","root_sample_id","adm2","outer_postcode","adm2_raw","adm2_source","NUTS1","region","latitude","longitude","location","utla","utla_code","suggested_adm2_grouping","source_age","source_sex","sample_type_collected","sample_type_received","swab_site","ct_n_ct_value","ct_n_test_kit","ct_n_test_platform","ct_n_test_target"` -- `cog__unlinked.csv` : shuffled 
metadata with no ids containing the following columns `"safe_sample_date","epi_week", "location","lineage","lineages_version","is_surveillance", "collection_pillar", "is_pillar_2"` -- `cog_global__epidemiology.csv` : metadata containing the following columns `"sequence_name","cog_id","gisaid_id","sample_date","epi_week","collection_date","received_date","sequencing_submission_date","sequencing_org_code","root_sample_id","biosample_source_id","country","adm1","adm2","utla","utla_code","outer_postcode","NUTS1","latitude","longitude","location","source_age","source_sex","collection_pillar","is_pillar_2","is_surveillance","is_travel_history","travel_history","lineage","lineage_support","lineages_version","scorpio_call","scorpio_support","ambiguity_count"` - -### Public - -- `cog__all.fa` : all unaligned sequences after deduplication -- `cog__unmasked_alignment.fa` : all aligned sequences -- `cog__alignment.fa` : filtered, trimmed alignment with sequences matching those in the corresponding metadata -- `cog__metadata.csv` : corresponding metadata for filtered, trimmed alignment with the following columns `"sequence_name", "country","adm1","is_pillar_2","sample_date", "epi_week","lineage","lineages_version"` - -### Civet3 -- `cog_global__private_alignment.fa` : masked, trimmed, filtered alignment of COG and GLOBAL sequences -- `cog_global__private_metadata.csv` : corresponding metadata with the following columns `"sequence_name","gisaid_id","cog_id","source_id","sample_date","epi_week","country","adm1","adm2","suggested_adm2_grouping","outer_postcode","is_surveillance","is_travel_history","travel_history","is_pillar_2","collection_pillar","lineage","lineages_version","scorpio_call"` -- `cog_global__mutations.csv` : metadata file produced by gofasta updown list, providing information about nucleotide mutations and ambiguous regions in aligned sequences - diff --git a/workflows/resources/resequencing_omissions.txt b/workflows/resources/resequencing_omissions.txt deleted 
file mode 100644 index cd52f25..0000000 --- a/workflows/resources/resequencing_omissions.txt +++ /dev/null @@ -1,15 +0,0 @@ -# resequencing -England/PHEC-20170/2020 - -England/PHEC-1AFD9/2020 -England/PHEC-1AFF7/2020 -England/PHEC-1B002/2020 -England/PHEC-1B011/2020 -England/PHEC-1B020/2020 -England/PHEC-1B03F/2020 -England/PHEC-1B04E/2020 -England/PHEC-1B05D/2020 -England/PHEC-1B06C/2020 -England/PHEC-1B07B/2020 -England/PHEC-1B08A/2020 -England/PHEC-1B099/2020 From 343dbeebc541fb9bf456da25117e6bbc8de552b1 Mon Sep 17 00:00:00 2001 From: Tom Whalley Date: Tue, 7 Jan 2025 10:43:54 +0000 Subject: [PATCH 4/4] Update nextflow.config to remove slurm profile --- nextflow.config | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nextflow.config b/nextflow.config index 0909731..f9b0d9b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,10 +23,3 @@ process { maxRetries = 5 } } - -profiles { - slurm { - process.executor = 'slurm' - process.clusterOptions='--account=lomannj-covid-19-realtime-epidemiology --qos=lomannj --time 600:0 --nodes 1' - } -}