diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index 70865465e9..b0edf65793 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157"; our $ENSEMBL_GENOMES_USER = "anonymous"; ## Vertebrates -our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38"; -our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38"; -our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38"; -our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38"; +our $HOMO_SAPIENS_CORE = "homo_sapiens_core_111_38"; +our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_111_38"; +our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_111_38"; +our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_111_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; diff --git a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl index de55722396..b1f4004f2c 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/protein_function_prediction_matrices.pl @@ -6,6 +6,10 @@ use Digest::MD5 qw(md5 md5_hex md5_base64); use JSON; +#use lib "~/appl/cellbase/build/scripts/ensembl-scripts/"; +#use lib "~/soft/ensembl-variation/modules/"; +#use lib "~/soft/ensembl/modules/"; + use DB_CONFIG; my $species = 'Homo sapiens'; @@ -87,6 +91,37 @@ #} #print join("=", $polyphen2->get_prediction(1, 'G'))."\n"; +################################################################## + +# Get the current time +my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); +# Adjust the year and month values (year is years since 1900, and month is 0-based) + +$year += 1900; +$mon += 1; + +# Format the date and time +my $formatted_date = sprintf("%04d%02d%02d_%02d%02d%02d", $year, $mon, $mday, $hour, $min, $sec); + +my $jsonVersion = {}; +$jsonVersion->{"date"} = $formatted_date; +$jsonVersion->{"data"} = "protein_substitution_predictions"; +$jsonVersion->{"version"} = "Ensembl 104"; +my @urls = (); +push @urls, "ensembldb.ensembl.org:3306"; +$jsonVersion->{"url"} = \@urls; + +print "Generating the JSON file for the Sift version.\n"; +$jsonVersion->{"name"} = "sift"; +open(FILE, ">".$outdir."/siftVersion.json") || die "error opening file\n"; +print FILE to_json($jsonVersion) . "\n"; +close(FILE); + +print "Generating the JSON file for the PolyPhen version\n"; +$jsonVersion->{"name"} = "polyphen"; +open(FILE, ">".$outdir."/polyphenVersion.json") || die "error opening file\n"; +print FILE to_json($jsonVersion) . 
"\n"; +close(FILE); my ($translation, $seq, $md5seq, @preds, @all_predictions); #my @transcripts = @{$transcript_adaptor->fetch_all_by_biotype('protein_coding')}; @@ -126,42 +161,56 @@ ## HASH ## my $effect = {}; + $effect->{"chromosome"} = $trans->seq_region_name; $effect->{"transcriptId"} = $trans->stable_id; - $effect->{"checksum"} = $md5seq; - $effect->{"size"} = length($seq); foreach my $u (@{ $trans->get_all_xrefs('Uniprot/SWISSPROT') }){ $effect->{"uniprotId"} = $u->display_id(); } + $effect->{"source"} = "polyphen"; my $polyphen2 = $prot_function_adaptor->fetch_polyphen_predictions_by_translation_md5($md5seq); - for(my $i=1; $i<=length($seq); $i++) { - foreach (my $j=0; $j < @aa_code; $j++) { - if(defined $polyphen2) { + if(defined $polyphen2) { + for(my $i=1; $i<=length($seq); $i++) { + $effect->{"aaPosition"} = $i; + my @scores = (); + foreach (my $j=0; $j < @aa_code; $j++) { @preds = $polyphen2->get_prediction($i, $aa_code[$j]); - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"pe"} = $effect_code{$preds[0]}; - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"ps"} = $preds[1]; + if(defined $preds[0] || defined $preds[1]) { + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[1], "effect" => $preds[0]}; + $effect->{"scores"} = \@scores; + } + } + if(@scores) { + print FILE to_json($effect)."\n"; } } } - my $sift = $prot_function_adaptor->fetch_sift_predictions_by_translation_md5($md5seq); - for(my $i=1; $i<=length($seq); $i++) { - foreach (my $j=0; $j < @aa_code; $j++) { - if(defined $sift) { - @preds = $sift->get_prediction($i, $aa_code[$j]); - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"se"} = $effect_code{$preds[0]}; - $effect->{"aaPositions"}->{$i}->{$aa_code[$j]}->{"ss"} = $preds[1]; - } - } - } - print FILE to_json($effect)."\n"; + $effect->{"source"} = "sift"; + my $sift = $prot_function_adaptor->fetch_sift_predictions_by_translation_md5($md5seq); + if(defined $sift) { + for(my $i=1; $i<=length($seq); $i++) { + $effect->{"aaPosition"} = $i; + my @scores = (); + foreach (my $j=0; $j < @aa_code; $j++) { + @preds = $sift->get_prediction($i, $aa_code[$j]); + if(defined $preds[0] || defined $preds[1]) { + push @scores, {"aaAlternate" => $aa_code[$j], "score" => $preds[1], "effect" => $preds[0]}; + $effect->{"scores"} = \@scores; + } + } + if(@scores) { + print FILE to_json($effect)."\n"; + } + } + } } } close(FILE); ## GZip output to save space in Amazon AWS -# exec("gzip prot_func_pred_chr_".$chrom->seq_region_name); + exec("gzip " . $outdir . "/prot_func_pred_chr_" . $chr->seq_region_name . ".json"); } sub print_parameters { diff --git a/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh new file mode 100755 index 0000000000..38c7d1efa2 --- /dev/null +++ b/cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# The original MirTarBase hsa_MTI.xlsx contains invalid Gene Symbols in 793 lines. 
+# To fix it, the file first has to be converted to a CSV file, i.e.: hsa_MTI.csv +# +# After converting to CSV, we can see the errors in the Gene Symbols (column 4) of the original file, +# e.g.: 06-mar: +# MIRT050267,hsa-miR-25-3p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,06-mar,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# +# This script fixes those lines and converts column 4 to a valid Gene Symbol: +# +# MIRT050267,hsa-miR-25-3p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 +# MIRT051174,hsa-miR-16-5p,Homo sapiens,MARCHF6,10299,Homo sapiens,CLASH,Functional MTI (Weak),23622248 + +# Check the number of parameters +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <csv_file>" + exit 1 +fi + +# Check CSV file +csv_file="$1" +if [ ! -f "$csv_file" ]; then + echo "CSV file '$csv_file' does not exist." + exit 1 +fi + +# Fix gene symbols +while IFS=$'\t' read -r c1 c2 c3 c4 c5 c6 c7 c8 c9 || [[ -n "$c1" ]]; do + # Apply the conditions + if [ "$c5" = "10299" ]; then + c4="MARCHF6" + elif [ "$c5" = "51257" ]; then + c4="MARCHF2" + elif [ "$c5" = "54708" ]; then + c4="MARCHF5" + elif [ "$c5" = "54996" ]; then + c4="MTARC2" + elif [ "$c5" = "55016" ]; then + c4="MARCHF1" + elif [ "$c5" = "57574" ]; then + c4="MARCHF4" + elif [ "$c5" = "64757" ]; then + c4="MTARC1" + elif [ "$c5" = "64844" ]; then + c4="MARCHF7" + elif [ "$c5" = "92979" ]; then + c4="MARCHF9" + elif [ "$c5" = "115123" ]; then + c4="MARCHF3" + elif [ "$c5" = "220972" ]; then + c4="MARCHF8" + elif [ "$c5" = "441061" ]; then + c4="MARCHF11" + fi + + # Print the fixed line + echo -e "$c1\t$c2\t$c3\t$c4\t$c5\t$c6\t$c7\t$c8\t$c9" +done < "$csv_file" diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 6049ef9b4b..4f830c6e43 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -19,11 +19,14 @@ import com.beust.jcommander.*; import org.opencb.cellbase.app.cli.CliOptionsParser; import org.opencb.cellbase.core.api.key.ApiKeyQuota; +import org.opencb.cellbase.lib.EtlCommons; import java.util.HashMap; import java.util.List; import java.util.Map; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15.
*/ @@ -87,12 +90,15 @@ public class DownloadCommandOptions { @ParametersDelegate public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, " - + "variation, variation_functional_score, regulation, protein, conservation, " - + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: " + GENOME_DATA + "," + GENE_DATA + + "," + VARIATION_FUNCTIONAL_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," + CONSERVATION_DATA + "," + + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + PUBMED_DATA + "," + PHARMACOGENOMICS_DATA + + "," + PGS_DATA + "," + REVEL_DATA + "," + ALPHAMISSENSE_DATA + "; or use 'all' to download everything", required = true, + arity = 1) public String data; - @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1) + @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, + arity = 1) public String outputDirectory; } @@ -102,9 +108,11 @@ public class BuildCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, " - + "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, " - + "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: " + GENOME_DATA + "," + GENE_DATA + "," + + VARIATION_FUNCTIONAL_SCORE_DATA + "," + REGULATION_DATA + "," + PROTEIN_DATA + "," + CONSERVATION_DATA + "," + + CLINICAL_VARIANT_DATA + "," + REPEATS_DATA + "," + ONTOLOGY_DATA + "," + SPLICE_SCORE_DATA + "," + PUBMED_DATA + "," + + PHARMACOGENOMICS_DATA + "," + PGS_DATA + "," + REVEL_DATA + "," + ALPHAMISSENSE_DATA + "; or use 'all' to build" + + " everything", required = true, arity = 1) public String data; @Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1) @@ -190,8 +198,9 @@ public class LoadCommandOptions { public CommonCommandOptions commonOptions = commonCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation," - + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics." 
- + " 'all' loads everything", required = true, arity = 1) + + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics," + + " protein_functional_prediction, missense_variation_functional_score, alphamissense; and 'all' loads everything", + required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, @@ -237,8 +246,8 @@ public class ExportCommandOptions { public CommonCommandOptions commonOptions = commonCommandOptions; @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, " - + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' " - + " loads everything", required = true, arity = 1) + + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA + ", conservation, regulation, protein, clinical_variants, repeats," + + " regulatory_pfm, splice_score, pubmed. 'all' export everything", required = true, arity = 1) public String data; @Parameter(names = {"--db", "--database"}, description = "Database name, e.g., cellbase_hsapiens_grch38_v5", required = true, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java index 10c43d637c..fecf57c08a 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminMain.java @@ -98,10 +98,10 @@ public static void main(String[] args) { commandExecutor.execute(); } catch (IOException | URISyntaxException | CellBaseException e) { commandExecutor.getLogger().error("Error: " + e.getMessage()); + e.printStackTrace(); System.exit(1); } } } } - } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 8c0d477023..380cbdaaba 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -17,11 +17,14 @@ package org.opencb.cellbase.app.cli.admin.executors; import com.beust.jcommander.ParameterException; -import org.apache.commons.lang.StringUtils; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.lang3.StringUtils; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; @@ -33,12 +36,16 @@ import java.io.File; import java.io.IOException; -import java.nio.file.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Collections; import java.util.List; -import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA; +import static org.opencb.cellbase.core.utils.SpeciesUtils.getSpeciesShortname; +import static 
org.opencb.cellbase.lib.EtlCommons.*; /** * Created by imedina on 03/02/15. @@ -51,11 +58,16 @@ public class BuildCommandExecutor extends CommandExecutor { private Path downloadFolder = null; // /_/download private boolean normalize = true; - private File ensemblScriptsFolder; + private SpeciesConfiguration.Assembly assembly; + private String ensemblRelease; private boolean flexibleGTFParsing; private SpeciesConfiguration speciesConfiguration; + private static final List<String> VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, + PUBMED_DATA, PHARMACOGENOMICS_DATA, REVEL_DATA, ALPHAMISSENSE_DATA, PGS_DATA); + public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -63,16 +75,20 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma this.output = Paths.get(buildCommandOptions.outputDirectory); normalize = !buildCommandOptions.skipNormalize; - this.ensemblScriptsFolder = new File(System.getProperty("basedir") + "/bin/ensembl-scripts/"); this.flexibleGTFParsing = buildCommandOptions.flexibleGTFParsing; } - /** * Parse specific 'build' command options. + * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { + String data = null; try { + // Check data sources + List<String> dataList = checkDataSources(); + // Output directory needs to be created if it doesn't exist if (!Files.exists(output)) { Files.createDirectories(output); } @@ -82,7 +98,9 @@ public void execute() { if (speciesConfiguration == null) { throw new CellBaseException("Invalid species: '" + buildCommandOptions.species + "'"); } - SpeciesConfiguration.Assembly assembly = null; + if (!StringUtils.isEmpty(buildCommandOptions.assembly)) { assembly = SpeciesUtils.getAssembly(speciesConfiguration, buildCommandOptions.assembly); if (assembly == null) { @@ -92,7 +110,10 @@ public void execute() { assembly = SpeciesUtils.getDefaultAssembly(speciesConfiguration); } - String spShortName = SpeciesUtils.getSpeciesShortname(speciesConfiguration); + String ensemblVersion = assembly.getEnsemblVersion(); + ensemblRelease = "release-" + ensemblVersion.split("_")[0]; + + String spShortName = getSpeciesShortname(speciesConfiguration); String spAssembly = assembly.getName().toLowerCase(); Path spFolder = output.resolve(spShortName + "_" + spAssembly); // /_/download @@ -106,262 +127,211 @@ public void execute() { makeDir(buildFolder); } - if (buildCommandOptions.data != null) { - String[] buildOptions; - if (buildCommandOptions.data.equals("all")) { - buildOptions = speciesConfiguration.getData().toArray(new String[0]); - } else { - buildOptions = buildCommandOptions.data.split(","); + CellBaseBuilder parser; + for (int i = 0; i < dataList.size(); i++) { + data = dataList.get(i); + switch (data) { + case GENOME_DATA: + parser = buildGenomeSequence(); + break; + case GENE_DATA: + parser = buildGene(); + break; + case VARIATION_FUNCTIONAL_SCORE_DATA: + parser = buildCadd(); + break; + case REVEL_DATA: + parser = buildRevel(); + break; + case REGULATION_DATA: + parser = buildRegulation(); + break; + case PROTEIN_DATA: + parser = buildProtein(); + break; + case CONSERVATION_DATA: + parser = buildConservation(); + break; + case 
CLINICAL_VARIANT_DATA: + parser = buildClinicalVariants(); + break; + case REPEATS_DATA: + parser = buildRepeats(); + break; + case ONTOLOGY_DATA: + parser = buildObo(); + break; + case SPLICE_SCORE_DATA: + parser = buildSplice(); + break; + case PUBMED_DATA: + parser = buildPubMed(); + break; + case PHARMACOGENOMICS_DATA: + parser = buildPharmacogenomics(); + break; + case PGS_DATA: + parser = buildPolygenicScores(); + break; + case ALPHAMISSENSE_DATA: + parser = buildAlphaMissense(); + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter." + + " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build" + + " everything"); } - for (int i = 0; i < buildOptions.length; i++) { - String buildOption = buildOptions[i]; - - logger.info("Building '{}' data", buildOption); - CellBaseBuilder parser = null; - switch (buildOption) { -// case EtlCommons.GENOME_INFO_DATA: -// buildGenomeInfo(); -// break; - case EtlCommons.GENOME_DATA: - parser = buildGenomeSequence(); - break; - case EtlCommons.GENE_DATA: - parser = buildGene(); - break; - case EtlCommons.REFSEQ_DATA: - parser = buildRefSeq(); - break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: - parser = buildCadd(); - break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - parser = buildRevel(); - break; - case EtlCommons.REGULATION_DATA: - parser = buildRegulation(); - break; - case EtlCommons.PROTEIN_DATA: - parser = buildProtein(); - break; -// case EtlCommons.PPI_DATA: -// parser = getInteractionParser(); -// break; - case EtlCommons.CONSERVATION_DATA: - parser = buildConservation(); - break; - case EtlCommons.CLINICAL_VARIANTS_DATA: - parser = buildClinicalVariants(); - break; - case EtlCommons.REPEATS_DATA: - parser = buildRepeats(); - break; - case EtlCommons.OBO_DATA: - parser = buildObo(); - break; - case EtlCommons.SPLICE_SCORE_DATA: - parser = buildSplice(); - break; - case EtlCommons.PUBMED_DATA: - parser = buildPubMed(); - break; - case EtlCommons.PHARMACOGENOMICS_DATA: - parser = buildPharmacogenomics(); - break; - default: - logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); - break; - } - - if (parser != null) { - try { - parser.parse(); - } catch (Exception e) { - logger.error("Error executing 'build' command " + buildCommandOptions.data + ": " + e.getMessage(), e); - } - parser.disconnect(); - } + if (parser != null) { + parser.parse(); + parser.disconnect(); } } - } catch (ParameterException e) { - logger.error("Error parsing build command line parameters: " + e.getMessage(), e); - } catch (IOException | CellBaseException e) { - logger.error(e.getMessage()); + } catch (Exception e) { + String msg = "Error executing the command 'build'"; + if (StringUtils.isNotEmpty(data)) { + msg += ". The last data being built was '" + data + "'"; + } + throw new CellBaseException(msg + ": " + e.getMessage(), e); } } - private CellBaseBuilder buildRepeats() { - Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILE))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILE))); - copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILE))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? 
- CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON); - return new RepeatsBuilder(repeatsFilesDir, serializer); + private CellBaseBuilder buildRepeats() throws CellBaseException { + // Sanity check + Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA); + List<Path> versionPaths = Arrays.asList(repeatsDownloadPath.resolve(getDataVersionFilename(TRF_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(GSD_DATA)), + repeatsDownloadPath.resolve(getDataVersionFilename(WM_DATA))); + copyVersionFiles(versionPaths, buildFolder.resolve(REPEATS_DATA)); + + // Create serializer and return the repeats builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_DATA), REPEATS_BASENAME); + return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration); } - private CellBaseBuilder buildObo() { - Path oboDir = downloadFolder.resolve(EtlCommons.OBO_DATA); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.OBO_JSON); - return new OntologyBuilder(oboDir, serializer); + private CellBaseBuilder buildObo() throws CellBaseException { + Path oboDownloadPath = downloadFolder.resolve(ONTOLOGY_DATA); + Path oboBuildPath = buildFolder.resolve(ONTOLOGY_DATA); + List<Path> versionPaths = Arrays.asList(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), + oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA))); + copyVersionFiles(versionPaths, oboBuildPath); + + // Create serializer and return the ontology builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(oboBuildPath, OBO_BASENAME); + return new OntologyBuilder(oboDownloadPath, serializer); } + /** + * @deprecated Use the new copyVersionFiles(List<Path>, Path) instead. + */ + @Deprecated private void copyVersionFiles(List<Path> pathList) { for (Path path : pathList) { try { Files.copy(path, downloadFolder.resolve(path.getFileName()), StandardCopyOption.REPLACE_EXISTING); } catch (IOException e) { - logger.warn("Version file {} not found - skipping", path.toString()); + logger.warn("Version file {} not found - skipping", path); } } } -// private void buildGenomeInfo() { -// /** -// * To get some extra info about the genome such as chromosome length or cytobands -// * we execute the following script. -// */ -// try { -// String outputFileName = downloadFolder.resolve("genome_info.json").toAbsolutePath().toString(); -// List args = new ArrayList<>(); -// args.addAll(Arrays.asList("--species", speciesConfigurathtion.getScientificName(), -// "--assembly", buildCommandOptions.assembly == null ? 
getDefaultHumanAssembly() : buildCommandOptions.assembly, -// "-o", outputFileName, -// "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs())); -// if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration) -// && !speciesConfiguration.getScientificName().equals("Drosophila melanogaster")) { -// args.add("--phylo"); -// args.add("no-vertebrate"); -// } -// -// String geneInfoLogFileName = downloadFolder.resolve("genome_info.log").toAbsolutePath().toString(); -// -// boolean downloadedGenomeInfo; -// downloadedGenomeInfo = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, "./genome_info.pl", args, geneInfoLogFileName); -// -// if (downloadedGenomeInfo) { -// logger.info(outputFileName + " created OK"); -// } else { -// logger.error("Genome info for " + speciesConfiguration.getScientificName() + " cannot be downloaded"); -// } -// } catch (IOException | InterruptedException e) { -// e.printStackTrace(); -// } -// } - - private CellBaseBuilder buildGenomeSequence() { - copyVersionFiles(Collections.singletonList(downloadFolder.resolve("genome/genomeVersion.json"))); - Path fastaFile = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "genome_sequence"); - return new GenomeSequenceFastaBuilder(fastaFile, serializer); + private CellBaseBuilder buildGenomeSequence() throws CellBaseException { + // Sanity check + Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA)); + copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA)); + + // Get FASTA path + Path fastaPath = getFastaReferenceGenome(); + + // Create serializer and return the genome builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA); + return new GenomeSequenceFastaBuilder(fastaPath, serializer); } private CellBaseBuilder buildGene() throws CellBaseException { - Path geneFolderPath = downloadFolder.resolve("gene"); - copyVersionFiles(Arrays.asList(geneFolderPath.resolve("dgidbVersion.json"), - geneFolderPath.resolve("ensemblCoreVersion.json"), geneFolderPath.resolve("uniprotXrefVersion.json"), - geneFolderPath.resolve("geneExpressionAtlasVersion.json"), - geneFolderPath.resolve("hpoVersion.json"), geneFolderPath.resolve("disgenetVersion.json"), - geneFolderPath.resolve("gnomadVersion.json"))); - Path genomeFastaFilePath = getFastaReferenceGenome(); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "gene"); - return new GeneBuilder(geneFolderPath, genomeFastaFilePath, speciesConfiguration, flexibleGTFParsing, serializer); + return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing); } - private CellBaseBuilder buildRefSeq() { - Path refseqFolderPath = downloadFolder.resolve("refseq"); - copyVersionFiles(Arrays.asList(refseqFolderPath.resolve("refSeqVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "refseq"); - return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer); - } + private CellBaseBuilder buildCadd() throws CellBaseException { + // Sanity check + Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + Path caddBuildPath = buildFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA); + 
copyVersionFiles(Collections.singletonList(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA))), caddBuildPath); - private CellBaseBuilder buildCadd() { - Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score"); - copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json"))); - Path caddFilePath = variationFunctionalScorePath.resolve("whole_genome_SNVs.tsv.gz"); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "cadd"); - return new CaddScoreBuilder(caddFilePath, serializer); + // Create the file serializer and the CADD score builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(caddBuildPath, CADD_DATA); + return new CaddScoreBuilder(caddDownloadPath, serializer); } - private CellBaseBuilder buildRevel() { - Path missensePredictionScorePath = downloadFolder.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - copyVersionFiles(Arrays.asList(missensePredictionScorePath.resolve("revelVersion.json"))); - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.MISSENSE_VARIATION_SCORE_DATA); - return new RevelScoreBuilder(missensePredictionScorePath, serializer); - } + private CellBaseBuilder buildRevel() throws CellBaseException { + // Sanity check + Path revelDownloadPath = downloadFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + Path revelBuildPath = buildFolder.resolve(MISSENSE_VARIATION_SCORE_DATA).resolve(REVEL_DATA); + copyVersionFiles(Collections.singletonList(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA))), revelBuildPath); - private CellBaseBuilder buildRegulation() { - Path regulatoryRegionFilesDir = downloadFolder.resolve("regulation"); - copyVersionFiles(Collections.singletonList(regulatoryRegionFilesDir.resolve("ensemblRegulationVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_region"); - return new RegulatoryFeatureBuilder(regulatoryRegionFilesDir, serializer); + // Create the file serializer and the Revel score builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(revelBuildPath, REVEL_DATA); + return new RevelScoreBuilder(revelDownloadPath, serializer); } - private CellBaseBuilder buildProtein() { - Path proteinFolder = downloadFolder.resolve("protein"); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("uniprotVersion.json"), - proteinFolder.resolve("interproVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein"); - return new ProteinBuilder(proteinFolder.resolve("uniprot_chunks"), - downloadFolder.resolve("protein").resolve("protein2ipr.dat.gz"), speciesConfiguration.getScientificName(), serializer); - } + private CellBaseBuilder buildRegulation() throws CellBaseException { + // Sanity check + Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA); + Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA); + copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)), + regulationDownloadPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))), regulationBuildPath); - private void getProteinFunctionPredictionMatrices(SpeciesConfiguration sp, Path geneFolder) - throws IOException, InterruptedException { - logger.info("Downloading protein function prediction matrices ..."); - - // run protein_function_prediction_matrices.pl - String proteinFunctionProcessLogFile = 
geneFolder.resolve("protein_function_prediction_matrices.log").toString(); - List args = Arrays.asList("--species", sp.getScientificName(), "--outdir", geneFolder.toString(), - "--ensembl-libs", configuration.getDownload().getEnsembl().getLibs()); - - boolean proteinFunctionPredictionMatricesObtaines = EtlCommons.runCommandLineProcess(ensemblScriptsFolder, - "./protein_function_prediction_matrices.pl", - args, - proteinFunctionProcessLogFile); - - // check output - if (proteinFunctionPredictionMatricesObtaines) { - logger.info("Protein function prediction matrices created OK"); - } else { - logger.error("Protein function prediction matrices for " + sp.getScientificName() + " cannot be downloaded"); - } + // Create the file serializer and the regulatory feature builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME); + return new RegulatoryFeatureBuilder(regulationDownloadPath, serializer); } - private CellBaseBuilder getInteractionParser() { - Path proteinFolder = downloadFolder.resolve("protein"); - Path psimiTabFile = proteinFolder.resolve("intact.txt"); - copyVersionFiles(Arrays.asList(proteinFolder.resolve("intactVersion.json"))); - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "protein_protein_interaction"); - return new InteractionBuilder(psimiTabFile, speciesConfiguration.getScientificName(), serializer); + private CellBaseBuilder buildProtein() throws CellBaseException { + // Sanity check + Path proteinDownloadPath = downloadFolder.resolve(PROTEIN_DATA); + Path proteinBuildPath = buildFolder.resolve(PROTEIN_DATA); + copyVersionFiles(Arrays.asList(proteinDownloadPath.resolve(getDataVersionFilename(UNIPROT_DATA)), + proteinDownloadPath.resolve(getDataVersionFilename(INTERPRO_DATA))), proteinBuildPath); + + // Create the file serializer and the protein builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(proteinBuildPath, PROTEIN_DATA); + return new ProteinBuilder(proteinDownloadPath, speciesConfiguration.getScientificName(), serializer); } - private CellBaseBuilder buildConservation() { - Path conservationFilesDir = downloadFolder.resolve("conservation"); - copyVersionFiles(Arrays.asList(conservationFilesDir.resolve("gerpVersion.json"), - conservationFilesDir.resolve("phastConsVersion.json"), - conservationFilesDir.resolve("phyloPVersion.json"))); - // TODO: chunk size is not really used in ConvervedRegionParser, remove? 
+ private CellBaseBuilder buildConservation() throws CellBaseException { + // Sanity check + Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); + Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); + int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder); - return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); + return new ConservationBuilder(conservationDownloadPath, conservationChunkSize, serializer); } - private CellBaseBuilder buildClinicalVariants() { - Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER); - copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json"))); - copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json"))); - - CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, - EtlCommons.CLINICAL_VARIANTS_JSON_FILE.replace(".json.gz", ""), true); - return new ClinicalVariantBuilder(clinicalVariantFolder, normalize, getFastaReferenceGenome(), + private CellBaseBuilder buildClinicalVariants() throws CellBaseException { + // Sanity check + Path clinicalDownloadPath = downloadFolder.resolve(CLINICAL_VARIANT_DATA); + Path clinicalBuildPath = buildFolder.resolve(CLINICAL_VARIANT_DATA); + copyVersionFiles(Arrays.asList(clinicalDownloadPath.resolve(getDataVersionFilename(CLINVAR_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(COSMIC_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(HGMD_DATA)), + clinicalDownloadPath.resolve(getDataVersionFilename(GWAS_DATA))), clinicalBuildPath); + + // Create the file serializer and the clinical variants builder + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(clinicalBuildPath, CLINICAL_VARIANTS_BASENAME, true); + return new ClinicalVariantBuilder(clinicalDownloadPath, normalize, getFastaReferenceGenome(), buildCommandOptions.assembly == null ? getDefaultHumanAssembly() : buildCommandOptions.assembly, - serializer); + configuration, serializer); } private String getDefaultHumanAssembly() { for (SpeciesConfiguration species : configuration.getSpecies().getVertebrates()) { - if (species.getId().equals("hsapiens")) { + if (species.getId().equals(HSAPIENS_NAME)) { return species.getAssemblies().get(0).getName(); } } @@ -370,19 +340,30 @@ private String getDefaultHumanAssembly() { + "configuration file. 
No hsapiens data found within the configuration.json file"); } - private Path getFastaReferenceGenome() { - Path fastaFile = null; - try { - DirectoryStream<Path> stream = Files.newDirectoryStream(downloadFolder.resolve("genome"), entry -> { - return entry.toString().endsWith(".fa"); - }); - for (Path entry : stream) { - fastaFile = entry; + private Path getFastaReferenceGenome() throws CellBaseException { + // Check FASTA and unzip if necessary + String ensemblUrl = getEnsemblUrl(configuration.getDownload().getEnsembl(), ensemblRelease, ENSEMBL_PRIMARY_FA_FILE_ID, + getSpeciesShortname(speciesConfiguration), assembly.getName(), null); + String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); + Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); + if (fastaPath.toFile().exists()) { + // Gunzip + logger.info("Gunzip file: {}", fastaPath); + try { + EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e); } - } catch (IOException e) { - e.printStackTrace(); } - return fastaFile; + fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(".gz", "")); + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; } private CellBaseBuilder buildSplice() throws IOException { @@ -402,39 +383,136 @@ private CellBaseBuilder buildSplice() throws IOException { return new SpliceBuilder(spliceInputFolder, serializer); } - private CellBaseBuilder buildPubMed() throws IOException { - Path pubmedInputFolder = downloadFolder.resolve(EtlCommons.PUBMED_DATA); - Path pubmedOutputFolder = buildFolder.resolve(EtlCommons.PUBMED_DATA); - if (!pubmedOutputFolder.toFile().exists()) { - pubmedOutputFolder.toFile().mkdirs(); + private CellBaseBuilder buildPubMed() throws CellBaseException { + // Sanity check + Path pubMedDownloadPath = downloadFolder.resolve(PUBMED_DATA); + Path pubMedBuildPath = buildFolder.resolve(PUBMED_DATA); + copyVersionFiles(Collections.singletonList(pubMedDownloadPath.resolve(getDataVersionFilename(PUBMED_DATA))), pubMedBuildPath); + + // Create the file serializer and the PubMed builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubMedBuildPath); + return new PubMedBuilder(pubMedDownloadPath, serializer, configuration); + } + + private CellBaseBuilder buildPharmacogenomics() throws CellBaseException { + // Sanity check + Path pharmGkbDownloadPath = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Path pharmGkbBuildPath = buildFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + copyVersionFiles(Arrays.asList(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA))), pharmGkbBuildPath); + + // Create the file serializer and the PharmGKB builder + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pharmGkbBuildPath); + return new PharmGKBBuilder(pharmGkbDownloadPath, serializer); + } + + private void checkVersionFiles(List<Path> versionPaths) throws CellBaseException { + ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class); + for (Path versionPath : versionPaths) { + if 
(!versionPath.toFile().exists()) { + throw new CellBaseException("Version file " + versionPath + " does not exist: this file is mandatory for version control"); + } + try { + DataSource dataSource = dataSourceReader.readValue(versionPath.toFile()); + if (StringUtils.isEmpty(dataSource.getVersion())) { + throw new CellBaseException("Missing version in file " + versionPath + ": a version must be specified in the" + " file"); + } + } catch (IOException e) { + throw new CellBaseException("Error parsing the version file " + versionPath, e); + } } + } - logger.info("Copying PubMed version file..."); - if (pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME).toFile().exists()) { - Files.copy(pubmedInputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - pubmedOutputFolder.resolve(EtlCommons.PUBMED_VERSION_FILENAME), - StandardCopyOption.REPLACE_EXISTING); + private void copyVersionFiles(List<Path> versionPaths, Path targetPath) throws CellBaseException { + // Check version files before copying them + checkVersionFiles(versionPaths); + if (!targetPath.toFile().exists()) { + try { + Files.createDirectories(targetPath); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + targetPath, e); + } } - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder); - return new PubMedBuilder(pubmedInputFolder, serializer); + for (Path versionPath : versionPaths) { + try { + Files.copy(versionPath, targetPath.resolve(versionPath.getFileName()), StandardCopyOption.REPLACE_EXISTING); + } catch (IOException e) { + throw new CellBaseException("Error copying version file " + versionPath + " to " + targetPath, e); + } + // Sanity check after copying + if (!targetPath.resolve(versionPath.getFileName()).toFile().exists()) { + throw new CellBaseException("Something wrong happened when copying version file " + versionPath + " to " + targetPath); + } + } + } + + private List<String> checkDataSources() { + if (StringUtils.isEmpty(buildCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); + } + List<String> dataList = Arrays.asList(buildCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case REFSEQ_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REVEL_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANT_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case SPLICE_SCORE_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + case ALPHAMISSENSE_DATA: + case PGS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. 
Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build everything"); + } + } + return dataList; } - private CellBaseBuilder buildPharmacogenomics() throws IOException { - Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); - Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); + private CellBaseBuilder buildPolygenicScores() throws IOException { + Path inFolder = downloadFolder.resolve(EtlCommons.PGS_DATA); + Path outFolder = buildFolder.resolve(EtlCommons.PGS_DATA); if (!outFolder.toFile().exists()) { outFolder.toFile().mkdirs(); } - logger.info("Copying PharmGKB version file..."); - if (inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME).toFile().exists()) { - Files.copy(inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), - outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), + logger.info("Copying PGS version file..."); + if (inFolder.resolve(PGS_CATALOG_VERSION_FILENAME).toFile().exists()) { + Files.copy(inFolder.resolve(PGS_CATALOG_VERSION_FILENAME), outFolder.resolve(PGS_CATALOG_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); } - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); - return new PharmGKBBuilder(inFolder, serializer); + String basename = PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME.split("\\.")[0]; + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder, basename); + return new PolygenicScoreBuilder(PGS_CATALOG_NAME, configuration.getDownload().getPgs().getVersion(), inFolder, serializer); + } + + private CellBaseBuilder buildAlphaMissense() throws IOException { + Path inputFolder = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + Path outputFolder = buildFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + if (!outputFolder.toFile().exists()) { + outputFolder.toFile().mkdirs(); + } + + logger.info("Copying AlphaMissense version file..."); + if (inputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME).toFile().exists()) { + Files.copy(inputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), + outputFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME), StandardCopyOption.REPLACE_EXISTING); + } + + File alphaMissenseFile = inputFolder.resolve(EtlCommons.ALPHAMISSENSE_RAW_FILENAME).toFile(); + String basename = EtlCommons.ALPHAMISSENSE_JSON_FILENAME.replace(".json.gz", ""); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outputFolder, basename); + return new AlphaMissenseBuilder(alphaMissenseFile, serializer); } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index f8197e6558..faf383ba26 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -16,26 +16,22 @@ package org.opencb.cellbase.app.cli.admin.executors; -import com.beust.jcommander.ParameterException; import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.app.cli.CommandExecutor; import org.opencb.cellbase.app.cli.admin.AdminCliOptionsParser; -import org.opencb.cellbase.core.config.SpeciesConfiguration; import 
org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.core.utils.SpeciesUtils; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.download.AbstractDownloadManager; import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.cellbase.lib.download.Downloader; -import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. */ @@ -44,6 +40,10 @@ public class DownloadCommandExecutor extends CommandExecutor { private AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions; private Path outputDirectory; + private static final List<String> VALID_SOURCES_TO_DOWNLOAD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, + REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA, ONTOLOGY_DATA, PUBMED_DATA, + PHARMACOGENOMICS_DATA, REVEL_DATA, ALPHAMISSENSE_DATA, PGS_DATA); + public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions downloadCommandOptions) { super(downloadCommandOptions.commonOptions.logLevel, downloadCommandOptions.commonOptions.conf); @@ -52,88 +52,103 @@ public DownloadCommandExecutor(AdminCliOptionsParser.DownloadCommandOptions down } /** - * Execute specific 'download' command options. + * Process CellBase command 'download'. + * + * @throws CellBaseException Exception */ - public void execute() { + public void execute() throws CellBaseException { try { String species = downloadCommandOptions.speciesAndAssemblyOptions.species; String assembly = downloadCommandOptions.speciesAndAssemblyOptions.assembly; List<DownloadFile> downloadFiles = new ArrayList<>(); - List<String> dataList = getDataList(species); + List<String> dataList = checkDataSources(); Downloader downloader = new Downloader(species, assembly, outputDirectory, configuration); for (String data : dataList) { switch (data) { - case EtlCommons.GENOME_DATA: + case GENOME_DATA: downloadFiles.addAll(downloader.downloadGenome()); break; - case EtlCommons.GENE_DATA: + case GENE_DATA: downloadFiles.addAll(downloader.downloadGene()); break; -// case EtlCommons.VARIATION_DATA: -// downloadManager.downloadVariation(); -// break; - case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: downloadFiles.addAll(downloader.downloadCaddScores()); break; - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: - downloadFiles.addAll(downloader.downloadPredictionScores()); + case REVEL_DATA: + downloadFiles.addAll(downloader.downloadRevelScores()); break; - case EtlCommons.REGULATION_DATA: + case REGULATION_DATA: downloadFiles.addAll(downloader.downloadRegulation()); break; - case EtlCommons.PROTEIN_DATA: + case PROTEIN_DATA: downloadFiles.addAll(downloader.downloadProtein()); break; - case EtlCommons.CONSERVATION_DATA: + case CONSERVATION_DATA: downloadFiles.addAll(downloader.downloadConservation()); break; - case EtlCommons.CLINICAL_VARIANTS_DATA: + case CLINICAL_VARIANT_DATA: downloadFiles.addAll(downloader.downloadClinicalVariants()); break; -// case EtlCommons.STRUCTURAL_VARIANTS_DATA: -// downloadFiles.add(downloadManager.downloadStructuralVariants()); -// break; - case EtlCommons.REPEATS_DATA: + case REPEATS_DATA: downloadFiles.addAll(downloader.downloadRepeats()); break; - case EtlCommons.OBO_DATA: + case ONTOLOGY_DATA: downloadFiles.addAll(downloader.downloadOntologies()); break; - case EtlCommons.PUBMED_DATA: + case 
PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; - case EtlCommons.PHARMACOGENOMICS_DATA: + case PHARMACOGENOMICS_DATA: downloadFiles.addAll(downloader.downloadPharmKGB()); break; - default: - System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" - + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," - + " regulation, protein, conservation, clinical_variants, ontology, pubmed}"); + case ALPHAMISSENSE_DATA: + downloadFiles.addAll(downloader.downloadAlphaMissense()); + break; + case PGS_DATA: + downloadFiles.addAll(downloader.downloadPolygenicScores()); break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } } AbstractDownloadManager.writeDownloadLogFile(outputDirectory, downloadFiles); - } catch (ParameterException | IOException | CellBaseException | InterruptedException | NoSuchMethodException - | FileFormatException e) { - logger.error("Error in 'download' command line: " + e.getMessage()); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); + } catch (Exception e) { + e.printStackTrace(); + throw new CellBaseException("Error executing command line 'download': " + e.getMessage(), e); } } - private List<String> getDataList(String species) throws CellBaseException { - if (StringUtils.isEmpty(downloadCommandOptions.data) || downloadCommandOptions.data.equals("all")) { - return SpeciesUtils.getSpeciesConfiguration(configuration, species).getData(); - } else { - return Arrays.asList(downloadCommandOptions.data.split(",")); + private List<String> checkDataSources() { + if (StringUtils.isEmpty(downloadCommandOptions.data)) { + throw new IllegalArgumentException("Missing data parameter. Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); } - } - - @Deprecated - private List<String> getDataList(SpeciesConfiguration sp) { - List<String> dataList; - if (downloadCommandOptions.data.equals("all")) { - dataList = sp.getData(); - } else { - dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + List<String> dataList = Arrays.asList(downloadCommandOptions.data.split(",")); + for (String data : dataList) { + switch (data) { + case GENOME_DATA: + case GENE_DATA: + case VARIATION_FUNCTIONAL_SCORE_DATA: + case MISSENSE_VARIATION_SCORE_DATA: + case REVEL_DATA: + case REGULATION_DATA: + case PROTEIN_DATA: + case CONSERVATION_DATA: + case CLINICAL_VARIANT_DATA: + case REPEATS_DATA: + case ONTOLOGY_DATA: + case PUBMED_DATA: + case PHARMACOGENOMICS_DATA: + case ALPHAMISSENSE_DATA: + case PGS_DATA: + break; + default: + throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter. 
Valid values are: " + + StringUtils.join(VALID_SOURCES_TO_DOWNLOAD, ",") + "; or use 'all' to download everything"); + } } return dataList; } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 72f992f344..791cc599ef 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -82,11 +82,9 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo this.assembly = splits[2]; if (exportCommandOptions.data.equals("all")) { - this.dataToExport = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, - EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, - EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; + this.dataToExport = new String[]{GENOME_DATA, GENE_DATA, REFSEQ_DATA, CONSERVATION_DATA, REGULATION_DATA, PROTEIN_DATA, + PROTEIN_SUBSTITUTION_PREDICTION_DATA, VARIATION_DATA, VARIATION_FUNCTIONAL_SCORE_DATA, CLINICAL_VARIANT_DATA, + REPEATS_DATA, ONTOLOGY_DATA, SPLICE_SCORE_DATA, PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -200,38 +198,6 @@ public void execute() throws CellBaseException { counterMsg = counter + " CADD items"; break; } - case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output); - ProteinManager proteinManager = managerFactory.getProteinManager(species, assembly); - Map> positionMap = new HashMap<>(); - for (Variant variant : variants) { - if (!positionMap.containsKey(variant.getChromosome())) { - positionMap.put(variant.getChromosome(), new ArrayList<>()); - } - positionMap.get(variant.getChromosome()).add(variant.getStart()); - if (positionMap.get(variant.getChromosome()).size() >= 200) { - CellBaseDataResult results = proteinManager - .getMissenseVariantFunctionalScores(variant.getChromosome(), - positionMap.get(variant.getChromosome()), null, dataRelease); - counter += writeExportedData(results.getResults(), "missense_variation_functional_score", serializer); - positionMap.put(variant.getChromosome(), new ArrayList<>()); - } - } - - // Process map - for (Map.Entry> entry : positionMap.entrySet()) { - if (CollectionUtils.isEmpty(entry.getValue())) { - continue; - } - CellBaseDataResult results = proteinManager - .getMissenseVariantFunctionalScores(entry.getKey(), entry.getValue(), null, dataRelease); - counter += writeExportedData(results.getResults(), "missense_variation_functional_score", serializer); - } - serializer.close(); - - counterMsg = counter + " missense variation functional scores"; - break; - } case EtlCommons.CONSERVATION_DATA: { // Export data CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output); @@ -271,7 +237,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " proteins"; break; } - case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { + case EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA: { ProteinManager proteinManager = 
managerFactory.getProteinManager(species, assembly); Map> transcriptsMap = new HashMap<>(); for (Gene gene : genes) { @@ -290,10 +256,10 @@ public void execute() throws CellBaseException { } serializer.close(); - counterMsg = counter + " protein functional predictions"; + counterMsg = counter + " protein substitution predictions"; break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { counter = exportClinicalVariantData(regions); counterMsg = counter + " clinical variants"; break; @@ -309,7 +275,7 @@ public void execute() throws CellBaseException { counterMsg = counter + " repeats"; break; } - case OBO_DATA: { + case ONTOLOGY_DATA: { counter = exportOntologyData(); counterMsg = counter + " ontology items"; break; @@ -424,7 +390,7 @@ private String exportPharmacogenomicsData(List genes) private int exportClinicalVariantData(List regions) throws CellBaseException, QueryException, IllegalAccessException, IOException { - String baseFilename = CLINICAL_VARIANTS_DATA + ".full"; + String baseFilename = CLINICAL_VARIANT_DATA + ".full"; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, baseFilename); ClinicalManager clinicalManager = managerFactory.getClinicalManager(species, assembly); ClinicalVariantQuery query = new ClinicalVariantQuery(); @@ -449,7 +415,7 @@ private int exportClinicalVariantData(List regions) throws CellBaseExcep private int exportOntologyData() throws CellBaseException, IOException { int counter = 0; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, OBO_DATA); + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output, ONTOLOGY_DATA); OntologyManager ontologyManager = managerFactory.getOntologyManager(species, assembly); CellBaseIterator iterator = ontologyManager.iterator(new OntologyQuery()); while (iterator.hasNext()) { diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 5a8fd9417b..480e7ef09d 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -24,6 +24,7 @@ import org.opencb.cellbase.core.models.DataRelease; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.lib.builders.PolygenicScoreBuilder; import org.opencb.cellbase.lib.impl.core.CellBaseDBAdaptor; import org.opencb.cellbase.lib.indexer.IndexManager; import org.opencb.cellbase.lib.loader.LoadRunner; @@ -44,6 +45,8 @@ import java.util.List; import java.util.concurrent.ExecutionException; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by imedina on 03/02/15. 
*/ @@ -79,9 +82,9 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO loadOptions = new String[]{EtlCommons.GENOME_DATA, EtlCommons.GENE_DATA, EtlCommons.REFSEQ_DATA, EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, - EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, - EtlCommons.PHARMACOGENOMICS_DATA}; + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANT_DATA, EtlCommons.REPEATS_DATA, + EtlCommons.ONTOLOGY_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, + EtlCommons.PUBMED_DATA, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PGS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -198,16 +201,21 @@ public void execute() throws CellBaseException { } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { // Load data - loadIfExists(input.resolve("missense_variation_functional_score.json.gz"), - "missense_variation_functional_score"); + Path path = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + loadIfExists(path.resolve(EtlCommons.MISSENSE_VARIATION_SCORE_JSON_FILENAME), + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Create index - createIndex("missense_variation_functional_score"); + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Update release (collection and sources) - List sources = new ArrayList<>(Collections.singletonList(input.resolve("revelVersion.json"))); - dataReleaseManager.update(dataRelease, "missense_variation_functional_score", - EtlCommons.MISSENSE_VARIATION_SCORE_DATA, sources); + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, + REVEL_DATA, Collections.singletonList(path.resolve(EtlCommons.REVEL_VERSION_FILENAME))); + break; + } + case EtlCommons.ALPHAMISSENSE_DATA: { + // Load data, create index and update release + loadAlphaMissense(); break; } case EtlCommons.CONSERVATION_DATA: { @@ -255,7 +263,7 @@ public void execute() throws CellBaseException { loadProteinFunctionalPrediction(); break; } - case EtlCommons.CLINICAL_VARIANTS_DATA: { + case EtlCommons.CLINICAL_VARIANT_DATA: { // Load data, create index and update release loadClinical(); break; @@ -268,7 +276,7 @@ public void execute() throws CellBaseException { // case EtlCommons.STRUCTURAL_VARIANTS_DATA: // loadStructuralVariants(); // break; - case EtlCommons.OBO_DATA: { + case EtlCommons.ONTOLOGY_DATA: { // Load data loadIfExists(input.resolve("ontology.json.gz"), "ontology"); @@ -281,7 +289,7 @@ public void execute() throws CellBaseException { input.resolve(EtlCommons.GO_VERSION_FILE), input.resolve(EtlCommons.DO_VERSION_FILE) )); - dataReleaseManager.update(dataRelease, "ontology", EtlCommons.OBO_DATA, sources); + dataReleaseManager.update(dataRelease, "ontology", EtlCommons.ONTOLOGY_DATA, sources); break; } case EtlCommons.SPLICE_SCORE_DATA: { @@ -289,7 +297,7 @@ public void execute() throws CellBaseException { loadSpliceScores(); break; } - case EtlCommons.PUBMED_DATA: { + case PUBMED_DATA: { // Load data, create index and update release loadPubMed(); break; @@ -299,6 +307,11 @@ public void execute() throws CellBaseException { loadPharmacogenomica(); break; } + case EtlCommons.PGS_DATA: { + // Load data, create index and update release + loadPolygenicScores(); + 
break; + } default: logger.warn("Not valid 'data'. We should not reach this point"); break; @@ -427,19 +440,52 @@ private void loadProteinFunctionalPrediction() throws NoSuchMethodException, Int InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { // Load data - DirectoryStream stream = Files.newDirectoryStream(input, + Path path = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + DirectoryStream stream = Files.newDirectoryStream(path, entry -> entry.getFileName().toString().startsWith("prot_func_pred_")); for (Path entry : stream) { logger.info("Loading file '{}'", entry); - loadRunner.load(input.resolve(entry.getFileName()), "protein_functional_prediction", dataRelease); + loadRunner.load(path.resolve(entry.getFileName()), EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, dataRelease); } // Create index - createIndex("protein_functional_prediction"); + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); // Update release (collection and sources) - dataReleaseManager.update(dataRelease, "protein_functional_prediction", null, null); + String sourceName = null; + List sourceUrls = new ArrayList<>(); + if (path.resolve(EtlCommons.SIFT_VERSION_FILENAME).toFile().exists()) { + sourceUrls.add(path.resolve(EtlCommons.SIFT_VERSION_FILENAME)); + sourceName = EtlCommons.SIFT_SOURCE_NAME; + } + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sourceName, sourceUrls); + + sourceUrls = new ArrayList<>(); + if (path.resolve(EtlCommons.POLYPHEN_VERSION_FILENAME).toFile().exists()) { + sourceUrls.add(path.resolve(EtlCommons.POLYPHEN_VERSION_FILENAME)); + sourceName = EtlCommons.POLYPHEN_SOURCE_NAME; + } + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sourceName, sourceUrls); + } + + private void loadAlphaMissense() throws NoSuchMethodException, InterruptedException, ExecutionException, + InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException, + IOException, CellBaseException, LoaderException { + Path proteinSubstitutionPath = input.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + + // Load data + Path alphamissensePath = proteinSubstitutionPath.resolve(EtlCommons.ALPHAMISSENSE_JSON_FILENAME); + logger.info("Loading file '{}'", alphamissensePath); + loadRunner.load(alphamissensePath, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, dataRelease); + + // Create index + createIndex(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA); + + // Update release (collection and sources) + List sources = Collections.singletonList(proteinSubstitutionPath.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME)); + dataReleaseManager.update(dataRelease, EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, + EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA, sources); } private void loadClinical() throws FileNotFoundException { @@ -459,7 +505,7 @@ private void loadClinical() throws FileNotFoundException { input.resolve("cosmicVersion.json"), input.resolve("gwasVersion.json") )); - dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANTS_DATA, sources); + dataReleaseManager.update(dataRelease, "clinical_variants", EtlCommons.CLINICAL_VARIANT_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | 
CellBaseException e) { logger.error(e.toString()); @@ -484,9 +530,9 @@ private void loadRepeats() { // Update release (collection and sources) List sources = new ArrayList<>(Arrays.asList( - input.resolve(EtlCommons.TRF_VERSION_FILE), - input.resolve(EtlCommons.GSD_VERSION_FILE), - input.resolve(EtlCommons.WM_VERSION_FILE) + input.resolve(getDataVersionFilename(TRF_DATA)), + input.resolve(getDataVersionFilename(GSD_DATA)), + input.resolve(getDataVersionFilename(WM_DATA)) )); dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException @@ -536,7 +582,7 @@ private void loadSpliceScores(Path spliceFolder) throws IOException, ExecutionEx } private void loadPubMed() throws CellBaseException { - Path pubmedPath = input.resolve(EtlCommons.PUBMED_DATA); + Path pubmedPath = input.resolve(PUBMED_DATA); if (Files.exists(pubmedPath)) { // Load data @@ -544,7 +590,7 @@ private void loadPubMed() throws CellBaseException { if (file.isFile() && (file.getName().endsWith("gz"))) { logger.info("Loading file '{}'", file.getName()); try { - loadRunner.load(file.toPath(), EtlCommons.PUBMED_DATA, dataRelease); + loadRunner.load(file.toPath(), PUBMED_DATA, dataRelease); } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException | IllegalAccessException | ExecutionException | IOException | InterruptedException | LoaderException e) { logger.error("Error loading file '{}': {}", file.getName(), e.toString()); @@ -552,11 +598,11 @@ private void loadPubMed() throws CellBaseException { } } // Create index - createIndex(EtlCommons.PUBMED_DATA); + createIndex(PUBMED_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources); + List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.getDataVersionFilename(PUBMED_DATA))); + dataReleaseManager.update(dataRelease, PUBMED_DATA, PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } @@ -585,10 +631,53 @@ private void loadPharmacogenomica() throws IOException, CellBaseException { createIndex(EtlCommons.PHARMACOGENOMICS_DATA); // Update release (collection and sources) - List sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME)); + List sources = Collections.singletonList(pharmaPath.resolve(getDataVersionFilename(PHARMGKB_DATA))); dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); } + private void loadPolygenicScores() throws NoSuchMethodException, InterruptedException, ExecutionException, InstantiationException, + IllegalAccessException, InvocationTargetException, ClassNotFoundException, IOException, CellBaseException, LoaderException { + Path pgsPath = input.resolve(EtlCommons.PGS_DATA); + + if (!Files.exists(pgsPath)) { + logger.warn("Polygenic scores (PGS) folder {} not found to load", pgsPath); + return; + } + + // Load common polygenic scores data + Path pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.COMMON_POLYGENIC_SCORE_FILENAME); + logger.info("Loading file '{}'", pathToLoad.toFile().getName()); + try { + loadRunner.load(pathToLoad, EtlCommons.PGS_COMMON_COLLECTION, dataRelease); + } catch (ClassNotFoundException | NoSuchMethodException | 
InstantiationException | InvocationTargetException
+                | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException
+                | LoaderException e) {
+            logger.error("Error loading file '{}': {}", pathToLoad.toFile().getName(), e.toString());
+        }
+
+        // Load variant polygenic scores data
+        pathToLoad = pgsPath.resolve(PolygenicScoreBuilder.VARIANT_POLYGENIC_SCORE_FILENAME);
+        logger.info("Loading file '{}'", pathToLoad.toFile().getName());
+        try {
+            loadRunner.load(pathToLoad, EtlCommons.PGS_VARIANT_COLLECTION, dataRelease);
+        } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException
+                | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException
+                | LoaderException e) {
+            logger.error("Error loading file '{}': {}", pathToLoad.toFile().getName(), e.toString());
+        }
+
+        // Create index
+        createIndex(EtlCommons.PGS_COMMON_COLLECTION);
+        createIndex(EtlCommons.PGS_VARIANT_COLLECTION);
+
+        // Update release (collection and sources)
+        List<Path> sources = new ArrayList<>(Arrays.asList(
+                input.resolve(EtlCommons.PGS_DATA + "/" + EtlCommons.PGS_CATALOG_VERSION_FILENAME)
+        ));
+        dataReleaseManager.update(dataRelease, EtlCommons.PGS_VARIANT_COLLECTION, EtlCommons.PGS_DATA, sources);
+        dataReleaseManager.update(dataRelease, EtlCommons.PGS_COMMON_COLLECTION, null, null);
+    }
+
     private void createIndex(String collection) {
         if (!createIndexes) {
             return;
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java
new file mode 100644
index 0000000000..106b01e1fe
--- /dev/null
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PolygenicScoreQuery.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
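The new loadPolygenicScores() above follows the same three-step contract as the other loaders: load each JSON file into its collection, create the indexes, then register the source in the data release. A condensed sketch of that contract, with the collaborators reduced to functional interfaces so it stands alone (the concrete filenames live in PolygenicScoreBuilder constants not shown in this hunk, so the names below are placeholders):

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.List;

public final class PgsLoadFlow {
    interface Loader { void load(Path file, String collection) throws Exception; }
    interface Indexer { void createIndex(String collection); }
    interface ReleaseUpdater { void update(String collection, String data, List<Path> sources); }

    public static void load(Path pgsPath, Loader loader, Indexer indexer, ReleaseUpdater release) throws Exception {
        if (!Files.exists(pgsPath)) {
            return; // mirrors the early return when the PGS folder is missing
        }
        // 1. Load both JSON files, each into its own collection
        loader.load(pgsPath.resolve("common_polygenic_scores.json.gz"), "common_polygenic_scores");
        loader.load(pgsPath.resolve("variant_polygenic_scores.json.gz"), "variant_polygenic_scores");
        // 2. Create the indexes for both collections
        indexer.createIndex("common_polygenic_scores");
        indexer.createIndex("variant_polygenic_scores");
        // 3. Register the PGS Catalog version file as the source of this data
        release.update("variant_polygenic_scores", "polygenic_score",
                Collections.singletonList(pgsPath.resolve("pgsCatalogVersion.json")));
    }
}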
+ */
+
+package org.opencb.cellbase.core.api;
+
+import org.opencb.cellbase.core.api.query.AbstractQuery;
+import org.opencb.cellbase.core.api.query.QueryException;
+import org.opencb.cellbase.core.api.query.QueryParameter;
+
+import java.util.List;
+import java.util.Map;
+
+public class PolygenicScoreQuery extends AbstractQuery {
+
+    @QueryParameter(id = "id")
+    private List<String> ids;
+
+    @QueryParameter(id = "name")
+    private List<String> names;
+
+    @QueryParameter(id = "source", allowedValues = {"PGS Catalog"})
+    private List<String> sources;
+
+    public PolygenicScoreQuery() {
+    }
+
+    public PolygenicScoreQuery(Map<String, String> params) throws QueryException {
+        super(params);
+
+        objectMapper.readerForUpdating(this);
+        objectMapper.readerFor(PolygenicScoreQuery.class);
+        objectWriter = objectMapper.writerFor(PolygenicScoreQuery.class);
+    }
+
+    @Override
+    protected void validateQuery() throws QueryException {
+        // Nothing to do
+        return;
+    }
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("PolygenicScoreQuery{");
+        sb.append("ids=").append(ids);
+        sb.append(", names=").append(names);
+        sb.append(", sources=").append(sources);
+        sb.append(", limit=").append(limit);
+        sb.append(", skip=").append(skip);
+        sb.append(", count=").append(count);
+        sb.append(", sort='").append(sort).append('\'');
+        sb.append(", order=").append(order);
+        sb.append(", facet='").append(facet).append('\'');
+        sb.append(", includes=").append(includes);
+        sb.append(", excludes=").append(excludes);
+        sb.append('}');
+        return sb.toString();
+    }
+
+    public List<String> getIds() {
+        return ids;
+    }
+
+    public PolygenicScoreQuery setIds(List<String> ids) {
+        this.ids = ids;
+        return this;
+    }
+
+    public List<String> getNames() {
+        return names;
+    }
+
+    public PolygenicScoreQuery setNames(List<String> names) {
+        this.names = names;
+        return this;
+    }
+
+    public List<String> getSources() {
+        return sources;
+    }
+
+    public PolygenicScoreQuery setSources(List<String> sources) {
+        this.sources = sources;
+        return this;
+    }
+}
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
index 19f1606c91..8f2d714d26 100644
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
@@ -16,7 +16,7 @@
 
 package org.opencb.cellbase.core.config;
 
-import java.util.List;
+import java.util.Map;
 
 /**
  * Created by imedina on 19/08/16.
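A quick usage sketch for the PolygenicScoreQuery class added above; the REST parameter names come from its @QueryParameter annotations, and the two IDs are illustrative values in the PGS Catalog format:

import java.util.Arrays;

public class PolygenicScoreQueryExample {
    public static void main(String[] args) {
        // Query two polygenic scores by ID, restricted to the "PGS Catalog" source
        PolygenicScoreQuery query = new PolygenicScoreQuery()
                .setIds(Arrays.asList("PGS000001", "PGS000002"))
                .setSources(Arrays.asList("PGS Catalog"));
        System.out.println(query); // uses the toString() defined above
    }
}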
@@ -28,16 +28,11 @@ public class DownloadProperties { private URLProperties hgnc; private URLProperties cancerHotspot; private URLProperties refSeq; - private URLProperties refSeqFasta; - private URLProperties refSeqProteinFasta; - private URLProperties refSeqCdna; private URLProperties maneSelect; private URLProperties lrg; - private URLProperties geneUniprotXref; private URLProperties geneExpressionAtlas; private URLProperties mirbase; - private URLProperties mirbaseReadme; private URLProperties targetScan; private URLProperties miRTarBase; private URLProperties uniprot; @@ -45,16 +40,12 @@ public class DownloadProperties { private URLProperties intact; private URLProperties interpro; private URLProperties interproRelNotes; - private URLProperties conservation; + private URLProperties phastCons; + private URLProperties phylop; private URLProperties gerp; private URLProperties clinvar; - private URLProperties clinvarVariation; - private URLProperties clinvarSummary; - private URLProperties clinvarVariationAllele; - private URLProperties clinvarEfoTerms; - private URLProperties iarctp53; - private URLProperties docm; - private URLProperties docmVersion; + private URLProperties cosmic; + private URLProperties hgmd; private URLProperties dgv; private URLProperties simpleRepeats; private URLProperties windowMasker; @@ -76,6 +67,8 @@ public class DownloadProperties { private URLProperties revel; private URLProperties pubmed; private URLProperties pharmGKB; + private URLProperties alphaMissense; + private URLProperties pgs; public EnsemblProperties getEnsembl() { return ensembl; @@ -131,15 +124,6 @@ public DownloadProperties setMirbase(URLProperties mirbase) { return this; } - public URLProperties getMirbaseReadme() { - return mirbaseReadme; - } - - public DownloadProperties setMirbaseReadme(URLProperties mirbaseReadme) { - this.mirbaseReadme = mirbaseReadme; - return this; - } - public URLProperties getTargetScan() { return targetScan; } @@ -203,12 +187,21 @@ public DownloadProperties setInterproRelNotes(URLProperties interproRelNotes) { return this; } - public URLProperties getConservation() { - return conservation; + public URLProperties getPhastCons() { + return phastCons; + } + + public DownloadProperties setPhastCons(URLProperties phastCons) { + this.phastCons = phastCons; + return this; + } + + public URLProperties getPhylop() { + return phylop; } - public DownloadProperties setConservation(URLProperties conservation) { - this.conservation = conservation; + public DownloadProperties setPhylop(URLProperties phylop) { + this.phylop = phylop; return this; } @@ -230,65 +223,24 @@ public DownloadProperties setClinvar(URLProperties clinvar) { return this; } - public URLProperties getClinvarVariation() { - return clinvarVariation; + public URLProperties getCosmic() { + return cosmic; } - public DownloadProperties setClinvarVariation(URLProperties clinvarVariation) { - this.clinvarVariation = clinvarVariation; + public DownloadProperties setCosmic(URLProperties cosmic) { + this.cosmic = cosmic; return this; } - public URLProperties getClinvarSummary() { - return clinvarSummary; + public URLProperties getHgmd() { + return hgmd; } - public DownloadProperties setClinvarSummary(URLProperties clinvarSummary) { - this.clinvarSummary = clinvarSummary; + public DownloadProperties setHgmd(URLProperties hgmd) { + this.hgmd = hgmd; return this; } - public URLProperties getClinvarVariationAllele() { - return clinvarVariationAllele; - } - - public void setClinvarVariationAllele(URLProperties 
clinvarVariationAllele) { - this.clinvarVariationAllele = clinvarVariationAllele; - } - - public URLProperties getClinvarEfoTerms() { - return clinvarEfoTerms; - } - - public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) { - this.clinvarEfoTerms = clinvarEfoTerms; - return this; - } - - public URLProperties getIarctp53() { - return iarctp53; - } - - public void setIarctp53(URLProperties iarctp53) { - this.iarctp53 = iarctp53; - } - - public URLProperties getDocm() { - return docm; - } - - public void setDocm(URLProperties docm) { - this.docm = docm; - } - - public URLProperties getDocmVersion() { - return docmVersion; - } - - public void setDocmVersion(URLProperties docmVersion) { - this.docmVersion = docmVersion; - } - public URLProperties getDgv() { return dgv; } @@ -447,19 +399,6 @@ public DownloadProperties setRefSeq(URLProperties refSeq) { return this; } - public URLProperties getRefSeqFasta() { - return refSeqFasta; - } - - public DownloadProperties setRefSeqFasta(URLProperties refSeqFasta) { - this.refSeqFasta = refSeqFasta; - return this; - } - - public URLProperties getRefSeqProteinFasta() { - return refSeqProteinFasta; - } - public URLProperties getRevel() { return revel; } @@ -487,17 +426,21 @@ public DownloadProperties setPharmGKB(URLProperties pharmGKB) { return this; } - public DownloadProperties setRefSeqProteinFasta(URLProperties refSeqProteinFasta) { - this.refSeqProteinFasta = refSeqProteinFasta; + public URLProperties getAlphaMissense() { + return alphaMissense; + } + + public DownloadProperties setAlphaMissense(URLProperties alphaMissense) { + this.alphaMissense = alphaMissense; return this; } - public URLProperties getRefSeqCdna() { - return refSeqCdna; + public URLProperties getPgs() { + return pgs; } - public DownloadProperties setRefSeqCdna(URLProperties refSeqCdna) { - this.refSeqCdna = refSeqCdna; + public DownloadProperties setPgs(URLProperties pgs) { + this.pgs = pgs; return this; } @@ -572,7 +515,7 @@ public static class URLProperties { private String host; private String version; - private List files; + private Map files; public String getHost() { return host; @@ -591,14 +534,13 @@ public URLProperties setVersion(String version) { return this; } - public List getFiles() { + public Map getFiles() { return files; } - public URLProperties setFiles(List files) { + public URLProperties setFiles(Map files) { this.files = files; return this; } - } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java index 884c63f2ae..422a52b0d4 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java @@ -22,5 +22,8 @@ public CellBaseException(String msg) { super(msg); } + public CellBaseException(String msg, Throwable e) { + super(msg, e); + } } diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java index 5674839aa8..47a694c5d8 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataRelease.java @@ -21,14 +21,9 @@ public class DataRelease { private int release; private String date; - /** - * @deprecated it is maintained to back-compatibility with previous CellBase versions to v5.5 - 
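With URLProperties.files now a Map rather than a List, downloaders can look entries up by a stable file ID (the *_FILE_ID constants added to EtlCommons further below) instead of by position. A minimal sketch, assuming only the getHost()/getFiles() accessors defined above; the helper class is hypothetical:

public final class DownloadFileResolver {

    // Resolve the full URL for one file of a source, e.g. the ClinVar summary file.
    // The fileId ("SUMMARY", "GWAS", ...) must match a key under 'files' in configuration.yml.
    // Some entries are already absolute URLs (e.g. EFO_TERMS, PGS_METADATA), so pass those through.
    public static String resolveUrl(DownloadProperties.URLProperties props, String fileId) {
        String file = props.getFiles().get(fileId);
        if (file == null) {
            throw new IllegalArgumentException("File ID not found in configuration: " + fileId);
        }
        if (file.startsWith("http://") || file.startsWith("https://") || file.startsWith("ftp://")) {
            return file;
        }
        return props.getHost() + file;
    }
}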
*/
-    @Deprecated
-    private boolean active;
     private List<String> activeByDefaultIn;
     private Map<String, String> collections;
-    private List<DataReleaseSource> sources;
+    private List<DataSource> sources;
 
     public DataRelease() {
         this.activeByDefaultIn = Collections.emptyList();
@@ -37,7 +32,7 @@ public DataRelease() {
     }
 
     public DataRelease(int release, String date, List<String> activeByDefaultIn, Map<String, String> collections,
-                       List<DataReleaseSource> sources) {
+                       List<DataSource> sources) {
         this.release = release;
         this.date = date;
         this.activeByDefaultIn = activeByDefaultIn;
@@ -75,15 +70,6 @@ public DataRelease setDate(String date) {
         return this;
     }
 
-    public boolean isActive() {
-        return active;
-    }
-
-    public DataRelease setActive(boolean active) {
-        this.active = active;
-        return this;
-    }
-
     public List<String> getActiveByDefaultIn() {
         return activeByDefaultIn;
     }
@@ -102,11 +88,11 @@ public DataRelease setCollections(Map<String, String> collections) {
         return this;
     }
 
-    public List<DataReleaseSource> getSources() {
+    public List<DataSource> getSources() {
         return sources;
     }
 
-    public DataRelease setSources(List<DataReleaseSource> sources) {
+    public DataRelease setSources(List<DataSource> sources) {
         this.sources = sources;
         return this;
    }
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java
deleted file mode 100644
index 3a42de9374..0000000000
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataReleaseSource.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright 2015-2020 OpenCB
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package org.opencb.cellbase.core.models; - -import java.util.List; -import java.util.Objects; - -public class DataReleaseSource { - private String name; - private String version; - private String data; - private String date; - private List url; - - public DataReleaseSource() { - } - - public DataReleaseSource(String name, String version, String data, String date, List url) { - this.name = name; - this.version = version; - this.data = data; - this.date = date; - this.url = url; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("DataReleaseSource{"); - sb.append("name='").append(name).append('\''); - sb.append(", version='").append(version).append('\''); - sb.append(", data='").append(data).append('\''); - sb.append(", date='").append(date).append('\''); - sb.append(", url=").append(url); - sb.append('}'); - return sb.toString(); - } - - public String getName() { - return name; - } - - public DataReleaseSource setName(String name) { - this.name = name; - return this; - } - - public String getVersion() { - return version; - } - - public DataReleaseSource setVersion(String version) { - this.version = version; - return this; - } - - public String getData() { - return data; - } - - public DataReleaseSource setData(String data) { - this.data = data; - return this; - } - - public String getDate() { - return date; - } - - public DataReleaseSource setDate(String date) { - this.date = date; - return this; - } - - public List getUrl() { - return url; - } - - public DataReleaseSource setUrl(List url) { - this.url = url; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - DataReleaseSource that = (DataReleaseSource) o; - return Objects.equals(name, that.name) - && Objects.equals(version, that.version) - && Objects.equals(data, that.data) - && Objects.equals(date, that.date) - && Objects.equals(url, that.url); - } - - @Override - public int hashCode() { - return Objects.hash(name, version, data, date, url); - } -} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java new file mode 100644 index 0000000000..f716412a03 --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/models/DataSource.java @@ -0,0 +1,98 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
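The DataSource model that replaces DataReleaseSource is defined just below. As a sketch, the version JSON files referenced throughout the loaders (siftVersion.json, pgsCatalogVersion.json, and so on) could be deserialized straight into it with Jackson, assuming the JSON field names line up with the new model, which is not guaranteed for files written by older CellBase versions:

import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.File;
import java.io.IOException;

public class DataSourceReader {
    public static DataSource read(File versionFile) throws IOException {
        // Version files are small JSON documents; map them onto the new model
        return new ObjectMapper().readValue(versionFile, DataSource.class);
    }
}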
+ */ + +package org.opencb.cellbase.core.models; + +import java.util.ArrayList; +import java.util.List; + +public class DataSource { + + private String name; + private String category; + private String version; + private String downloadDate; + private List urls; + + public DataSource() { + this.urls = new ArrayList<>(); + } + + public DataSource(String name, String category, String version, String downloadDate, List urls) { + this.name = name; + this.category = category; + this.version = version; + this.downloadDate = downloadDate; + this.urls = urls; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("DataSourceDescr{"); + sb.append("name='").append(name).append('\''); + sb.append(", category='").append(category).append('\''); + sb.append(", version='").append(version).append('\''); + sb.append(", downloadDate='").append(downloadDate).append('\''); + sb.append(", urls=").append(urls); + sb.append('}'); + return sb.toString(); + } + + public String getName() { + return name; + } + + public DataSource setName(String name) { + this.name = name; + return this; + } + + public String getCategory() { + return category; + } + + public DataSource setCategory(String category) { + this.category = category; + return this; + } + + public String getVersion() { + return version; + } + + public DataSource setVersion(String version) { + this.version = version; + return this; + } + + public String getDownloadDate() { + return downloadDate; + } + + public DataSource setDownloadDate(String downloadedDate) { + this.downloadDate = downloadedDate; + return this; + } + + public List getUrls() { + return urls; + } + + public DataSource setUrls(List urls) { + this.urls = urls; + return this; + } +} diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index f24827532c..7b5e60b91c 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -45,6 +45,7 @@ server: port: "${CELLBASE.SERVER.REST.PORT}" defaultOutdir: "/tmp" download: + ## Genomic and Gene information ensembl: database: host: ensembldb.ensembl.org:3306 @@ -52,7 +53,16 @@ download: password: '' libs: "${CELLBASE.ENSEMBL.LIBS}" url: - host: ftp://ftp.ensembl.org/pub + host: https://ftp.ensembl.org/pub/ + files: + # New Homo sapiens assemblies contain too many ALT regions, so we download 'primary_assembly' file instead + PRIMARY_FA: "release-put_release_here/fasta/put_species_here/dna/put_capital_species_here.put_assembly_here.dna.primary_assembly.fa.gz" + GTF: "release-put_release_here/gtf/put_species_here/put_capital_species_here.put_assembly_here.put_release_here.gtf.gz" + PEP_FA: "release-put_release_here/fasta/put_species_here/pep/put_capital_species_here.put_assembly_here.pep.all.fa.gz" + CDNA_FA: "release-put_release_here/fasta/put_species_here/cdna/put_capital_species_here.put_assembly_here.cdna.all.fa.gz" + REGULATORY_BUILD: "release-put_release_here/regulation/put_species_here/put_species_here.put_assembly_here.Regulatory_Build.regulatory_features.20221007.gff.gz" + MOTIF_FEATURES: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz" + MOTIF_FEATURES_INDEX: "release-put_release_here/regulation/put_species_here/MotifFeatures/put_species_here.put_assembly_here.motif_features.gff.gz.tbi" ensemblGenomes: database: host: mysql-eg-publicsql.ebi.ac.uk:4157 @@ -61,164 +71,239 @@ download: libs: 
"${CELLBASE.ENSEMBL.LIBS}" url: host: ftp://ftp.ensemblgenomes.org/pub - hgnc: - host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: 2023-11-01 - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" refSeq: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz - refSeqFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz - refSeqProteinFasta: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz - refSeqCdna: - host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz + host: https://ftp.ncbi.nih.gov/refseq/ + version: "2023-10-11" + files: + GENOMIC_GTF: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz + GENOMIC_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz + PROTEIN_FAA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_protein.faa.gz + RNA_FNA: H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz - version: "1.1" + host: https://ftp.ncbi.nlm.nih.gov/refseq/ + version: "1.2" + files: + MANE_SELECT: MANE/MANE_human/release_1.2/MANE.GRCh38.v1.2.summary.txt.gz lrg: - host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + host: http://ftp.ebi.ac.uk/ version: "2021-03-30" + files: + LRG: pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt + hgnc: + host: https://ftp.ebi.ac.uk/ + version: "2024-04-01" + files: + HGNC: pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2024-04-01.txt + cancerHotspot: + host: https://www.cancerhotspots.org/ + version: "v2" + files: + CANCER_HOTSPOT: files/hotspots_v2.xls + dgidb: + host: https://old.dgidb.org/ + version: "2022-02-01" + files: + DGIDB: data/monthly_tsvs/2022-Feb/interactions.tsv geneUniprotXref: - host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ - version: "2023-11-08" + host: http://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT_XREF: pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz geneExpressionAtlas: - host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + host: https://ftp.ebi.ac.uk/ + version: "2.0.14" + files: + GENE_EXPRESSION_ATLAS: pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz + hpo: + ## NOTE: Download manually from here now + version: "2024-04-26" + host: https://hpo.jax.org/app/data/annotations + disgenet: + host: https://www.disgenet.org/ + version: "7.0 (January 2020)" + files: + DISGENET: static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz + gnomadConstraints: + host: https://storage.googleapis.com/ + version: "2.1.1" + files: + GNOMAD_CONSTRAINTS: 
gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz + goAnnotation: + host: http://geneontology.org/ + files: + GO_ANNOTATION: gene-associations/goa_human.gaf.gz + pgs: + host: https://www.pgscatalog.org/ + version: "Dec. 15, 2023" + files: + PGS_METADATA: https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata_scores.csv + + ## Regulation mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + host: https://www.mirbase.org/ + version: "22.1" + files: + MIRBASE: download/miRNA.dat targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx + host: https://mirtarbase.cuhk.edu.cn/ version: "9.0" + files: + # This file contains errors and has to be fixed before building + # check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh + MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx ## Protein Data uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" - uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + host: https://ftp.uniprot.org/ + version: "2024-03-27" + files: + UNIPROT: pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" - interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt + host: https://ftp.ebi.ac.uk/ + version: "2024-03-27" + files: + INTERPRO: pub/databases/interpro/current_release/protein2ipr.dat.gz intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" + host: https://ftp.ebi.ac.uk/ + version: "2024-02-16" + files: + INTACT: pub/databases/intact/current/psimitab/intact.txt ## Conservation Scores - conservation: - host: https://hgdownload.cse.ucsc.edu/goldenPath/ + phastCons: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ + version: "2022-08-30" + files: + PHASTCONS: goldenPath/put_assembly_here/phastCons470way/put_assembly_here.470way.phastCons/chrput_chromosome_here.phastCons470way.wigFix.gz + phylop: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38; and put_chromosome_here by the chromosomes: 1,2,..X,Y,M + host: https://hgdownload.cse.ucsc.edu/ version: "2022-08-30" + files: + PHYLOP: goldenPath/put_assembly_here/phyloP470way/put_assembly_here.470way.phyloP/chrput_chromosome_here.phyloP470way.wigFix.gz gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + host: http://ftp.ensembl.org/ version: "2023-05-17" + files: + GERP: pub/release-111/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw + + ## Clinical Variant clinvar: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz - version: "2023-12-01" - clinvarVariation: -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz -# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz - clinvarSummary: - host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" - clinvarVariationAllele: - host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" - clinvarEfoTerms: - host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv - iarctp53: - host: http://p53.iarc.fr/ajax/Zipper.ashx - docm: - host: http://docm.info/api/ - docmVersion: - host: http://docm.info + host: https://ftp.ncbi.nlm.nih.gov/ + version: "2024-02" + files: + FULL_RELEASE: pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-02.xml.gz + SUMMARY: pub/clinvar/tab_delimited/variant_summary.txt.gz + ALLELE: pub/clinvar/tab_delimited/variation_allele.txt.gz + EFO_TERMS: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + cosmic: + ## To be downloaded manually + host: https://cancer.sanger.ac.uk/cosmic/ + version: "v99" + files: + COSMIC: CosmicMutantExport.tsv.gz + hgmd: + ## To be downloaded manually + host: https://www.hgmd.cf.ac.uk/ + version: "2020-03" + files: + HGMD: hgmd.vcf + gwasCatalog: + ## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e111_r2024-04-22' + host: https://ftp.ebi.ac.uk/ + version: "2024-04-22" + files: + GWAS: pub/databases/gwas/releases/2024/04/22/gwas-catalog-associations_ontology-annotated.tsv + DBSNP: All.vcf.gz + dgv: host: http://dgv.tcag.ca/v106/docs simpleRepeats: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz windowMasker: - host: http://hgdownload.cse.ucsc.edu/goldenPath + host: http://hgdownload.cse.ucsc.edu/ + files: + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. 
hg38 + WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz genomicSuperDups: - host: http://hgdownload.cse.ucsc.edu/goldenPath - gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv - version: "23-12-21" - hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations - host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt - disgenet: - host: https://www.disgenet.org/static/disgenet_ap1/files/downloads + host: http://hgdownload.cse.ucsc.edu/ files: - - all_gene_disease_associations.tsv.gz - - readme.txt - dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" + ## The CellBase downloader will change put_assembly_here by the assembly, e.g. hg38 + GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz + + ## Variant Pathogenic Prediction + revel: + host: https://zenodo.org/ + version: "1.3" + files: + REVEL: record/7072866/files/revel-v1.3_all_chromosomes.zip + alphaMissense: + host: https://github.com/google-deepmind/alphamissense + version: "Aug. 3, 2023" + files: + ALPHAMISSENSE: https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" + host: https://krishna.gs.washington.edu/ + version: "1.7" + files: + CADD: download/CADD/v1.7/GRCh38/whole_genome_SNVs.tsv.gz + reactome: host: http://www.reactome.org/download/current/biopax.zip - gnomadConstraints: - host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" + + ## OBO Ontologies hpoObo: - host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file + files: + HPO: hp.obo goObo: - host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file + files: + GO: go/go-basic.obo doidObo: - host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file + files: + DOID: doid.obo mondoObo: - host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" - goAnnotation: - host: http://geneontology.org/gene-associations/goa_human.gaf.gz - revel: - host: https://zenodo.org/record/7072866/files/revel-v1.3_all_chromosomes.zip + host: http://purl.obolibrary.org/obo/ + ## The version is retrieved from the OBO file + files: + MONDO: mondo.obo + + ## Others pubmed: host: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ + version: 2024 files: - - pubmed22n[1..1114..4].xml.gz + PUBMED_REGEX: pubmed24n[1..1219..4].xml.gz pharmGKB: - host: https://www.pharmgkb.org/downloads + host: https://api.pharmgkb.org/v1/download/file/data/ version: v1 files: - - 
https://api.pharmgkb.org/v1/download/file/data/genes.zip - - https://api.pharmgkb.org/v1/download/file/data/chemicals.zip - - https://api.pharmgkb.org/v1/download/file/data/variants.zip - - https://api.pharmgkb.org/v1/download/file/data/guidelineAnnotations.json.zip - - https://api.pharmgkb.org/v1/download/file/data/variantAnnotations.zip - - https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip - - https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip - - https://api.pharmgkb.org/v1/download/file/data/drugLabels.zip - - https://api.pharmgkb.org/v1/download/file/data/relationships.zip + GENES: genes.zip + CHEMICALS: chemicals.zip + VARIANTS: variants.zip + GUIDELINE_ANNOTATIONS: guidelineAnnotations.json.zip + VARIANT_ANNOTATIONS: variantAnnotations.zip + CLINICAL_ANNOTATIONS: clinicalAnnotations.zip + CLINICAL_VARIANTS: clinicalVariants.zip + DRUG_LABELS: drugLabels.zip + RELATIONSHIPS: relationships.zip species: vertebrates: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '111_38' name: GRCh38 - ensemblVersion: '82_37' name: GRCh37 diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml index 71964bb36e..50fb973a8c 100644 --- a/cellbase-lib/pom.xml +++ b/cellbase-lib/pom.xml @@ -185,6 +185,11 @@ junit-platform-engine test + + org.apache.commons + commons-csv + 1.0 + diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 124ac6e6fc..9abfac8b6b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -16,9 +16,12 @@ package org.opencb.cellbase.lib; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.core.config.Configurator; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.download.DownloadFile; import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,120 +29,485 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; /** * Created by fjlopez on 03/06/16. 
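The configuration above leans on two filename conventions: the put_*_here markers that the downloader substitutes (the PUT_*_HERE_MARK constants defined below), and a bracketed range in the PubMed entry. A sketch of both substitutions; the range semantics are my reading of pubmed24n[1..1219..4].xml.gz, i.e. pubmed24n0001.xml.gz through pubmed24n1219.xml.gz with 4-digit zero padding:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public final class DownloadFilenames {

    // Replace the put_*_here markers with concrete values, e.g. release 111 / hg38
    public static String substitute(String template, String release, String assembly) {
        return template
                .replace("put_release_here", release)
                .replace("put_assembly_here", assembly);
    }

    // Expand e.g. "pubmed24n[1..1219..4].xml.gz" into zero-padded concrete filenames
    public static List<String> expandRange(String template) {
        Matcher m = Pattern.compile("\\[(\\d+)\\.\\.(\\d+)\\.\\.(\\d+)]").matcher(template);
        if (!m.find()) {
            return Collections.singletonList(template);
        }
        int from = Integer.parseInt(m.group(1));
        int to = Integer.parseInt(m.group(2));
        int width = Integer.parseInt(m.group(3));
        List<String> names = new ArrayList<>();
        for (int i = from; i <= to; i++) {
            names.add(template.substring(0, m.start())
                    + String.format("%0" + width + "d", i)
                    + template.substring(m.end()));
        }
        return names;
    }
}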
*/ -public class EtlCommons { +public final class EtlCommons { + // Commons + public static final String XLSX_EXTENSION = ".xlsx"; + public static final String CSV_EXTENSION = ".csv"; + public static final String TBI_EXTENSION = ".tbi"; + public static final String FAI_EXTENSION = ".fai"; + + public static final String OK_LOG_MESSAGE = "Ok."; + + // Ensembl + public static final String ENSEMBL_DATA = "ensembl"; + public static final String PUT_RELEASE_HERE_MARK = "put_release_here"; + public static final String PUT_SPECIES_HERE_MARK = "put_species_here"; + public static final String PUT_CAPITAL_SPECIES_HERE_MARK = "put_capital_species_here"; + public static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here"; + public static final String PUT_CHROMOSOME_HERE_MARK = "put_chromosome_here"; + // Must match the configuration file + public static final String ENSEMBL_PRIMARY_FA_FILE_ID = "PRIMARY_FA"; + public static final String ENSEMBL_GTF_FILE_ID = "GTF"; + public static final String ENSEMBL_PEP_FA_FILE_ID = "PEP_FA"; + public static final String ENSEMBL_CDNA_FA_FILE_ID = "CDNA_FA"; + public static final String ENSEMBL_REGULATORY_BUILD_FILE_ID = "REGULATORY_BUILD"; + public static final String ENSEMBL_MOTIF_FEATURES_FILE_ID = "MOTIF_FEATURES"; + public static final String ENSEMBL_MOTIF_FEATURES_INDEX_FILE_ID = "MOTIF_FEATURES_INDEX"; + + public static final String HOMO_SAPIENS_NAME= "Homo sapiens"; + public static final String HSAPIENS_NAME= "hsapiens"; + + public static final String GRCH38_NAME = "GRCh38"; + public static final String GRCH37_NAME = "GRCh37"; + public static final String HG38_NAME = "hg38"; + public static final String HG19_NAME = "hg19"; + + public static final String SUFFIX_VERSION_FILENAME = "Version.json"; + + // Genome public static final String GENOME_DATA = "genome"; + + // Gene public static final String GENE_DATA = "gene"; + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; + public static final String GENE_ANNOTATION_DATA = "gene_annotation"; + public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; + + // RefSeq public static final String REFSEQ_DATA = "refseq"; - public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association"; + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; + // Must match the configuration file + public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; + public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; + public static final String REFSEQ_PROTEIN_FAA_FILE_ID = "PROTEIN_FAA"; + public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; + + // Gene annotation + // - MANE Select + public static final String MANE_SELECT_DATA = "MANE Select"; + // Must match the configuration file + public static final String MANE_SELECT_FILE_ID = "MANE_SELECT"; + // - LRG + public static final String LRG_DATA = "lrg"; + // Must match the configuration file + public static final String LRG_FILE_ID = "LRG"; + // - HGNC + public static final String HGNC_DATA = "hgnc"; + // Must match the configuration file + public static final String HGNC_FILE_ID = "HGNC"; + // - Cancer HotSpot + public static final String CANCER_HOTSPOT_DATA = "cancer_hotspot"; + // Must match the configuration file + public static final String CANCER_HOTSPOT_FILE_ID = "CANCER_HOTSPOT"; + // - DGID (drug) + public static final String DGIDB_DATA = "dgidb"; + // Must match the configuration file + public static final String DGIDB_FILE_ID = "DGIDB"; + // - UniProt Xref + 
 public static final String UNIPROT_XREF_DATA = "uniprot_xref";
+    // Must match the configuration file
+    public static final String UNIPROT_XREF_FILE_ID = "UNIPROT_XREF";
+    // - Gene Expression Atlas
+    public static final String GENE_EXPRESSION_ATLAS_DATA = "gene_expression_atlas";
+    // Must match the configuration file
+    public static final String GENE_EXPRESSION_ATLAS_FILE_ID = "GENE_EXPRESSION_ATLAS";
+    // - Gene Disease Annotation
+    public static final String GENE_DISEASE_ANNOTATION_NAME = "Gene Disease Annotation";
+    // - HPO
+    public static final String HPO_DATA = "hpo";
+    // - DISGENET
+    public static final String DISGENET_DATA = "disgenet";
+    // Must match the configuration file
+    public static final String DISGENET_FILE_ID = "DISGENET";
+    // - gnomAD Constraints
+    public static final String GNOMAD_CONSTRAINTS_DATA = "gnomad_constraints";
+    // Must match the configuration file
+    public static final String GNOMAD_CONSTRAINTS_FILE_ID = "GNOMAD_CONSTRAINTS";
+    // - GO Annotation
+    public static final String GO_ANNOTATION_DATA = "go_annotation";
+    // Must match the configuration file
+    public static final String GO_ANNOTATION_FILE_ID = "GO_ANNOTATION";
 public static final String VARIATION_DATA = "variation";
-    public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score";
-    public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score";
-    public static final String REGULATION_DATA = "regulation";
-    public static final String PROTEIN_DATA = "protein";
-    public static final String CONSERVATION_DATA = "conservation";
-    public static final String CLINICAL_VARIANTS_DATA = "clinical_variants";
 public static final String SPLICE_SCORE_DATA = "splice_score";
+    // PGS (polygenic scores)
+    public static final String PGS_NAME = "Polygenic Scores";
+    public static final String PGS_DATA = "polygenic_score";
+    public static final String PGS_COMMON_COLLECTION = "common_polygenic_scores";
+    public static final String PGS_VARIANT_COLLECTION = "variant_polygenic_scores";
+    // PGS Catalog
+    public static final String PGS_CATALOG_NAME = "PGS Catalog";
+    public static final String PGS_CATALOG_VERSION_FILENAME = "pgsCatalog" + SUFFIX_VERSION_FILENAME;
+    // Must match the configuration file
+    public static final String PGS_CATALOG_METADATA_FILE_ID = "PGS_METADATA";
+
+    // Pharmacogenomics
 public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics";
-    public static final String PHARMGKB_NAME = "PharmGKB";
+    // PharmGKB
 public static final String PHARMGKB_DATA = "pharmgkb";
-    public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";
-
-    public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
-    public static final String CLINVAR_VERSION = "2022.11";
-    public static final String CLINVAR_DATE = "2022-11";
-    public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
-    public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
-    public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
-    public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
-    public static final String IARCTP53_FILE = "IARC-TP53.zip";
-    public static final String GWAS_FILE = "gwas_catalog.tsv";
-    public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
-    public static final String DBSNP_FILE = "All.vcf.gz";
-
-    public static final String STRUCTURAL_VARIANTS_DATA = "svs";
-    public static final String REPEATS_DATA = "repeats";
-    public static final String OBO_DATA = "ontology";
-    public static final String HPO_FILE = "hp.obo";
-    public static final String GO_FILE = "go-basic.obo";
-    public static final String DOID_FILE = "doid.obo";
-    public static final String MONDO_FILE = "mondo.obo";
-    public static final String PFM_DATA = "regulatory_pfm";
+    // Must match the configuration file
+    public static final String PHARMGKB_GENES_FILE_ID = "GENES";
+    public static final String PHARMGKB_CHEMICALS_FILE_ID = "CHEMICALS";
+    public static final String PHARMGKB_VARIANTS_FILE_ID = "VARIANTS";
+    public static final String PHARMGKB_GUIDELINE_ANNOTATIONS_FILE_ID = "GUIDELINE_ANNOTATIONS";
+    public static final String PHARMGKB_VARIANT_ANNOTATIONS_FILE_ID = "VARIANT_ANNOTATIONS";
+    public static final String PHARMGKB_CLINICAL_ANNOTATIONS_FILE_ID = "CLINICAL_ANNOTATIONS";
+    public static final String PHARMGKB_CLINICAL_VARIANTS_FILE_ID = "CLINICAL_VARIANTS";
+    public static final String PHARMGKB_DRUG_LABELS_FILE_ID = "DRUG_LABELS";
+    public static final String PHARMGKB_RELATIONSHIPS_FILE_ID = "RELATIONSHIPS";
-    // Build specific data options
-    public static final String GENOME_INFO_DATA = "genome_info";
-    public static final String DISGENET_DATA = "disgenet";
-    public static final String HPO_DATA = "hpo";
-    public static final String CADD_DATA = "cadd";
-    public static final String PPI_DATA = "ppi";
-    public static final String DRUG_DATA = "drug";
+    // Missense variation functional score
+    public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score";
+
+    // Clinical variants data
+    public static final String CLINICAL_VARIANT_DATA = "clinical_variant";
+    public static final String CLINICAL_VARIANTS_BASENAME = "clinicalVariant";
+    // ClinVar
 public static final String CLINVAR_DATA = "clinvar";
-    public static final String DOCM_DATA = "docm";
+    public static final String CLINVAR_CHUNKS_SUBDIRECTORY = "clinvar_chunks";
+    // Must match the configuration file
+    public static final String CLINVAR_FULL_RELEASE_FILE_ID = "FULL_RELEASE";
+    public static final String CLINVAR_SUMMARY_FILE_ID = "SUMMARY";
+    public static final String CLINVAR_ALLELE_FILE_ID = "ALLELE";
+    public static final String CLINVAR_EFO_TERMS_FILE_ID = "EFO_TERMS";
+    // COSMIC
 public static final String COSMIC_DATA = "cosmic";
-    public static final String GWAS_DATA = "gwas";
-    public static final String IARCTP53_GERMLINE_FILE = "germlineMutationDataIARC TP53 Database, R20.txt";
-    public static final String IARCTP53_GERMLINE_REFERENCES_FILE = "germlineMutationReferenceIARC TP53 Database, R20.txt";
-    public static final String IARCTP53_SOMATIC_FILE = "somaticMutationDataIARC TP53 Database, R20.txt";
-    public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt";
+    // Must match the configuration file
+    public static final String COSMIC_FILE_ID = "COSMIC";
+    // HGMD
 public static final String HGMD_DATA = "hgmd";
+    // Must match the configuration file
+    public static final String HGMD_FILE_ID = "HGMD";
+    // GWAS
+    public static final String GWAS_DATA = "gwas";
+    // Must match the configuration file
+    public static final String GWAS_FILE_ID = "GWAS";
+    public static final String GWAS_DBSNP_FILE_ID = "DBSNP";
-    public static final String PUBMED_DATA = "pubmed";
+    // Repeats
+    public static final String REPEATS_DATA = "repeats";
+    public static final String REPEATS_BASENAME = "repeats";
+    /**
+     * @deprecated (when refactoring downloaders, builders and loaders)
+     */
+    @Deprecated
+    public static final String REPEATS_JSON = "repeats";
+    // Simple repeats
+    public static final String TRF_DATA = "trf";
+    // Must match the configuration file
+    public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS";
+    // Genomic super duplications
+    public static final String GSD_DATA = "gsd";
+    // Must match the configuration file
+    public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS";
+    // Window masker
+    public static final String WM_DATA = "wm";
+    // Must match the configuration file
+    public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER";
+
+    // Ontology
+    public static final String ONTOLOGY_DATA = "ontology";
+    public static final String OBO_BASENAME = "ontology";
+    // HPO
+    public static final String HPO_OBO_DATA = "hpo";
+    // Must match the configuration file
+    public static final String HPO_OBO_FILE_ID = "HPO";
+    // GO
+    public static final String GO_OBO_DATA = "go";
+    // Must match the configuration file
+    public static final String GO_OBO_FILE_ID = "GO";
+    // DOID
+    public static final String DOID_OBO_DATA = "doid";
+    // Must match the configuration file
+    public static final String DOID_OBO_FILE_ID = "DOID";
+    // MONDO
+    public static final String MONDO_OBO_DATA = "mondo";
+    // Must match the configuration file
+    public static final String MONDO_OBO_FILE_ID = "MONDO";
+
+
+    public static final String PFM_DATA = "regulatory_pfm";
+
+    // Variation functional score
+    public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score";
+    // CADD scores
+    public static final String CADD_DATA = "cadd";
+    public static final String CADD_RAW_DATA = "cadd_raw";
+    public static final String CADD_SCALED_DATA = "cadd_scaled";
+    // Must match the configuration file
+    public static final String CADD_FILE_ID = "CADD";
+
+    // Regulation
+    public static final String REGULATION_DATA = "regulation";
+    public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm";
+    public static final String REGULATORY_REGION_BASENAME = "regulatory_region";
+    // Regulatory build and motif features (see Ensembl files: regulatory build and motif features files)
+    public static final String REGULATORY_BUILD_DATA = "regulatory_build";
+    // Motif features (see Ensembl files)
+    public static final String MOTIF_FEATURES_DATA = "motif_features";
+    // miRBase
+    public static final String MIRBASE_DATA = "mirbase";
+    // Must match the configuration file
+    public static final String MIRBASE_FILE_ID = "MIRBASE";
+    // miRTarBase
+    public static final String MIRTARBASE_DATA = "mirtarbase";
+    // Must match the configuration file
+    public static final String MIRTARBASE_FILE_ID = "MIRTARBASE";
-    // Load specific data options
+    // Protein substitution predictions consist of sift, polyphen, revel and alphamissense
+    public static final String PROTEIN_SUBSTITUTION_PREDICTION_DATA = "protein_substitution_predictions";
+    // Sift and polyphen
 public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction";
+    public static final String SIFT_SOURCE_NAME = "Sift";
+    public static final String POLYPHEN_SOURCE_NAME = "PolyPhen";
+    public static final String SIFT_VERSION_FILENAME = "siftVersion.json";
+    public static final String POLYPHEN_VERSION_FILENAME = "polyphenVersion.json";
+    // Revel
+    public static final String REVEL_DATA = "revel";
+    // Must match the configuration file
+    public static final String REVEL_FILE_ID = "REVEL";
+    @Deprecated
+    public static final String MISSENSE_VARIATION_SCORE_JSON_FILENAME = "missense_variation_functional_score.json.gz";
+    @Deprecated
+    public static final String REVEL_RAW_FILENAME = "revel-v1.3_all_chromosomes.zip";
+    @Deprecated
+    public static final String REVEL_JSON_FILENAME = "revel-v1.3_all_chromosomes.json.gz";
+    @Deprecated
+    public static final String REVEL_VERSION_FILENAME = "revelVersion.json";
+    // AlphaMissense
+    public static final String ALPHAMISSENSE_DATA = "alphamissense";
+    // Must match the configuration file
+    public static final String ALPHAMISSENSE_FILE_ID = "ALPHAMISSENSE";
+    @Deprecated
+    public static final String ALPHAMISSENSE_RAW_FILENAME = "AlphaMissense_hg38.tsv.gz";
+    @Deprecated
+    public static final String ALPHAMISSENSE_JSON_FILENAME = "alphamissense_hg38.json.gz";
+    @Deprecated
+    public static final String ALPHAMISSENSE_VERSION_FILENAME = "alphamissenseVersion.json";
-    // Path and file names
-    public static final String GERP_SUBDIRECTORY = "gerp";
+    // Protein
+    public static final String PROTEIN_DATA = "protein";
+    // UniProt
+    public static final String UNIPROT_DATA = "uniprot";
+    public static final String UNIPROT_CHUNKS_SUBDIRECTORY = "uniprot_chunks";
+    // Must match the configuration file
+    public static final String UNIPROT_FILE_ID = "UNIPROT";
+    // InterPro
+    public static final String INTERPRO_DATA = "interpro";
+    // Must match the configuration file
+    public static final String INTERPRO_FILE_ID = "INTERPRO";
+    // IntAct
+    public static final String INTACT_DATA = "intact";
+    // Must match the configuration file
+    public static final String INTACT_FILE_ID = "INTACT";
+
+    // Conservation scores
+    public static final String CONSERVATION_DATA = "conservation";
+    // GERP
+    public static final String GERP_DATA = "gerp";
+    // Must match the configuration file
+    public static final String GERP_FILE_ID = "GERP";
+    // PHASTCONS
+    public static final String PHASTCONS_DATA = "phastCons";
+    // Must match the configuration file
+    public static final String PHASTCONS_FILE_ID = "PHASTCONS";
+    // PHYLOP
+    public static final String PHYLOP_DATA = "phylop";
+    // Must match the configuration file
+    public static final String PHYLOP_FILE_ID = "PHYLOP";
+
+    // Splice scores
 public static final String MMSPLICE_SUBDIRECTORY = "mmsplice";
-    public static final String MMSPLICE_VERSION_FILENAME = "mmspliceVersion.json";
+    public static final String MMSPLICE_VERSION_FILENAME = MMSPLICE_SUBDIRECTORY + SUFFIX_VERSION_FILENAME;
 public static final String SPLICEAI_SUBDIRECTORY = "spliceai";
-    public static final String SPLICEAI_VERSION_FILENAME = "spliceaiVersion.json";
+    public static final String SPLICEAI_VERSION_FILENAME = SPLICEAI_SUBDIRECTORY + SUFFIX_VERSION_FILENAME;
-    // binary bigwig file
+    /**
+     * @deprecated (when refactoring downloaders, builders and loaders)
+     */
+    @Deprecated
 public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw";
-    // bigwig file manually transformed to bedGraph file
-    public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz";
 public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz";
 public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz";
-    public static final String DOCM_FILE = "docm.json.gz";
 public static final String DOCM_NAME = "DOCM";
-    public static final String STRUCTURAL_VARIANTS_FOLDER = "structuralVariants";
-    public static final String DGV_FILE = "dgv.txt";
-    public static final String DGV_VERSION_FILE = "dgvVersion.json";
-    public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants";
-    public static final String TRF_FILE = "simpleRepeat.txt.gz";
-    public static final String TRF_VERSION_FILE = "simpleRepeat.json";
-    public static final String GSD_FILE = "genomicSuperDups.txt.gz";
-    public static final String GSD_VERSION_FILE = "genomicSuperDups.json";
-    public static final String WM_FILE = "windowMasker.txt.gz";
-    public static final String WM_VERSION_FILE = "windowMasker.json";
-    public static final String REPEATS_FOLDER = "genome";
-    public static final String REPEATS_JSON = "repeats";
-    public static final String OBO_JSON = "ontology";
-    public static final String HPO_VERSION_FILE = "hpoVersion.json";
-    public static final String GO_VERSION_FILE = "goVersion.json";
-    public static final String DO_VERSION_FILE = "doVersion.json";
+    public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME;
+    public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME;
+    public static final String DO_VERSION_FILE = "do" + SUFFIX_VERSION_FILENAME;
+    public static final String MONDO_VERSION_FILE = "mondo" + SUFFIX_VERSION_FILENAME;
+
+    public static final String HGMD_FILE = "hgmd.vcf";
-    public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json";
-    public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz";
-    public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz";
+    // PubMed
+    public static final String PUBMED_DATA = "pubmed";
+    // Must match the configuration file
+    public static final String PUBMED_REGEX_FILE_ID = "PUBMED_REGEX";
+
+    // Utilities maps
+    private static Map<String, String> dataNamesMap = new HashMap<>();
+    private static Map<String, String> dataCategoriesMap = new HashMap<>();
+    private static Map<String, String> dataVersionFilenamesMap = new HashMap<>();
+
+    static {
+
+        // Populate data names map
+        dataNamesMap.put(ENSEMBL_DATA, "Ensembl");
+        dataNamesMap.put(REFSEQ_DATA, "RefSeq");
+        dataNamesMap.put(GENOME_DATA, "Genome");
+        dataNamesMap.put(GENE_DATA, "Gene");
+        dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation");
+        dataNamesMap.put(MANE_SELECT_DATA, "MANE Select");
+        dataNamesMap.put(LRG_DATA, "LRG");
+        dataNamesMap.put(HGNC_DATA, "HGNC Gene");
+        dataNamesMap.put(CANCER_HOTSPOT_DATA, "Cancer HotSpot");
+        dataNamesMap.put(DGIDB_DATA, "DGIdb");
+        dataNamesMap.put(UNIPROT_XREF_DATA, "UniProt Xref");
+        dataNamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "Gene Expression Atlas");
+        dataNamesMap.put(GENE_DISEASE_ANNOTATION_DATA, "Gene Disease Annotation");
+        dataNamesMap.put(HPO_DATA, "HPO");
+        dataNamesMap.put(DISGENET_DATA, "DisGeNet");
+        dataNamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomAD Constraint");
+        dataNamesMap.put(GO_ANNOTATION_DATA, "EBI Gene Ontology Annotation");
+        dataNamesMap.put(PROTEIN_DATA, "Protein");
+        dataNamesMap.put(UNIPROT_DATA, "UniProt");
+        dataNamesMap.put(INTERPRO_DATA, "InterPro");
+        dataNamesMap.put(INTACT_DATA, "IntAct");
+        dataNamesMap.put(CONSERVATION_DATA, "Conservation");
+        dataNamesMap.put(GERP_DATA, "GERP++");
+        dataNamesMap.put(PHASTCONS_DATA, "PhastCons");
+        dataNamesMap.put(PHYLOP_DATA, "PhyloP");
+        dataNamesMap.put(REPEATS_DATA, "Repeats");
+        dataNamesMap.put(TRF_DATA, "Tandem Repeats Finder");
+        dataNamesMap.put(WM_DATA, "Window Masker");
+        dataNamesMap.put(GSD_DATA, "Genomic Super Duplications");
+        dataNamesMap.put(REGULATION_DATA, "Regulation");
+        dataNamesMap.put(REGULATORY_BUILD_DATA, "Regulatory Build");
+        dataNamesMap.put(MOTIF_FEATURES_DATA, "Motif Features");
+        dataNamesMap.put(MIRBASE_DATA, "miRBase");
+        dataNamesMap.put(MIRTARBASE_DATA, "miRTarBase");
+        dataNamesMap.put(ONTOLOGY_DATA, "Ontology");
+        dataNamesMap.put(HPO_OBO_DATA, "HPO");
+        dataNamesMap.put(GO_OBO_DATA, "GO");
+        dataNamesMap.put(DOID_OBO_DATA, "DOID");
+        dataNamesMap.put(MONDO_OBO_DATA, "Mondo");
+        dataNamesMap.put(PUBMED_DATA, "PubMed");
+        dataNamesMap.put(PHARMACOGENOMICS_DATA, "Pharmacogenomics");
+        dataNamesMap.put(PHARMGKB_DATA, "PharmGKB");
+        dataNamesMap.put(VARIATION_FUNCTIONAL_SCORE_DATA, "Variant Functional Score");
+        dataNamesMap.put(CADD_DATA, "CADD");
+        dataNamesMap.put(MISSENSE_VARIATION_SCORE_DATA, "Missense Variation Score");
+        dataNamesMap.put(REVEL_DATA, "Revel");
+        dataNamesMap.put(CLINICAL_VARIANT_DATA, "Clinical Variant");
+        dataNamesMap.put(CLINVAR_DATA, "ClinVar");
+        dataNamesMap.put(COSMIC_DATA, "Cosmic");
+        dataNamesMap.put(HGMD_DATA, "HGMD");
+        dataNamesMap.put(GWAS_DATA, "GWAS Catalog");
+
+        // Populate data categories map
+        dataCategoriesMap.put(ENSEMBL_DATA, "Gene");
+        dataCategoriesMap.put(REFSEQ_DATA, "Gene");
+        dataCategoriesMap.put(GENOME_DATA, dataNamesMap.get(ENSEMBL_DATA));
+        dataCategoriesMap.put(MANE_SELECT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(LRG_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(HGNC_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(CANCER_HOTSPOT_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(DGIDB_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(UNIPROT_XREF_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(GENE_EXPRESSION_ATLAS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(HPO_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(DISGENET_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(GNOMAD_CONSTRAINTS_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(GO_ANNOTATION_DATA, dataNamesMap.get(GENE_ANNOTATION_DATA));
+        dataCategoriesMap.put(UNIPROT_DATA, dataNamesMap.get(PROTEIN_DATA));
+        dataCategoriesMap.put(INTERPRO_DATA, dataNamesMap.get(PROTEIN_DATA));
+        dataCategoriesMap.put(INTACT_DATA, dataNamesMap.get(PROTEIN_DATA));
+        dataCategoriesMap.put(GERP_DATA, dataNamesMap.get(CONSERVATION_DATA));
+        dataCategoriesMap.put(PHASTCONS_DATA, dataNamesMap.get(CONSERVATION_DATA));
+        dataCategoriesMap.put(PHYLOP_DATA, dataNamesMap.get(CONSERVATION_DATA));
+        dataCategoriesMap.put(TRF_DATA, dataNamesMap.get(REPEATS_DATA));
+        dataCategoriesMap.put(WM_DATA, dataNamesMap.get(REPEATS_DATA));
+        dataCategoriesMap.put(GSD_DATA, dataNamesMap.get(REPEATS_DATA));
+        dataCategoriesMap.put(REGULATORY_BUILD_DATA, dataNamesMap.get(REGULATION_DATA));
+        dataCategoriesMap.put(MOTIF_FEATURES_DATA, dataNamesMap.get(REGULATION_DATA));
+        dataCategoriesMap.put(MIRBASE_DATA, dataNamesMap.get(REGULATION_DATA));
+        dataCategoriesMap.put(MIRTARBASE_DATA, dataNamesMap.get(REGULATION_DATA));
+        dataCategoriesMap.put(HPO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA));
+        dataCategoriesMap.put(GO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA));
+        dataCategoriesMap.put(DOID_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA));
+        dataCategoriesMap.put(MONDO_OBO_DATA, dataNamesMap.get(ONTOLOGY_DATA));
+        dataCategoriesMap.put(PUBMED_DATA, "Publication");
+        dataCategoriesMap.put(PHARMGKB_DATA, dataNamesMap.get(PHARMACOGENOMICS_DATA));
+        dataCategoriesMap.put(CADD_DATA, dataNamesMap.get(VARIATION_FUNCTIONAL_SCORE_DATA));
+        dataCategoriesMap.put(REVEL_DATA, dataNamesMap.get(MISSENSE_VARIATION_SCORE_DATA));
+        dataCategoriesMap.put(CLINVAR_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA));
+        dataCategoriesMap.put(COSMIC_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA));
+        dataCategoriesMap.put(HGMD_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA));
+        dataCategoriesMap.put(GWAS_DATA, dataNamesMap.get(CLINICAL_VARIANT_DATA));
+
+        // Populate data version filenames map
+        dataVersionFilenamesMap.put(ENSEMBL_DATA, "ensemblCore" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(REFSEQ_DATA, "refSeqCore" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GENOME_DATA, "genome" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(MANE_SELECT_DATA, "maneSelect" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(LRG_DATA, "lrg" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(HGNC_DATA, "hgnc" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(CANCER_HOTSPOT_DATA, "cancerHotSpot" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(DGIDB_DATA, "dgidb" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(UNIPROT_XREF_DATA, "uniProtXref" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GENE_EXPRESSION_ATLAS_DATA, "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(HPO_DATA, "hpo" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(DISGENET_DATA, "disGeNet" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GNOMAD_CONSTRAINTS_DATA, "gnomadConstraints" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GO_ANNOTATION_DATA, "goAnnotation" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(UNIPROT_DATA, "uniProt" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(INTERPRO_DATA, "interPro" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(INTACT_DATA, "intAct" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GERP_DATA, "gerp" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(PHASTCONS_DATA, "phastCons" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(PHYLOP_DATA, "phyloP" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(TRF_DATA, "simpleRepeat" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(WM_DATA, "windowMasker" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GSD_DATA, "genomicSuperDups" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(REGULATORY_BUILD_DATA, "regulatoryBuild" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(MOTIF_FEATURES_DATA, "motifFeatures" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(MIRBASE_DATA, "mirBase" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(MIRTARBASE_DATA, "mirTarBase" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(HPO_OBO_DATA, "hpoObo" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GO_OBO_DATA, "goObo" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(DOID_OBO_DATA, "doidObo" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(MONDO_OBO_DATA, "mondoObo" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(PUBMED_DATA, "pubMed" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(PHARMGKB_DATA, "pharmGKB" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(CADD_DATA, "cadd" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(REVEL_DATA, "revel" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(ALPHAMISSENSE_DATA, "alphaMissense" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(CLINVAR_DATA, "clinVar" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(COSMIC_DATA, "cosmic" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(HGMD_DATA, "hgmd" + SUFFIX_VERSION_FILENAME);
+        dataVersionFilenamesMap.put(GWAS_DATA, "gwas" + SUFFIX_VERSION_FILENAME);
+    }
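The three utility maps populated above back the getDataName, getDataCategory and getDataVersionFilename helpers defined further down in this class. A minimal usage sketch (assuming, as the mmsplice/spliceai renames above suggest, that SUFFIX_VERSION_FILENAME is "Version.json"):

    String name = EtlCommons.getDataName(EtlCommons.CLINVAR_DATA);                   // "ClinVar"
    String category = EtlCommons.getDataCategory(EtlCommons.CLINVAR_DATA);           // "Clinical Variant"
    String versionFile = EtlCommons.getDataVersionFilename(EtlCommons.CLINVAR_DATA); // "clinVarVersion.json"

Each helper throws a CellBaseException for an unknown key rather than returning null.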
+
+    private EtlCommons() {
+        throw new IllegalStateException("Utility class");
+    }

 public static boolean runCommandLineProcess(File workingDirectory, String binPath, List<String> args, String logFilePath)
-            throws IOException, InterruptedException {
-        // This small hack allow to configure the appropriate Logger level from the command line, this is done
-        // by setting the DEFAULT_LOG_LEVEL_KEY before the logger object is created.
-//        org.apache.log4j.Logger rootLogger = LogManager.getRootLogger();
-//        ConsoleAppender stderr = (ConsoleAppender) rootLogger.getAppender("stdout");
-//        stderr.setThreshold(Level.toLevel("debug"));
+            throws IOException, InterruptedException, CellBaseException {
 Configurator.setRootLevel(Level.INFO);
@@ -147,18 +515,21 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat
 ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath);
-        logger.debug("Executing command: " + StringUtils.join(builder.command(), " "));
+        if (logger.isDebugEnabled()) {
+            logger.debug("Executing command: {}", StringUtils.join(builder.command(), " "));
+        }
 Process process = builder.start();
 process.waitFor();

 // Check process output
-        boolean executedWithoutErrors = true;
-        int genomeInfoExitValue = process.exitValue();
-        if (genomeInfoExitValue != 0) {
-            logger.warn("Error executing {}, error code: {}. More info in log file: {}", binPath, genomeInfoExitValue, logFilePath);
-            executedWithoutErrors = false;
+        if (process.exitValue() != 0) {
+            String msg = "Error executing command '" + binPath + "'; args = " + args + ", error code = " + process.exitValue()
+                    + ". More info in log file: " + logFilePath;
+            logger.error(msg);
+            throw new CellBaseException(msg);
 }
-        return executedWithoutErrors;
+
+        return true;
 }

 private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List<String> args, String logFilePath) {
@@ -203,7 +574,132 @@ public static Long countFileLines(Path filePath) throws IOException {
         }
         return nLines;
     }
+
+    public static String getEnsemblUrl(DownloadProperties.EnsemblProperties props, String ensemblRelease, String fileId, String species,
+                                       String assembly, String chromosome) throws CellBaseException {
+        if (!props.getUrl().getFiles().containsKey(fileId)) {
+            throw new CellBaseException(getMissingFileIdMessage(fileId));
+        }
+        String url = props.getUrl().getHost() + props.getUrl().getFiles().get(fileId);
+
+        // Change release, species, assembly, chromosome if necessary
+        if (StringUtils.isNotEmpty(ensemblRelease)) {
+            url = url.replace(PUT_RELEASE_HERE_MARK, ensemblRelease.split("-")[1]);
+        }
+        if (StringUtils.isNotEmpty(species)) {
+            url = url.replace(PUT_SPECIES_HERE_MARK, species);
+            url = url.replace(PUT_CAPITAL_SPECIES_HERE_MARK, Character.toUpperCase(species.charAt(0)) + species.substring(1));
+        }
+        if (StringUtils.isNotEmpty(assembly)) {
+            url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly);
+        }
+        if (StringUtils.isNotEmpty(chromosome)) {
+            url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome);
+        }
+        return url;
+    }
+
+    public static String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException {
+        return getUrl(props, fileId, null, null, null);
+    }
+
+    public static String getUrl(DownloadProperties.URLProperties props, String fileId, String species, String assembly, String chromosome)
+            throws CellBaseException {
+        if (!props.getFiles().containsKey(fileId)) {
+            throw new CellBaseException(getMissingFileIdMessage(fileId));
+        }
+        String url;
+        String filesValue = props.getFiles().get(fileId);
+        if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) {
+            url = filesValue;
+        } else {
+            url = props.getHost() + filesValue;
+        }
+        if (StringUtils.isNotEmpty(species)) {
+            url = url.replace(PUT_SPECIES_HERE_MARK, species);
+        }
+        if (StringUtils.isNotEmpty(assembly)) {
+            url = url.replace(PUT_ASSEMBLY_HERE_MARK, assembly);
+        }
+        if (StringUtils.isNotEmpty(chromosome)) {
+            url = url.replace(PUT_CHROMOSOME_HERE_MARK, chromosome);
+        }
+        return url;
+    }
+
+    public static String getFilename(String prefix, String chromosome) {
+        return prefix + "_" + chromosome;
+    }
+
+    public static boolean isExecutableAvailable(String executable) throws IOException, InterruptedException {
+        ProcessBuilder processBuilder = new ProcessBuilder("which", executable);
+        Process process = processBuilder.start();
+
+        StringBuilder output = new StringBuilder();
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                output.append(line).append("\n");
+            }
+        }
+
+        int exitCode = process.waitFor();
+
+        // If the exit code is 0 the executable is installed (its path is output.toString().trim()); otherwise it is not
+        return (exitCode == 0);
+    }
+
+    public static String getFilenameFromProps(DownloadProperties.URLProperties props, String fileId) throws CellBaseException {
+        if (!props.getFiles().containsKey(fileId)) {
+            throw new CellBaseException(getMissingFileIdMessage(fileId));
+        }
+        return getFilenameFromUrl(props.getFiles().get(fileId));
+    }
+
+    public static String getFilenameFromUrl(String url) {
+        return Paths.get(url).getFileName().toString();
+    }
+
+    public static void checkDirectory(Path path, String name) throws CellBaseException {
+        if (path == null) {
+            throw new CellBaseException(name + " directory is null");
+        }
+        if (!Files.exists(path)) {
+            throw new CellBaseException(name + " directory " + path + " does not exist");
+        }
+        if (!Files.isDirectory(path)) {
+            throw new CellBaseException(name + " directory " + path + " is not a directory");
+        }
+    }
+
+    private static String getMissingFileIdMessage(String fileId) {
+        return "File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase configuration file";
+    }
+
+    public static String getDataName(String data) throws CellBaseException {
+        if (!dataNamesMap.containsKey(data)) {
+            throw new CellBaseException("Name not found for data '" + data + "'");
+        }
+        return dataNamesMap.get(data);
+    }
+
+    public static String getDataCategory(String data) throws CellBaseException {
+        if (!dataCategoriesMap.containsKey(data)) {
+            throw new CellBaseException("Category not found for data '" + data + "'");
+        }
+        return dataCategoriesMap.get(data);
+    }
+
+    public static String getDataVersionFilename(String data) throws CellBaseException {
+        if (!dataVersionFilenamesMap.containsKey(data)) {
+            throw new CellBaseException("Version filename not found for data '" + data + "'");
+        }
+        return dataVersionFilenamesMap.get(data);
+    }
+
+    public static List<String> getUrls(List<DownloadFile> downloadFiles) {
+        return downloadFiles.stream().map(DownloadFile::getUrl).collect(Collectors.toList());
+    }
 }
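getUrl and getEnsemblUrl build download URLs by plain string replacement of the PUT_*_HERE_MARK placeholders in the configured templates. A sketch of the intended expansion, with hypothetical template and marker values (the real marker constants are defined elsewhere in EtlCommons):

    // Hypothetical values, for illustration only
    String template = "https://ftp.example.org/put_species_here/put_assembly_here/data.txt.gz";
    String url = template
            .replace("put_species_here", "homo_sapiens")   // PUT_SPECIES_HERE_MARK
            .replace("put_assembly_here", "grch38");       // PUT_ASSEMBLY_HERE_MARK
    // url == "https://ftp.example.org/homo_sapiens/grch38/data.txt.gz"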
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java
new file mode 100644
index 0000000000..475a91d315
--- /dev/null
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opencb.cellbase.lib.builders;
+
+import com.fasterxml.jackson.databind.MapperFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectReader;
+import com.fasterxml.jackson.databind.ObjectWriter;
+import org.apache.commons.lang3.StringUtils;
+import org.opencb.biodata.models.core.ProteinSubstitutionPrediction;
+import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore;
+import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
+import org.opencb.cellbase.lib.builders.utils.RocksDBUtils;
+import org.opencb.commons.utils.FileUtils;
+import org.rocksdb.Options;
+import org.rocksdb.RocksDB;
+import org.rocksdb.RocksIterator;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class AlphaMissenseBuilder extends CellBaseBuilder {
+
+    private File alphaMissenseFile;
+    private CellBaseFileSerializer fileSerializer;
+
+    private RocksDB rdb;
+
+    private static final String AA_CHANGE_PATTERN = "^([A-Z])(\\d+)([A-Z])$";
+    private final Pattern aaChangePattern = Pattern.compile(AA_CHANGE_PATTERN);
+
+    private static ObjectMapper mapper;
+    private static ObjectReader predictionReader;
+    private static ObjectWriter jsonObjectWriter;
+
+    private static final String SOURCE = "alphamissense";
+
+    static {
+        mapper = new ObjectMapper();
+        mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
+        predictionReader = mapper.readerFor(ProteinSubstitutionPrediction.class);
+        jsonObjectWriter = mapper.writer();
+    }
+
+    public AlphaMissenseBuilder(File alphaMissenseFile, CellBaseFileSerializer serializer) {
+        super(serializer);
+
+        this.fileSerializer = serializer;
+        this.alphaMissenseFile = alphaMissenseFile;
+
+        logger = LoggerFactory.getLogger(AlphaMissenseBuilder.class);
+    }
+
+    @Override
+    public void parse() throws Exception {
+        logger.info("Parsing AlphaMissense file: {} ...", alphaMissenseFile.getName());
+
+        // Sanity check
+        FileUtils.checkFile(alphaMissenseFile.toPath());
+
+        Object[] dbConnection = RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("alphamissense-rdb.idx").toString(), true);
+        rdb = (RocksDB) dbConnection[0];
+        Options dbOption = (Options) dbConnection[1];
+        String dbLocation = (String) dbConnection[2];
+
+        // AlphaMissense file reader
+        BufferedReader br = FileUtils.newBufferedReader(alphaMissenseFile.toPath());
+        String line;
+        int counter = 0;
+        while ((line = br.readLine()) != null) {
+            if (!line.startsWith("#")) {
+                //  0      1    2    3    4       5           6              7                8                 9
+                //  CHROM  POS  REF  ALT  genome  uniprot_id  transcript_id  protein_variant  am_pathogenicity  am_class
+                String[] split = line.split("\t", -1);
+
+                String chrom = null;
+                int position;
+                String reference;
+                String alternate = null;
+                String transcriptId;
+                String uniprotId;
+                int aaPosition;
+                String aaReference;
+                String aaAlternate;
+
+                if (StringUtils.isNotEmpty(split[0])) {
+                    chrom = split[0].replace("chr", "");
+                }
+                if (StringUtils.isNotEmpty(split[1])) {
+                    position = Integer.parseInt(split[1]);
+                } else {
+                    logger.warn("Missing field 'position', skipping line: {}", line);
+                    continue;
+                }
+                if (StringUtils.isNotEmpty(split[2])) {
+                    reference = split[2];
+                } else {
+                    logger.warn("Missing field 'reference', skipping line: {}", line);
+                    continue;
+                }
+                if (StringUtils.isNotEmpty(split[3])) {
+                    alternate = split[3];
+                }
+                if (StringUtils.isNotEmpty(split[6])) {
+                    transcriptId = split[6].split("\\.")[0];
+                } else {
+                    logger.warn("Missing field 'transcript_id', skipping line: {}", line);
+                    continue;
+                }
+                if (StringUtils.isNotEmpty(split[5])) {
+                    uniprotId = split[5];
+                } else {
+                    logger.warn("Missing field 'uniprot_id', skipping line: {}", line);
+                    continue;
+                }
+                if (StringUtils.isNotEmpty(split[7])) {
+                    Matcher matcher = aaChangePattern.matcher(split[7]);
+                    if (matcher.matches()) {
+                        aaReference = matcher.group(1);
+                        aaPosition = Integer.parseInt(matcher.group(2));
+                        aaAlternate = matcher.group(3);
+                    } else {
+                        logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line);
+                        continue;
+                    }
+                } else {
+                    logger.warn("Missing field 'protein_variant', skipping line: {}", line);
+                    continue;
+                }
+
+                // Create protein substitution score
+                ProteinSubstitutionPredictionScore score = new ProteinSubstitutionPredictionScore();
+                score.setAlternate(alternate);
+                score.setAaAlternate(aaAlternate);
+                if (StringUtils.isNotEmpty(split[8])) {
+                    score.setScore(Double.parseDouble(split[8]));
+                }
+                if (StringUtils.isNotEmpty(split[9])) {
+                    score.setEffect(split[9]);
+                }
+
+                // Creating and/or updating protein substitution prediction
+                ProteinSubstitutionPrediction prediction;
+                String key = transcriptId + "_" + uniprotId + "_" + position + "_" + reference + "_" + aaPosition + "_" + aaReference;
+                byte[] dbContent = rdb.get(key.getBytes());
+                if (dbContent == null) {
+                    prediction = new ProteinSubstitutionPrediction(chrom, position, reference, transcriptId, uniprotId, aaPosition,
+                            aaReference, SOURCE, null, Collections.singletonList(score));
+                } else {
+                    prediction = predictionReader.readValue(dbContent);
+                    prediction.getScores().add(score);
+                }
+                rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(prediction));
+
+                // Log messages
+                counter++;
+                if (counter % 10000 == 0) {
+                    logger.info("{} AlphaMissense predictions parsed", counter);
+                }
+            }
+        }
+
+        // Serialize/write the protein substitution predictions saved in the RocksDB index
+        serializeRDB(rdb);
+        RocksDBUtils.closeIndex(rdb, dbOption, dbLocation);
+        serializer.close();
+
+        logger.info("Parsed AlphaMissense file: {}. Done!", alphaMissenseFile.getName());
+    }
+
+    private void serializeRDB(RocksDB rdb) throws IOException {
+        // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
+        // named "iterator"
+        RocksIterator rocksIterator = rdb.newIterator();
+
+        logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName()));
+        int counter = 0;
+        for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) {
+//            logger.info("variant = {}", new String(rocksIterator.key()));
+            ProteinSubstitutionPrediction prediction = predictionReader.readValue(rocksIterator.value());
+            serializer.serialize(prediction);
+            counter++;
+            if (counter % 10000 == 0) {
+                logger.info("{} written", counter);
+            }
+        }
+        serializer.close();
+        logger.info("Done.");
+    }
+}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java
index b593f44901..d0597c4c2a 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java
@@ -17,32 +17,33 @@
 package org.opencb.cellbase.lib.builders;

 import org.opencb.biodata.models.core.GenomicScoreRegion;
+import org.opencb.cellbase.core.exception.CellBaseException;
 import org.opencb.cellbase.core.serializer.CellBaseSerializer;
 import org.opencb.commons.utils.FileUtils;
-import org.slf4j.LoggerFactory;

 import java.io.BufferedReader;
+import java.io.File;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

+import static org.opencb.cellbase.lib.EtlCommons.*;
+
 /**
  * Created by imedina on 06/11/15.
  */
 public class CaddScoreBuilder extends CellBaseBuilder {

-    private Path caddFilePath;
+    private Path caddDownloadPath;

     private static final int CHUNK_SIZE = 1000;
     private static final int DECIMAL_RESOLUTION = 100;

-    public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) {
+    public CaddScoreBuilder(Path caddDownloadPath, CellBaseSerializer serializer) {
         super(serializer);
-        this.caddFilePath = caddFilePath;
-
-        logger = LoggerFactory.getLogger(ConservationBuilder.class);
+        this.caddDownloadPath = caddDownloadPath;
     }

     /* Example:
@@ -57,14 +58,25 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) {
      */
     @Override
     public void parse() throws Exception {
-        FileUtils.checkPath(caddFilePath);
+        String dataName = getDataName(CADD_DATA);
+        String dataCategory = getDataCategory(CADD_DATA);
+
+        logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName);
+
+        // Sanity check
+        checkDirectory(caddDownloadPath, dataName);
+
+        // Check CADD files
+        List<File> caddFiles = checkFiles(dataSourceReader.readValue(caddDownloadPath.resolve(getDataVersionFilename(CADD_DATA)).toFile()),
+                caddDownloadPath, dataName);
+        if (caddFiles.size() != 1) {
+            throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + caddFiles.size() + " files");
+        }

-        BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFilePath);
         List<Long> rawValues = new ArrayList<>(CHUNK_SIZE);
         List<Long> scaledValues = new ArrayList<>(CHUNK_SIZE);
         int start = 1;
-//        int end = 1999;
         int end = CHUNK_SIZE - 1;
         String line;
         String[] fields = new String[0];
@@ -72,8 +84,8 @@ public void parse() throws Exception {
         int lineCount = 0;
         int counter = 1;
         int serializedChunks = 0;
-        int previousPosition = 0;
-        int newPosition = 0;
+        int prevPos = 0;
+        int newPos = 0;
         String chromosome = null;
         String[] nucleotides = new String[]{"A", "C", "G", "T"};
@@ -81,127 +93,102 @@
         long scaledLongValue = 0;
         Map<String, Float> rawScoreValuesMap = new HashMap<>();
         Map<String, Float> scaledScoreValuesMap = new HashMap<>();
-        while ((line = bufferedReader.readLine()) != null) {
-            if (!line.startsWith("#")) {
-                fields = line.split("\t");
-                newPosition = Integer.parseInt(fields[1]);
-//                if (fields[0].equals("1") && fields[1].equals("249240621")) {
-//                if (fields[0].equals("1") && fields[1].equals("69100")) {
-//                if (fields[0].equals("1") && fields[1].equals("144854598")) {
-//                    logger.debug("line {} reached", line);
-//                    logger.debug("Associated chunk count {}", serializedChunks);
-//                    logger.debug("start {}", start);
-//                    logger.debug("end {}", end);
-//                    logger.debug("chunk size {}", CHUNK_SIZE);
-//                }
-                // this only happens the first time, when we start reading the file
-                if (chromosome == null) {
-                    logger.info("Parsing chr {} ", fields[0]);
-                    chromosome = fields[0];
-
-                    start = newPosition;
-                    previousPosition = newPosition;
-                    end = start + CHUNK_SIZE - 2;
-                }
-                if (!chromosome.equals(fields[0])) {
-                    logger.info("Parsing chr {} ", fields[0]);
-                    // both raw and scaled are serialized
-                    GenomicScoreRegion<Long> genomicScoreRegion =
-                            new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_raw", rawValues);
-                    serializer.serialize(genomicScoreRegion);
-
-                    genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, previousPosition, "cadd_scaled", scaledValues);
-                    serializer.serialize(genomicScoreRegion);
-
-                    serializedChunks++;
-                    chromosome = fields[0];
-                    start = newPosition;
-//                    end = CHUNK_SIZE - 1;
-                    end = start + CHUNK_SIZE - 2;
-
-                    counter = 0;
-                    rawValues.clear();
-                    scaledValues.clear();
-//                    rawLongValue = 0;
-//                    lineCount = 0;
-//                    rawScoreValuesMap.clear();
-//                    scaledScoreValuesMap.clear();
-                // The series of cadd scores is not continuous through the whole chromosome
-                } else if (end < newPosition || (newPosition - previousPosition) > 1) {
-                    // both raw and scaled are serialized
-                    GenomicScoreRegion<Long> genomicScoreRegion
-                            = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_raw", rawValues);
-                    serializer.serialize(genomicScoreRegion);
-
-                    genomicScoreRegion
-                            = new GenomicScoreRegion<>(fields[0], start, previousPosition, "cadd_scaled", scaledValues);
-                    serializer.serialize(genomicScoreRegion);
-
-                    serializedChunks++;
-                    start = newPosition;
-//                    start = end + 1;
-//                    end += CHUNK_SIZE;
-                    end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1;
-
-                    counter = 0;
-                    rawValues.clear();
-                    scaledValues.clear();
-                }
+        logger.info(PARSING_LOG_MESSAGE, caddFiles.get(0));
+        try (BufferedReader bufferedReader = FileUtils.newBufferedReader(caddFiles.get(0).toPath())) {
+            while ((line = bufferedReader.readLine()) != null) {
+                if (!line.startsWith("#")) {
+                    fields = line.split("\t");
+                    newPos = Integer.parseInt(fields[1]);
+                    String message = "chrom. " + fields[0];
+                    // This only happens the first time, when we start reading the file
+                    if (chromosome == null) {
+                        logger.info(PARSING_LOG_MESSAGE, message);
+                        chromosome = fields[0];
+
+                        start = newPos;
+                        prevPos = newPos;
+                        end = start + CHUNK_SIZE - 2;
+                    }

-                rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4]));
-                scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5]));
-
-                if (++lineCount == 3) {
-//                    if (fields[0].equals("1") && fields[1].equals("249240621")) {
-//                    if (fields[0].equals("1") && fields[1].equals("69100")) {
-//                    if (fields[0].equals("1") && fields[1].equals("144854598")) {
-//                        logger.info("offset: {}", rawValues.size());
-//                    }
-
-                    for (String nucleotide : nucleotides) {
-                        // raw CADD score values can be negative, we add 10 to make positive
-                        float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f;
-                        v = (short) (a * DECIMAL_RESOLUTION);
-                        rawLongValue = (rawLongValue << 16) | v;
-
-                        // scaled CADD scores are always positive
-                        a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f);
-                        v = (short) (a * DECIMAL_RESOLUTION);
-                        scaledLongValue = (scaledLongValue << 16) | v;
+                    if (!chromosome.equals(fields[0])) {
+                        logger.info(PARSING_LOG_MESSAGE, message);
+
+                        // Both raw and scaled are serialized
+                        GenomicScoreRegion<Long> genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_RAW_DATA,
+                                rawValues);
+                        serializer.serialize(genomicScoreRegion);
+
+                        genomicScoreRegion = new GenomicScoreRegion<>(chromosome, start, prevPos, CADD_SCALED_DATA, scaledValues);
+                        serializer.serialize(genomicScoreRegion);
+
+                        serializedChunks++;
+                        chromosome = fields[0];
+                        start = newPos;
+                        end = start + CHUNK_SIZE - 2;
+
+                        counter = 0;
+                        rawValues.clear();
+                        scaledValues.clear();
+                    // The series of cadd scores is not continuous through the whole chromosome
+                    } else if (end < newPos || (newPos - prevPos) > 1) {
+                        // Both raw and scaled are serialized
+                        GenomicScoreRegion<Long> genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_RAW_DATA,
+                                rawValues);
+                        serializer.serialize(genomicScoreRegion);
+
+                        genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, prevPos, CADD_SCALED_DATA, scaledValues);
+                        serializer.serialize(genomicScoreRegion);
+
+                        serializedChunks++;
+                        start = newPos;
+                        end = (start / CHUNK_SIZE) * CHUNK_SIZE + CHUNK_SIZE - 1;
+
+                        counter = 0;
+                        rawValues.clear();
+                        scaledValues.clear();
                     }
-//                if (rawLongValue < 0 || scaledLongValue < 0) {
-//                    logger.error("raw/scaled Long Values cannot be 0");
-//                    logger.error("Last read line {}", line);
-//                    System.exit(1);
-//                }
-                rawValues.add(rawLongValue);
-                scaledValues.add(scaledLongValue);
-
-                counter++;
-                rawLongValue = 0;
-                lineCount = 0;
-                rawScoreValuesMap.clear();
-                scaledScoreValuesMap.clear();
+
+                    rawScoreValuesMap.put(fields[3], Float.valueOf(fields[4]));
+                    scaledScoreValuesMap.put(fields[3], Float.valueOf(fields[5]));
+
+                    if (++lineCount == 3) {
+                        for (String nucleotide : nucleotides) {
+                            // Raw CADD score values can be negative, we add 10 to make positive
+                            float a = rawScoreValuesMap.getOrDefault(nucleotide, 10f) + 10.0f;
+                            v = (short) (a * DECIMAL_RESOLUTION);
+                            rawLongValue = (rawLongValue << 16) | v;
+
+                            // Scaled CADD scores are always positive
+                            a = scaledScoreValuesMap.getOrDefault(nucleotide, 0f);
+                            v = (short) (a * DECIMAL_RESOLUTION);
+                            scaledLongValue = (scaledLongValue << 16) | v;
+                        }
+
+                        rawValues.add(rawLongValue);
+                        scaledValues.add(scaledLongValue);
+
+                        counter++;
+                        rawLongValue = 0;
+                        lineCount = 0;
+                        rawScoreValuesMap.clear();
+                        scaledScoreValuesMap.clear();
+                    }
+                    prevPos = newPos;
                 }
-                previousPosition = newPosition;
             }
-        }

-        // Last chunks can be incomplete for both raw and scaled are serialized
-//        GenomicScoreRegion genomicScoreRegion =
-//                new GenomicScoreRegion<>(fields[0], start, start + rawValues.size() - 1, "cadd_raw", rawValues);
-        GenomicScoreRegion<Long> genomicScoreRegion =
-                new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_raw", rawValues);
-        serializer.serialize(genomicScoreRegion);
+            // The last chunks can be incomplete, so both raw and scaled are serialized here
+            GenomicScoreRegion<Long> genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_RAW_DATA, rawValues);
+            serializer.serialize(genomicScoreRegion);
+
+            genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPos, CADD_SCALED_DATA, scaledValues);
+            serializer.serialize(genomicScoreRegion);

-//        genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, start + scaledValues.size() - 1, "cadd_scaled", scaledValues);
-        genomicScoreRegion = new GenomicScoreRegion<>(fields[0], start, newPosition, "cadd_scaled", scaledValues);
-        serializer.serialize(genomicScoreRegion);
+            serializer.close();
+        }
+        logger.info(PARSING_DONE_LOG_MESSAGE, caddFiles.get(0));

-        serializer.close();
-        bufferedReader.close();
-        logger.info("Parsing finished.");
+        logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName);
     }
 }
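CaddScoreBuilder packs the scores of the four alternate alleles at one position into a single long, 16 bits per nucleotide in A/C/G/T order, after multiplying by DECIMAL_RESOLUTION (100); raw scores are shifted by +10 first so the stored 16-bit value stays positive. A sketch of the packing, together with a plausible decoding (the decoder is not part of this patch):

    // Packing, mirroring the builder
    float[] rawScores = {0.12f, -1.50f, 0.75f, 2.30f};  // raw CADD scores for A, C, G, T
    long packed = 0;
    for (float s : rawScores) {
        short v = (short) ((s + 10.0f) * 100);          // +10 shift; 100 = DECIMAL_RESOLUTION
        packed = (packed << 16) | v;
    }
    // Plausible decoding: slot 0 = "A", ..., slot 3 = "T"
    int slot = 3;
    short stored = (short) ((packed >>> (16 * (3 - slot))) & 0xFFFF);
    float rawScore = stored / 100.0f - 10.0f;           // ~2.30f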
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java
index 79e5b7e58b..fe1b5fe648 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CellBaseBuilder.java
@@ -16,34 +16,145 @@
 package org.opencb.cellbase.lib.builders;

+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectReader;
+import org.apache.commons.lang3.StringUtils;
+import org.opencb.cellbase.core.config.DownloadProperties;
+import org.opencb.cellbase.core.exception.CellBaseException;
+import org.opencb.cellbase.core.models.DataSource;
 import org.opencb.cellbase.core.serializer.CellBaseSerializer;
+import org.opencb.cellbase.lib.EtlCommons;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.opencb.cellbase.lib.EtlCommons.*;
+
 /**
  * Created by imedina on 30/08/14.
  */
 public abstract class CellBaseBuilder {

 protected CellBaseSerializer serializer;

+    protected ObjectReader dataSourceReader = new ObjectMapper().readerFor(DataSource.class);
+
+    protected boolean checked;

 protected Logger logger;

+    public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ...";
+    public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!";
+
+    public static final String BUILDING_LOG_MESSAGE = "Building {} ...";
+    public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!";
+
+    public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ...";
+    public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done!";
+
+    public static final String PARSING_LOG_MESSAGE = "Parsing {} ...";
+    public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!";
+
 public CellBaseBuilder(CellBaseSerializer serializer) {
 logger = LoggerFactory.getLogger(this.getClass());

 this.serializer = serializer;
-        //this.serializer.open();
+
+        this.checked = false;
 }

 public abstract void parse() throws Exception;

 public void disconnect() {
-        try {
-            serializer.close();
-        } catch (Exception e) {
-            logger.error("Disconnecting serializer: " + e.getMessage());
+        if (serializer != null) {
+            try {
+                serializer.close();
+            } catch (Exception e) {
+                logger.error("Error closing serializer:\n" + StringUtils.join(e.getStackTrace(), "\n"));
+            }
+        }
+    }
+
+    protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException {
+        logger.info("Checking file {}/{} ...", getDataName(data), fileId);
+        if (!props.getFiles().containsKey(fileId)) {
+            throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'");
+        }
+        if (!Files.exists(targetPath)) {
+            throw new CellBaseException("Folder does not exist " + targetPath);
+        }
+
+        String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString();
+        Path filePath = targetPath.resolve(filename);
+        if (!Files.exists(filePath)) {
+            throw new CellBaseException(getDataName(data) + " file " + filePath + " does not exist");
 }
+        logger.info("Ok.");
+        return filePath.toFile();
 }

+    protected List<File> checkFiles(String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException {
+        return checkFiles(getDataName(data), data, downloadPath, expectedFiles);
+    }
+
+    protected List<File> checkFiles(String label, String data, Path downloadPath, int expectedFiles) throws CellBaseException, IOException {
+        List<File> files = checkFiles(dataSourceReader.readValue(downloadPath.resolve(getDataVersionFilename(data)).toFile()),
+                downloadPath, label);
+        if (files.size() != expectedFiles) {
+            throw new CellBaseException(expectedFiles + " " + label + " files are expected at " + downloadPath + ", but currently there"
+                    + " are " + files.size() + " files");
+        }
+        return files;
+    }
+
+    protected List<File> checkFiles(DataSource dataSource, Path targetPath, String name) throws CellBaseException {
+        logger.info("Checking {} folder and files ...", name);
+        if (!targetPath.toFile().exists()) {
+            throw new CellBaseException(name + " folder does not exist " + targetPath);
+        }
+
+        List<File> files = new ArrayList<>();
+
+        List<String> filenames = dataSource.getUrls().stream().map(u -> Paths.get(u).getFileName().toString()).collect(Collectors.toList());
+        for (String filename : filenames) {
+            File file = targetPath.resolve(filename).toFile();
+            if (!file.exists()) {
+                throw new CellBaseException("File " + file + " does not exist");
+            } else {
+                files.add(file);
+            }
+        }
+        logger.info("Ok.");
+        return files;
+    }
+
+    protected Path getIndexFastaReferenceGenome(Path fastaPath) throws CellBaseException {
+        Path indexFastaPath = Paths.get(fastaPath + FAI_EXTENSION);
+        if (!Files.exists(indexFastaPath)) {
+            // Index FASTA file
+            logger.info("Indexing FASTA file {} ...", fastaPath);
+            String errorMsg = "Error executing 'samtools faidx' for FASTA file ";
+            try {
+                List<String> params = Arrays.asList("faidx", fastaPath.toString());
+                EtlCommons.runCommandLineProcess(null, "samtools", params, null);
+            } catch (IOException e) {
+                throw new CellBaseException(errorMsg + fastaPath, e);
+            } catch (InterruptedException e) {
+                // Restore interrupted state...
+                Thread.currentThread().interrupt();
+                throw new CellBaseException(errorMsg + fastaPath, e);
+            }
+            if (!Files.exists(indexFastaPath)) {
+                throw new CellBaseException("The FASTA file " + fastaPath + " could not be indexed. Please try to index it manually!");
+            }
+        }
+        return indexFastaPath;
+    }
 }
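getIndexFastaReferenceGenome delegates the actual indexing to samtools via runCommandLineProcess; the command it issues is equivalent to running samtools faidx on the FASTA file, which writes the .fai index next to the input. A usage sketch with a hypothetical path:

    // Hypothetical path, for illustration only
    Path fastaPath = Paths.get("/data/genome/Homo_sapiens.GRCh38.fa");
    EtlCommons.runCommandLineProcess(null, "samtools", Arrays.asList("faidx", fastaPath.toString()), null);
    // On success samtools creates /data/genome/Homo_sapiens.GRCh38.fa.fai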
cannot" + + " be read"); } - /* - * GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse - * this file correctly, so we transform the file into a bedGraph format which is human readable. - */ - Path gerpFolderPath = conservedRegionPath.resolve(EtlCommons.GERP_SUBDIRECTORY); - if (gerpFolderPath.toFile().exists()) { - logger.debug("Parsing GERP data ..."); - gerpParser(gerpFolderPath); - } else { - logger.debug("GERP data not found: " + gerpFolderPath.toString()); + // Check GERP folder and files + Path gerpPath = conservedRegionPath.resolve(GERP_DATA); + DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); + + // Check PhastCons folder and files + Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); + + // Check PhyloP folder and files + Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); + dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); + + // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse + // this file correctly, so we transform the file into a bedGraph format which is human-readable. + if (gerpFiles.size() != 1) { + throw new CellBaseException("Only one " + getDataName(GERP_DATA) + " file is expected, but currently there are " + + gerpFiles.size() + " files"); } + File bigwigFile = gerpFiles.get(0); + File bedgraphFile = Paths.get(gerpFiles.get(0).getAbsolutePath() + ".bedgraph").toFile(); + String exec = "bigWigToBedGraph"; + if (!bedgraphFile.exists()) { + try { + if (isExecutableAvailable(exec)) { + EtlCommons.runCommandLineProcess(null, exec, Arrays.asList(bigwigFile.toString(), bedgraphFile.toString()), null); + } else { + throw new CellBaseException(exec + " not found in your system, install it to build " + getDataName(GERP_DATA) + + ". It is available at http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/"); + } + } catch (IOException e) { + throw new CellBaseException("Error executing " + exec + " in BIGWIG file " + bigwigFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("" + e.getMessage(), e); + } + if (!bedgraphFile.exists()) { + throw new CellBaseException("Something happened when executing " + exec + " in BIGWIG file " + bigwigFile + "; the BED" + + " graph file was not generated. Please, check " + exec); + } + } + gerpParser(bedgraphFile.toPath()); - /* - * UCSC phastCons and phylop are stored in the same format. They are processed together. - */ + // UCSC phastCons and phylop are stored in the same format. They are processed together. 
Map files = new HashMap<>(); String chromosome; Set chromosomes = new HashSet<>(); - // Reading all files in phastCons folder - DirectoryStream directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phastCons"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhastCons filenames + for (File file : phastConsFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phastCons", path); + files.put(chromosome + PHASTCONS_DATA, file.toPath()); } - // Reading all files in phylop folder - directoryStream = Files.newDirectoryStream(conservedRegionPath.resolve("phylop"), "*.wigFix.gz"); - for (Path path : directoryStream) { - chromosome = path.getFileName().toString().split("\\.")[0].replace("chr", ""); + // Process PhyloP filenames + for (File file : phylopFiles) { + chromosome = file.getName().split("\\.")[0].replace("chr", ""); chromosomes.add(chromosome); - files.put(chromosome + "phylop", path); + files.put(chromosome + PHYLOP_DATA, file.toPath()); } - /* - * Now we can iterate over all the chromosomes found and process the files - */ - logger.debug("Chromosomes found '{}'", chromosomes.toString()); + // Now we can iterate over all the chromosomes found and process the files + logger.debug("Chromosomes found '{}'", chromosomes); for (String chr : chromosomes) { - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phastCons")); - processWigFixFile(files.get(chr + "phastCons"), "phastCons"); + logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHASTCONS_DATA)); + processWigFixFile(files.get(chr + PHASTCONS_DATA), PHASTCONS_DATA); - logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + "phylop")); - processWigFixFile(files.get(chr + "phylop"), "phylop"); + logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); + processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } - private void gerpParser(Path gerpFolderPath) throws IOException, CellBaseException { - Path gerpProcessFilePath = gerpFolderPath.resolve(EtlCommons.GERP_PROCESSED_FILE); - logger.info("parsing {}", gerpProcessFilePath); - BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath); - - String line; - int startOfBatch = 0; - int previousEndValue = 0; - String chromosome = null; - String previousChromosomeValue = null; - - List conservationScores = new ArrayList<>(chunkSize); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - - // file is wrong. throw an exception instead? 
-                if (fields.length != 4) {
-                    logger.error("skipping invalid line: " + line.length());
-                    continue;
-                }
+    private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException {
+        logger.info(PARSING_LOG_MESSAGE, gerpProcessFilePath);
-                chromosome = fields[0];
+        try (BufferedReader bufferedReader = FileUtils.newBufferedReader(gerpProcessFilePath)) {
+            String line;
+            int startOfBatch = 0;
+            int previousEndValue = 0;
+            String chromosome = null;
+            String previousChromosomeValue = null;
-                // new chromosome, store batch
-                if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) {
-                    storeScores(startOfBatch, previousChromosomeValue, conservationScores);
+            List<Float> conservationScores = new ArrayList<>(chunkSize);
+            while ((line = bufferedReader.readLine()) != null) {
+                String[] fields = line.split("\t");
-                    // reset values for current batch
-                    startOfBatch = 0;
-                }
+                // Checking line
+                if (fields.length != 4) {
+                    throw new CellBaseException("Invalid " + getDataName(GERP_DATA) + " line (expecting 4 columns): " + fields.length
+                            + " items: " + line);
+                }
-                // reset chromosome for next entry
-                previousChromosomeValue = chromosome;
+                chromosome = fields[0];
-                // file is american! starts at zero, add one
-                int start = Integer.parseInt(fields[1]) + 1;
-                // inclusive
-                int end = Integer.parseInt(fields[2]) + 1;
+                // New chromosome, store batch
+                if (previousChromosomeValue != null && !previousChromosomeValue.equals(chromosome)) {
+                    storeScores(startOfBatch, previousChromosomeValue, conservationScores);
-                // start coordinate for this batch of 2,000
-                if (startOfBatch == 0) {
-                    startOfBatch = start;
-                    previousEndValue = 0;
-                }
+                    // Reset values for current batch
+                    startOfBatch = 0;
+                }
-                // if there is a gap between the last entry and this one.
-                if (previousEndValue != 0 && (start - previousEndValue) != 0) {
-                    // gap is too big! store what we already have before processing more
-                    if (start - previousEndValue >= chunkSize) {
-                        // we have a full batch, store
-                        storeScores(startOfBatch, chromosome, conservationScores);
+                // Reset chromosome for next entry
+                previousChromosomeValue = chromosome;
-                        // reset batch to start at this record
+                // bedGraph is 0-based: starts at zero, add one to convert to 1-based
+                int start = Integer.parseInt(fields[1]) + 1;
+                // Exclusive upper bound (the bedGraph end coordinate is already exclusive)
+                int end = Integer.parseInt(fields[2]) + 1;
+
+                // Start coordinate for this batch of 2,000
+                if (startOfBatch == 0) {
                     startOfBatch = start;
-                } else {
-                    // fill in the gap with zeroes
-                    // don't overfill the batch
-                    while (previousEndValue < start && conservationScores.size() < chunkSize) {
-                        conservationScores.add((float) 0);
-                        previousEndValue++;
+                    previousEndValue = 0;
+                }
+
+                // If there is a gap between the last entry and this one
+                if (previousEndValue != 0 && (start - previousEndValue) != 0) {
+                    // Gap is too big! store what we already have before processing more
+                    if (start - previousEndValue >= chunkSize) {
+                        // We have a full batch, store
+                        storeScores(startOfBatch, chromosome, conservationScores);
+
+                        // Reset batch to start at this record
+                        startOfBatch = start;
+                    } else {
+                        // Fill in the gap with zeroes, don't overfill the batch
+                        while (previousEndValue < start && conservationScores.size() < chunkSize) {
+                            conservationScores.add((float) 0);
+                            previousEndValue++;
+                        }
+
+                        // We have a full batch, store
+                        if (conservationScores.size() == chunkSize) {
+                            storeScores(startOfBatch, chromosome, conservationScores);
+
+                            // Reset: start a new batch
+                            startOfBatch = start;
+                        }
+                    }
+                }
-                        // we have a full batch, store
+                // Reset value
+                previousEndValue = end;
+
+                // Score for these coordinates
+                String score = fields[3];
+
+                // Add the score for each coordinate included in the range start-end
+                while (start < end) {
+                    // We have a full batch: store
                     if (conservationScores.size() == chunkSize) {
                         storeScores(startOfBatch, chromosome, conservationScores);
-                        // reset. start a new batch
+                        // Reset: start a new batch
                         startOfBatch = start;
                     }
-                }
-            }
-            // reset value
-            previousEndValue = end;
+                    // Add score to batch
+                    conservationScores.add(Float.valueOf(score));
-            // score for these coordinates
-            String score = fields[3];
+                    // Increment coordinate
+                    start++;
+                }
-            // add the score for each coordinate included in the range start-end
-            while (start < end) {
-                // we have a full batch, store
+                // We have a full batch: store
                 if (conservationScores.size() == chunkSize) {
                     storeScores(startOfBatch, chromosome, conservationScores);
-                    // reset. start a new batch
-                    startOfBatch = start;
+                    // Reset: start a new batch
+                    startOfBatch = 0;
                 }
-
-                // add score to batch
-                conservationScores.add(Float.valueOf(score));
-
-                // increment coordinate
-                start++;
             }
-
-            // we have a full batch, store
-            if (conservationScores.size() == chunkSize) {
+            // We need to serialize the last chunk that might be incomplete
+            if (!conservationScores.isEmpty()) {
                 storeScores(startOfBatch, chromosome, conservationScores);
-
-                // reset, start a new batch
-                startOfBatch = 0;
             }
         }
-        // we need to serialize the last chunk that might be incomplete
-        if (!conservationScores.isEmpty()) {
-            storeScores(startOfBatch, chromosome, conservationScores);
-        }
-        bufferedReader.close();
+
+        logger.info(PARSING_DONE_LOG_MESSAGE, gerpProcessFilePath);
    }

    private void storeScores(int startOfBatch, String chromosome, List<Float> conservationScores) throws CellBaseException {
-        // if this is a small batch, fill in the missing coordinates with 0
+        // If this is a small batch, fill in the missing coordinates with 0
        while (conservationScores.size() < chunkSize) {
            conservationScores.add((float) 0);
        }
        if (conservationScores.size() != chunkSize) {
-            throw new CellBaseException("invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch);
+            throw new CellBaseException("Invalid chunk size " + conservationScores.size() + " for " + chromosome + ":" + startOfBatch);
        }
-        GenomicScoreRegion<Float> conservationScoreRegion = new GenomicScoreRegion(chromosome, startOfBatch,
-                startOfBatch + conservationScores.size() - 1, "gerp", conservationScores);
+        GenomicScoreRegion<Float> conservationScoreRegion = new GenomicScoreRegion<>(chromosome, startOfBatch,
+                startOfBatch + conservationScores.size() - 1, GERP_DATA, conservationScores);
        fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome));
-        // reset
+        // Reset
        conservationScores.clear();
    }

-// @Deprecated
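(The commented-out parser removed below predates this bedGraph-based approach.) To make the coordinate handling in gerpParser above concrete: bedGraph records are 0-based and half-open, so the parser shifts both bounds by one and then repeats the score once per covered base. A self-contained sketch of that arithmetic:

public class GerpCoordinatesExample {
    public static void main(String[] args) {
        // A bedGraph record "1  999  1002  2.5" is 0-based and half-open,
        // i.e. it scores 1-based positions 1000, 1001 and 1002
        int bedStart = 999;
        int bedEnd = 1002;
        float score = 2.5f;
        int start = bedStart + 1; // first 1-based position covered: 1000
        int end = bedEnd + 1;     // exclusive bound, mirroring gerpParser's while (start < end)
        for (int pos = start; pos < end; pos++) {
            System.out.println("position " + pos + " -> " + score);
        }
    }
}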
-// private void gerpParser(Path gerpFolderPath) throws IOException, InterruptedException { -// logger.info("Uncompressing {}", gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// List tarArgs = Arrays.asList("-xvzf", gerpFolderPath.resolve(EtlCommons.GERP_FILE).toString(), -// "--overwrite", "-C", gerpFolderPath.toString()); -// EtlCommons.runCommandLineProcess(null, "tar", tarArgs, null); -// -// DirectoryStream pathDirectoryStream = Files.newDirectoryStream(gerpFolderPath, "*.rates"); -// boolean filesFound = false; -// for (Path path : pathDirectoryStream) { -// filesFound = true; -// logger.info("Processing file '{}'", path.getFileName().toString()); -// String[] chromosome = path.getFileName().toString().replaceFirst("chr", "").split("\\."); -// BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(String.valueOf(path)))); -// String line; -// int start = 1; -// int end = 1999; -// int counter = 1; -// String[] fields; -// List val = new ArrayList<>(chunkSize); -// while ((line = bufferedReader.readLine()) != null) { -// fields = line.split("\t"); -// val.add(Float.valueOf(fields[1])); -// counter++; -// if (counter == chunkSize) { -//// ConservationScoreRegion conservationScoreRegion = new ConservationScoreRegion(chromosome[0], start, end, "gerp", -// val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, end, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome[0])); -// -// start = end + 1; -// end += chunkSize; -// -// counter = 0; -// val.clear(); -// } -// } -// -// // we need to serialize the last chunk that might be incomplete -//// ConservationScoreRegion conservationScoreRegion = -//// new ConservationScoreRegion(chromosome[0], start, start + val.size() - 1, "gerp", val); -// GenomicScoreRegion conservationScoreRegion = -// new GenomicScoreRegion<>(chromosome[0], start, start + val.size() - 1, "gerp", val); -// fileSerializer.serialize(conservationScoreRegion, getOutputFileName(chromosome[0])); -// -// bufferedReader.close(); -// } -// -// if (!filesFound) { -// logger.warn("No GERP++ files were found. 
Please check that the original file {} is there, that it was" -// + " properly decompressed and that the *.rates files are present", -// gerpFolderPath.resolve(EtlCommons.GERP_FILE)); -// } -// } - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { - BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath); - - String line; - String chromosome = ""; -// int start = 0, end = 0; - int start = 0; - float value; - Map attributes = new HashMap<>(); -// ConservedRegion conservedRegion = null; - List values = new ArrayList<>(); -// ConservationScoreRegion conservedRegion = null; - GenomicScoreRegion conservedRegion = null; - - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("fixedStep")) { - //new group, save last - if (conservedRegion != null) { -// conservedRegion.setEnd(end); -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - } + logger.info(PARSING_LOG_MESSAGE, inGzPath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { + + String line; + String chromosome = ""; + int start = 0; + float value; + Map attributes = new HashMap<>(); + List values = new ArrayList<>(); + GenomicScoreRegion conservedRegion = null; + + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("fixedStep")) { + // New group, save last + if (conservedRegion != null) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, + conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } -// offset = 0; - attributes.clear(); - String[] attrFields = line.split(" "); - String[] attrKeyValue; - for (String attrField : attrFields) { - if (!attrField.equalsIgnoreCase("fixedStep")) { - attrKeyValue = attrField.split("="); - attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + attributes.clear(); + String[] attrFields = line.split(" "); + String[] attrKeyValue; + for (String attrField : attrFields) { + if (!attrField.equalsIgnoreCase("fixedStep")) { + attrKeyValue = attrField.split("="); + attributes.put(attrKeyValue[0].toLowerCase(), attrKeyValue[1]); + } } - } - chromosome = formatChromosome(attributes); - start = Integer.parseInt(attributes.get("start")); -// end = Integer.parseInt(attributes.get("start")); - - values = new ArrayList<>(2000); - } else { - int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; -// end++; - int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; - // This is the endChunk if current read score is - // appended to the array (otherwise it would be - // start + values.size() - 1). If this endChunk is - // different from the startChunk means that current - // conserved region must be dumped and current - // score must be associated to next chunk. Main - // difference to what there was before is that if - // the fixedStep starts on the last position of a - // chunk e.g. 
1999, the chunk must be created with - // just that score - the chunk was left empty with - // the old code - if (startChunk != endChunk) { -// conservedRegion = new ConservationScoreRegion(chromosome, start, end - 1, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, - conservationSource, values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - start = start + values.size(); - values.clear(); - } + chromosome = formatChromosome(attributes); + start = Integer.parseInt(attributes.get("start")); - value = Float.parseFloat(line.trim()); - values.add(value); + values = new ArrayList<>(2000); + } else { + int startChunk = start / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + int endChunk = (start + values.size()) / MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; + // This is the endChunk if current read score is appended to the array (otherwise it would be start + values.size() + // - 1). If this endChunk is different from the startChunk means that current conserved region must be dumped and + // current score must be associated to next chunk. Main difference to what there was before is that if the fixedStep + // starts on the last position of a chunk e.g. 1999, the chunk must be created with just that score - the chunk was + // left empty with the old code + if (startChunk != endChunk) { + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, + values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + start = start + values.size(); + values.clear(); + } + + value = Float.parseFloat(line.trim()); + values.add(value); + } } + + // Write last + conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); + fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); } - //write last -// conservedRegion = new ConservationScoreRegion(chromosome, start, end, conservationSource, values); - conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, - values); - fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); - bufferedReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } private String getOutputFileName(String chromosome) { @@ -379,13 +341,18 @@ private String getOutputFileName(String chromosome) { } String outputFileName = outputFileNames.get(chromosome); if (outputFileName == null) { - outputFileName = "conservation_" + chromosome; + outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); } return outputFileName; } - // phylop and phastcons list the chromosome as M instead of the standard MT. replace. + /** + * Remove chr from the chromosome name; and phylop and phastcons list the chromosome as M instead of the standard MT, replace it. 
+ * + * @param attributes Attributes map with the chromosome name + * @return The new chromosome name + */ private String formatChromosome(Map attributes) { String chromosome = attributes.get("chrom").replace("chr", ""); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java new file mode 100644 index 0000000000..d6b935fa52 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -0,0 +1,956 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import htsjdk.tribble.readers.TabixReader; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.formats.feature.gff.Gff2; +import org.opencb.biodata.formats.feature.gtf.Gtf; +import org.opencb.biodata.formats.feature.gtf.io.GtfReader; +import org.opencb.biodata.formats.io.FileFormatException; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.tools.sequence.FastaIndex; +import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.SpeciesConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class EnsemblGeneBuilder extends CellBaseBuilder { + + private Path downloadPath; + private SpeciesConfiguration speciesConfiguration; + private boolean flexibleGTFParsing; + private CellBaseConfiguration configuration; + + private Map transcriptDict; + private Map exonDict; + + private Path gtfFile; + private Path proteinFastaFile; + private Path cDnaFastaFile; + private Path geneDescriptionFile; + private Path xrefsFile; + private Path hgncFile; + private Path maneFile; + private Path lrgFile; + private Path uniprotIdMappingFile; + private Path tfbsFile; + private Path tabixFile; + private Path geneExpressionFile; + private Path geneDrugFile; + private Path hpoFile; + private Path disgenetFile; + private Path genomeSequenceFilePath; + private Path gnomadFile; + private Path geneOntologyAnnotationFile; + private Path miRBaseFile; + private Path miRTarBaseFile; + private Path cancerGeneCensusFile; + private Path cancerHostpotFile; + private Path ensemblCanonicalFile; + private Path tso500File; + private Path eglhHaemOncFile; + + // source for genes is either ensembl or refseq + private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); + + private int geneCounter; + private ArrayList geneList; + private String geneName; + private 
int transcriptCounter; + private ArrayList transcriptList; + private String transcriptName; + private int exonCounter; + private String feature; + private Gtf nextGtfToReturn; + + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, + CellBaseSerializer serializer) { + super(serializer); + + this.downloadPath = downloadPath; + this.speciesConfiguration = speciesConfiguration; + this.flexibleGTFParsing = flexibleGTFParsing; + + transcriptDict = new HashMap<>(250000); + exonDict = new HashMap<>(8000000); + } + + public void check() throws Exception { + if (checked) { + return; + } + + String ensemblGeneLabel = getDataName(ENSEMBL_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); + + // Sanity check + checkDirectory(downloadPath, ensemblGeneLabel); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check Ensembl files + List files = checkFiles(ensemblGeneLabel, ENSEMBL_DATA, downloadPath, 3); + gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath(); + proteinFastaFile = files.stream().filter(f -> f.getName().contains(".pep.all.fa")).findFirst().get().toPath(); + cDnaFastaFile = files.stream().filter(f -> f.getName().contains(".cdna.all.fa")).findFirst().get().toPath(); + + // Check common files + // geneDescriptionFile = + // xrefsFile = + maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1); + disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + // ensemblCanonicalFile = ; + // cancerGeneCensus = + // tso500File = + // eglhHaemOncFile = + + // Check regulation files + // Motif features + files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 2); + if (files.get(0).getName().endsWith("tbi")) { + tabixFile = files.get(0).toPath(); + tfbsFile = files.get(1).toPath(); + } else { + tabixFile = files.get(1).toPath(); + tfbsFile = files.get(0).toPath(); + } + // mirbase + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + + // mirtarbase + // The downloaded .xlsx file contains errors and it has to be fixed manually + logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA)); + Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA); + List mirTarBaseFiles = ((DataSource) 
dataSourceReader.readValue(downloadRegulationPath.resolve(
+                getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString())
+                .collect(Collectors.toList());
+        if (mirTarBaseFiles.size() != 1) {
+            throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath
+                    + ", but currently there are " + mirTarBaseFiles.size() + " files");
+        }
+        // The hsa_MTI.xlsx is fixed and converted to hsa_MTI.csv manually
+        if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) {
+            throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at "
+                    + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0));
+        }
+        miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION));
+        if (!Files.exists(miRTarBaseFile)) {
+            throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist");
+        }
+
+        // Check genome fasta file
+        genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath();
+
+        logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel);
+        checked = true;
+    }
+
+    public void parse() throws Exception {
+        check();
+
+        Gene gene = null;
+        Transcript transcript;
+        Exon exon = null;
+        int cdna = 1;
+        int cds = 1;
+
+        EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir());
+
+        try {
+            // Process files and put values in RocksDB
+            indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile,
+                    proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile,
+                    geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile,
+                    miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile,
+                    tso500File, eglhHaemOncFile);
+
+            TabixReader tabixReader = null;
+            if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) {
+                logger.error("Tfbs or tabix file not found.
Download them and try again."); + } else { + tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); + } + + // Preparing the fasta file for fast accessing +// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); + FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); + + // Empty transcript and exon dictionaries + transcriptDict.clear(); + exonDict.clear(); + + logger.info(PARSING_LOG_MESSAGE, gtfFile); + GtfReader gtfReader = new GtfReader(gtfFile); + + // Gene->Transcript->Feature->GTF line + Map>> gtfMap = null; + if (flexibleGTFParsing) { + gtfMap = loadGTFMap(gtfReader); + initializePointers(gtfMap); + } + + Gtf gtf; + while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { + + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + String geneId = gtf.getAttributes().get("gene_id"); + String transcriptId = gtf.getAttributes().get("transcript_id"); + String geneName = gtf.getAttributes().get("gene_name"); + if (newGene(gene, geneId)) { + // If new geneId is different from the current then we must serialize before data new gene + if (gene != null) { + serializer.serialize(gene); + } + + GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), + indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), + indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); + + gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), + gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), + new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); + } + + // Check if Transcript exist in the Gene Set of transcripts + if (!transcriptDict.containsKey(transcriptId)) { + transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); + } else { + transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); + } + + // At this point gene and transcript objects are set up + // Update gene and transcript genomic coordinates, start must be the + // lower, and end the higher + updateTranscriptAndGeneCoords(transcript, gene, gtf); + + String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; + if (gtf.getFeature().equalsIgnoreCase("exon")) { + // Obtaining the exon sequence + String exonId = gtf.getAttributes().get("exon_id") + "." 
+ gtf.getAttributes().get("exon_version"); + String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); + + exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf + .getAttributes().get("exon_number")), exonSequence); + transcript.getExons().add(exon); + + exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); + if (gtf.getAttributes().get("exon_number").equals("1")) { + cdna = 1; + cds = 1; + } else { + // with every exon we update cDNA length with the previous exon length + cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() + - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; + } + } else { + exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); + if (gtf.getFeature().equalsIgnoreCase("CDS")) { + // Protein ID is only present in CDS lines + String proteinId = gtf.getAttributes().get("protein_id") != null + ? gtf.getAttributes().get("protein_id") + "." + gtf.getAttributes().get("protein_version") + : ""; + transcript.setProteinId(proteinId); + transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); + + if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + + // cDNA coordinates + exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); + } + // strand - + } else { + // CDS states the beginning of coding start + exon.setGenomicCodingStart(gtf.getStart()); + exon.setGenomicCodingEnd(gtf.getEnd()); + // cDNA coordinates + // cdnaCodingStart points to the same base position than genomicCodingEnd + exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + // Set cdnaCodingEnd to prevent those cases without stop_codon + transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + exon.setCdsStart(cds); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // increment in the coding length + cds += gtf.getEnd() - gtf.getStart() + 1; + transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon + exon.setPhase(Integer.parseInt(gtf.getFrame())); + + if (transcript.getGenomicCodingStart() == 0 || 
transcript.getGenomicCodingStart() > gtf.getStart()) { + transcript.setGenomicCodingStart(gtf.getStart()); + } + if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { + transcript.setGenomicCodingEnd(gtf.getEnd()); + } + // only first time + if (transcript.getCdnaCodingStart() == 0) { + // cdnaCodingStart points to the same base position than genomicCodingEnd + transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); + } + } + + } +// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { +// // nothing to do +// System.out.println("Empty block, this should be redesigned"); +// } + if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { + // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, + // no need to set it at the beginning of next feature + if (exon.getStrand().equals("+")) { + updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingEnd(gtf.getEnd()); + transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + transcript.setCdsLength(cds - 1); + + } else { + updateNegativeExonCodingData(exon, cdna, cds, gtf); + + cds += gtf.getEnd() - gtf.getStart(); + // If stop_codon appears, overwrite values + transcript.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + transcript.setCdsLength(cds - 1); + } + } + } + } + + // last gene must be serialized + serializer.serialize(gene); + + // Close + gtfReader.close(); + serializer.close(); + fastaIndex.close(); + indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); + } catch (Exception e) { + indexer.close(); + throw e; + } + } + + private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) + throws IOException, RocksDBException { + Map gtfAttributes = gtf.getAttributes(); + + // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. + String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); + String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; + String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); + List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); + + List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); + TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); + + Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, + gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", + 0, 0, 0, 0, 0, + indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", + gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, + new HashSet<>(), transcriptAnnotation); + + // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL + // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. 
We must ensure + // that the xrefs array contains all ids present in the GTF file + addGtfXrefs(transcript, gene, gtfAttributes); + + // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID + String hgncId = indexer.getHgncId(gene.getName()); + if (StringUtils.isNotEmpty(hgncId)) { + transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); + } + + // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE + for (String suffix: Arrays.asList("refseq", "refseq_protein")) { + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); + if (StringUtils.isNotEmpty(maneRefSeq)) { + transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, + "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); + } + } + + // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG + String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(lrgRefSeq)) { + transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); + } + + // Add Flags + // 1. GTF tags + String tags = gtf.getAttributes().get("tag"); + if (StringUtils.isNotEmpty(tags)) { + transcript.getFlags().addAll(Arrays.asList(tags.split(","))); + } + // 2. TSL + String supportLevel = gtfAttributes.get("transcript_support_level"); + if (StringUtils.isNotEmpty(supportLevel)) { + // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" + String truncatedSupportLevel = supportLevel.split(" ")[0]; + transcript.getFlags().add("TSL:" + truncatedSupportLevel); + } + // 3. MANE Flag + String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); + if (StringUtils.isNotEmpty(maneFlag)) { + transcript.getFlags().add(maneFlag); + } + // 4. LRG Flag + String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); + if (StringUtils.isNotEmpty(lrg)) { + transcript.getFlags().add("LRG"); + } else { + for (Xref xref : transcript.getXrefs()) { + if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { + transcript.getFlags().add("LRG"); + } + } + } + // 5. Ensembl Canonical + String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); + if (StringUtils.isNotEmpty(canonicalFlag)) { + transcript.getFlags().add(canonicalFlag); + } + + // 6. TSO500 and EGLH HaemOnc + String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); + if (StringUtils.isNotEmpty(maneRefSeq)) { + String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(tso500Flag)) { + transcript.getFlags().add(tso500Flag); + } + + String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); + if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { + transcript.getFlags().add(eglhHaemOncFlag); + } + } + + gene.getTranscripts().add(transcript); + + // Do not change order!! 
size()-1 is the index of the transcript ID + transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); + return transcript; + } + + private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) + throws IOException, RocksDBException { + if (xrefs == null || indexer == null) { + return null; + } + List annotations = new ArrayList<>(); + for (Xref xref : xrefs) { + if (xref.getDbName().equals("uniprotkb_acc")) { + String key = xref.getId(); + if (key != null && indexer.getOntologyAnnotations(key) != null) { + annotations.addAll(indexer.getOntologyAnnotations(key)); + } + } + } + return annotations; + } + + private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. + exon.setGenomicCodingStart(gtf.getStart()); + // cdnaCodingEnd points to the same base position than genomicCodingStart + exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingEnd() == 0) { + exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { + // we need to increment 3 nts, the stop_codon length. + exon.setGenomicCodingEnd(gtf.getEnd()); + exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); + exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + + // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined + // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding + // starts + if (exon.getGenomicCodingStart() == 0) { + exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); + } + if (exon.getCdnaCodingStart() == 0) { + exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); + } + if (exon.getCdsStart() == 0) { + exon.setCdsStart(exon.getCdsEnd() - 2); + } + } + + private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { + if (transcript.getXrefs() == null) { + transcript.setXrefs(new ArrayList<>()); + } + + transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); + transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); + + // Some non-coding genes do not have Gene names + if (StringUtils.isNotEmpty(gene.getName())) { + transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); + transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); + } + + if (gtfAttributes.get("ccds_id") != null) { + transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); + } + } + + private void initializePointers(Map>> gtfMap) { + geneCounter = 0; + geneList = new ArrayList<>(gtfMap.keySet()); + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) 
gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + + private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { + // Flexible parsing is deactivated, return next line + if (gtfMap == null) { + return gtfReader.read(); + // Flexible parsing activated, carefully select next line to return + } else { + // No more genes/features to return + if (nextGtfToReturn == null) { + return null; + } + Gtf gtfToReturn = nextGtfToReturn; + if (feature.equals("exon")) { +// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { + nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) + .get(transcriptName).get("exon")).get(exonCounter)).getStart(), + ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), + (List) gtfMap.get(geneName).get(transcriptName).get("cds")); + if (nextGtfToReturn != null) { + feature = "cds"; + return gtfToReturn; + } + } + // if no cds was found for this exon, get next exon + getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("cds") || feature.equals("stop_codon")) { + getFeatureFollowsExon(gtfMap); + return gtfToReturn; + } + if (feature.equals("start_codon")) { + feature = "stop_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); + return gtfToReturn; + } + // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon + throw new FileFormatException("Execution cannot reach this point"); + } + } + + private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { + for (Object cdsObject : cdsList) { + int cdsStart = ((Gtf) cdsObject).getStart(); + int cdsEnd = ((Gtf) cdsObject).getEnd(); + if (cdsStart <= exonEnd && cdsEnd >= exonStart) { + return (Gtf) cdsObject; + } + } + return null; + } + + private void getFeatureFollowsExon(Map>> gtfMap) { + exonCounter++; + if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() + || feature.equals("stop_codon")) { + // If last returned feature was a stop_codon or no start_codon is provided for this transcript, + // next transcript must be selected + if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { + feature = "start_codon"; + nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); + } else { + transcriptCounter++; + // No more transcripts in this gene, check if there are more genes + if (transcriptCounter == gtfMap.get(geneName).size()) { + geneCounter++; + // No more genes available, end parsing + if (geneCounter == gtfMap.size()) { + nextGtfToReturn = null; + feature = null; + // Still more genes to parse, select next one + } else { + geneName = geneList.get(geneCounter); + transcriptCounter = 0; + transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); + } + } + // Check if a new gene was selected - null would indicate there're no more genes + if (nextGtfToReturn != null) { + transcriptName = transcriptList.get(transcriptCounter); + exonCounter = 0; + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + } else { + feature = "exon"; + nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); + } + } + + private Map>> loadGTFMap(GtfReader 
gtfReader) throws FileFormatException { + Map>> gtfMap = new HashMap<>(); + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") + || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { + continue; + } + + // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene + String geneId = gtf.getAttributes().get("gene_id"); + // Transcript -> feature -> GTF line + Map> gtfMapGeneEntry; + if (gtfMap.containsKey(geneId)) { + gtfMapGeneEntry = gtfMap.get(geneId); + } else { + gtfMapGeneEntry = new HashMap(); + gtfMap.put(geneId, gtfMapGeneEntry); + } + + // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene + String transcriptId = gtf.getAttributes().get("transcript_id"); + Map gtfMapTranscriptEntry; + if (gtfMapGeneEntry.containsKey(transcriptId)) { + gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); + } else { + gtfMapTranscriptEntry = new HashMap(); + gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); + } + + addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); + + } + + // Exon number is mandatory for the parser to be able to properly generate the gene data model + if (!exonNumberPresent(gtfMap)) { + setExonNumber(gtfMap); + } + + return gtfMap; + } + + private boolean exonNumberPresent(Map>> gtfMap) { + Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); + return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) + .getAttributes().containsKey("exon_number"); + } + + private void setExonNumber(Map>> gtfMap) { + for (String gene : gtfMap.keySet()) { + for (String transcript : gtfMap.get(gene).keySet()) { + List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); + Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); + if (exonList.get(0).getStrand().equals("+")) { + int exonNumber = 1; + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber++; + } + } else { + int exonNumber = exonList.size(); + for (Gtf gtf : exonList) { + gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); + exonNumber--; + } + } + } + } + } + + private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { + // Add exon/cds GTF line to the corresponding gene entry in the map + String featureType = gtf.getFeature().toLowerCase(); + if (featureType.equals("exon") || featureType.equals("cds")) { + List gtfList; + // Check if there were exons already stored + if (gtfMapTranscriptEntry.containsKey(featureType)) { + gtfList = (List) gtfMapTranscriptEntry.get(featureType); + } else { + gtfList = new ArrayList<>(); + gtfMapTranscriptEntry.put(featureType, gtfList); + } + gtfList.add(gtf); + // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" + // keys are already there + } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { + gtfMapTranscriptEntry.put(featureType, gtf); + } + } + + private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { + if (tabixReader == null) { + return null; + } + List transcriptTfbses = null; + + int transcriptStart = transcript.getStart(); + int transcriptEnd = transcript.getEnd(); + + + String line; + TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, 
transcriptEnd);
+        while ((line = iter.next()) != null) {
+            String[] elements = line.split("\t");
+
+            String sequenceName = elements[0];
+            String source = elements[1];
+            String feature = elements[2];
+            int start = Integer.parseInt(elements[3]);
+            int end = Integer.parseInt(elements[4]);
+            String score = elements[5];
+            String strand = elements[6];
+            String frame = elements[7];
+            String attribute = elements[8];
+
+            if (strand.equals(transcript.getStrand())) {
+                continue;
+            }
+
+            if (transcript.getStrand().equals("+")) {
+                if (start > transcript.getStart() + 500) {
+                    break;
+                } else if (end > transcript.getStart() - 2500) {
+                    Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute);
+                    transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses);
+                }
+            } else {
+                // Transcript in negative strand
+                if (start > transcript.getEnd() + 2500) {
+                    break;
+                } else if (start > transcript.getEnd() - 500) {
+                    Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute);
+                    transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses);
+                }
+            }
+        }
+
+        return transcriptTfbses;
+    }
+
+    protected List<TranscriptTfbs> addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome,
+                                                           List<TranscriptTfbs> transcriptTfbses) {
+        if (transcriptTfbses == null) {
+            transcriptTfbses = new ArrayList<>();
+        }
+
+        // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116;
+        // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB
+        String[] attributes = tfbs.getAttribute().split(";");
+
+        String id = null;
+        String pfmId = null;
+        List<String> transcriptionFactors = null;
+
+        for (String attributePair : attributes) {
+            String[] attributePairArray = attributePair.split("=");
+            switch (attributePairArray[0]) {
+                case "binding_matrix_stable_id":
+                    pfmId = attributePairArray[1];
+                    break;
+                case "stable_id":
+                    id = attributePairArray[1];
+                    break;
+                case "transcription_factor_complex":
+                    transcriptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)"));
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transcriptionFactors, chromosome, tfbs.getStart(),
+                tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript),
+                Float.parseFloat(tfbs.getScore())));
+        return transcriptTfbses;
+    }
+
+    private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) {
+        Integer relativeStart;
+        if (transcript.getStrand().equals("+")) {
+            if (tfbs.getStart() < transcript.getStart()) {
+                relativeStart = tfbs.getStart() - transcript.getStart();
+            } else {
+                relativeStart = tfbs.getStart() - transcript.getStart() + 1;
+            }
+        } else {
+            // Negative strand transcript
+            if (tfbs.getEnd() > transcript.getEnd()) {
+                relativeStart = transcript.getEnd() - tfbs.getEnd();
+            } else {
+                relativeStart = transcript.getEnd() - tfbs.getEnd() + 1;
+            }
+        }
+        return relativeStart;
+    }
+
+    private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) {
+        Integer relativeEnd;
+        if (transcript.getStrand().equals("+")) {
+            if (tfbs.getEnd() < transcript.getStart()) {
+                relativeEnd = tfbs.getEnd() - transcript.getStart();
+            } else {
+                relativeEnd = tfbs.getEnd() - transcript.getStart() + 1;
+            }
+        } else {
+            if (tfbs.getStart() > transcript.getEnd()) {
+                relativeEnd = transcript.getEnd() - tfbs.getStart();
+            } else {
+                relativeEnd = transcript.getEnd() -
tfbs.getStart() + 1; + } + } + return relativeEnd; + } + + + + private boolean newGene(Gene previousGene, String newGeneId) { + return previousGene == null || !newGeneId.equals(previousGene.getId()); + } + + private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { + if (transcript.getStart() > gtf.getStart()) { + transcript.setStart(gtf.getStart()); + } + if (transcript.getEnd() < gtf.getEnd()) { + transcript.setEnd(gtf.getEnd()); + } + if (gene.getStart() > gtf.getStart()) { + gene.setStart(gtf.getStart()); + } + if (gene.getEnd() < gtf.getEnd()) { + gene.setEnd(gtf.getEnd()); + } + } + + private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { + gtfFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { + proteinFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } + + private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { + for (String fileName : geneDirectoryPath.toFile().list()) { + if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { + cDnaFastaFile = geneDirectoryPath.resolve(fileName); + break; + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index fb67c19b8b..10f54e2ea1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -16,27 +16,44 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.commons.lang3.StringUtils; -import org.apache.poi.hssf.usermodel.HSSFSheet; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParser; +import org.opencb.biodata.formats.feature.mirbase.MirBaseParserCallback; import org.opencb.biodata.formats.gaf.GafParser; import org.opencb.biodata.formats.io.FileFormatException; +import org.opencb.biodata.models.core.FeatureOntologyTermAnnotation; +import org.opencb.biodata.models.core.MiRnaGene; +import org.opencb.biodata.models.core.MirnaTarget; import org.opencb.biodata.models.core.Xref; -import org.opencb.biodata.models.core.*; -import org.opencb.biodata.models.variant.avro.*; +import org.opencb.biodata.models.variant.avro.Constraint; +import org.opencb.biodata.models.variant.avro.Expression; +import org.opencb.biodata.models.variant.avro.ExpressionCall; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; +import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; 
+import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.zip.GZIPInputStream; -public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + +public class EnsemblGeneBuilderIndexer extends GeneBuilderIndexer { private static final String DESCRIPTION_SUFFIX = "_description"; private static final String XREF_SUFFIX = "_xref"; @@ -56,12 +73,12 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile, Path tso500File, Path eglhHaemOncFile) - throws IOException, RocksDBException, FileFormatException { - indexDescriptions(geneDescriptionFile); - indexXrefs(xrefsFile, uniprotIdMappingFile); + throws IOException, RocksDBException, FileFormatException, CellBaseException { +// indexDescriptions(geneDescriptionFile); +// indexXrefs(xrefsFile, uniprotIdMappingFile); indexHgncIdMapping(hgncFile); - indexManeMapping(maneFile, "ensembl"); - indexLrgMapping(lrgFile, "ensembl"); + indexManeMapping(maneFile, ENSEMBL_DATA); + indexLrgMapping(lrgFile, ENSEMBL_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexExpression(species, geneExpressionFile); @@ -69,13 +86,13 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexDiseases(hpoFile, disgenetFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); - indexMiRBase(miRBaseFile); + indexMiRBase(species, miRBaseFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensusFile); +// indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); - indexCanonical(canonicalFile); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); +// indexCanonical(canonicalFile); +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { @@ -233,129 +250,6 @@ public List getExpression(String id) throws RocksDBException, IOExce return rocksDbManager.getExpression(rocksdb, key); } - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } 
- - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - String line; - - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - } - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Collections.singletonList(numberOfSNPs), Collections.singletonList(source), - "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - } - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", gnomadFile); @@ -384,7 +278,7 @@ private void indexConstraints(Path gnomadFile) throws IOException, RocksDBExcept rocksDbManager.update(rocksdb, transcriptIdentifier + CONSTRAINT_SUFFIX, constraints); if 
("TRUE".equalsIgnoreCase(canonical)) { - rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); + rocksDbManager.update(rocksdb, geneIdentifier + CONSTRAINT_SUFFIX, constraints); } } br.close(); @@ -432,66 +326,13 @@ public List getOntologyAnnotations(String id) thr return rocksDbManager.getOntologyAnnotations(rocksdb, key); } - private void indexMiRBase(Path miRBaseFile) throws IOException, RocksDBException { - if (miRBaseFile != null && Files.exists(miRBaseFile) && Files.size(miRBaseFile) > 0) { - logger.info("Loading mirna from '{}'", miRBaseFile); - FileInputStream fileInputStream = new FileInputStream(miRBaseFile.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - org.apache.poi.ss.usermodel.Cell cell = cellIterator.next(); - String miRBaseAccession = cell.getStringCellValue(); - - cell = cellIterator.next(); - String miRBaseID = cell.getStringCellValue(); - - cell = cellIterator.next(); - String status = cell.getStringCellValue(); - - cell = cellIterator.next(); - String sequence = cell.getStringCellValue(); + private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); - cell = cellIterator.next(); - String mature1Accession = cell.getStringCellValue(); + MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); + MirBaseParser.parse(miRBaseFile, species, callback); - cell = cellIterator.next(); - String mature1Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - String mature1Sequence = cell.getStringCellValue(); - - String mature2Accession = ""; - String mature2Id = ""; - String mature2Sequence = ""; - if (cellIterator.hasNext()) { - cell = cellIterator.next(); - mature2Accession = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Id = cell.getStringCellValue(); - - cell = cellIterator.next(); - mature2Sequence = cell.getStringCellValue(); - } - - MiRnaGene miRNAGene = new MiRnaGene(miRBaseAccession, miRBaseID, status, sequence, new ArrayList<>()); - int cdnaStart = sequence.indexOf(mature1Sequence); - int cdnaEnd = cdnaStart + mature1Sequence.length(); - miRNAGene.addMiRNAMature(mature1Accession, mature1Id, mature1Sequence, cdnaStart, cdnaEnd); - - cdnaStart = sequence.indexOf(mature2Sequence); - cdnaEnd = cdnaStart + mature2Sequence.length(); - miRNAGene.addMiRNAMature(mature2Accession, mature2Id, mature2Sequence, cdnaStart, cdnaEnd); - - rocksDbManager.update(rocksdb, miRBaseID + MIRBASE_SUFFIX, miRNAGene); - } - } else { - logger.error("mirna file not found"); - } + logger.info(PARSING_DONE_LOG_MESSAGE, miRBaseFile); } public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOException { @@ -509,117 +350,11 @@ public MiRnaGene getMirnaGene(String transcriptId) throws RocksDBException, IOEx return null; } - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String 
currentMiRNA = null; - String currentGene = null; - List targetGenes = new ArrayList<>(); - Map> geneToMirna = new HashMap<>(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - - Iterator cellIterator = currentRow.iterator(); - Cell cell = cellIterator.next(); - - // Iterate columns - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // Skip species - cellIterator.next(); - - // Read target gene - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // Skip entrez gene - cellIterator.next(); - // Skip species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList<>(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmed - cell = cellIterator.next(); - String pubmed; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { -// pubmed = String.valueOf(cell.getNumericCellValue()); - pubmed = Integer.toString(Double.valueOf(cell.getNumericCellValue()).intValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } - } - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { String key = geneName + MIRTARBASE_SUFFIX; return rocksDbManager.getMirnaTargets(rocksdb, key); } - private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - protected void indexCanonical(Path canonocalFile) throws IOException, RocksDBException { // Gene Transcript Canonical // ENSG00000210049.1 ENST00000387314.1 1 @@ -652,4 +387,30 @@ public String getCanonical(String transcriptId) throws RocksDBException, IOExcep } return new String(bytes); } + + // Implementation of the MirBaseParserCallback function + public class MirBaseCallback implements MirBaseParserCallback { + + private RocksDB rocksDB; + private RocksDbManager rocksDbManager; + private Logger logger; + + public MirBaseCallback(RocksDB rocksDB, RocksDbManager rocksDbManager) { + this.rocksDB = rocksDB; + this.rocksDbManager = rocksDbManager; + this.logger = LoggerFactory.getLogger(this.getClass()); + } + + @Override + public boolean 
processMiRnaGene(MiRnaGene miRnaGene) {
+            try {
+                rocksDbManager.update(rocksDB, miRnaGene.getId() + MIRBASE_SUFFIX, miRnaGene);
+            } catch (JsonProcessingException | RocksDBException e) {
+                logger.warn("Something went wrong while processing miRNA gene {}: {}", miRnaGene.getId(),
+                        StringUtils.join(e.getStackTrace(), "\t"));
+                return false;
+            }
+            return true;
+        }
+    }
 }
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
index cd0863a259..970f73e05a 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
@@ -16,904 +16,54 @@ package org.opencb.cellbase.lib.builders;
-import htsjdk.tribble.readers.TabixReader;
-import org.apache.commons.lang3.StringUtils;
-import org.opencb.biodata.formats.feature.gff.Gff2;
-import org.opencb.biodata.formats.feature.gtf.Gtf;
-import org.opencb.biodata.formats.feature.gtf.io.GtfReader;
-import org.opencb.biodata.formats.io.FileFormatException;
-import org.opencb.biodata.models.core.*;
-import org.opencb.biodata.tools.sequence.FastaIndex;
-import org.opencb.cellbase.core.ParamConstants;
 import org.opencb.cellbase.core.config.SpeciesConfiguration;
 import org.opencb.cellbase.core.exception.CellBaseException;
-import org.opencb.cellbase.core.serializer.CellBaseSerializer;
-import org.rocksdb.RocksDBException;
+import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
-import java.io.IOException;
-import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
-public class GeneBuilder extends CellBaseBuilder {
+import static org.opencb.cellbase.lib.EtlCommons.*;
-    private Map<String, Integer> transcriptDict;
-    private Map<String, Exon> exonDict;
+public class GeneBuilder extends CellBaseBuilder {
-    private Path gtfFile;
-    private Path proteinFastaFile;
-    private Path cDnaFastaFile;
-    private Path geneDescriptionFile;
-    private Path xrefsFile;
-    private Path hgncFile;
-    private Path maneFile;
-    private Path lrgFile;
-    private Path uniprotIdMappingFile;
-    private Path tfbsFile;
-    private Path tabixFile;
-    private Path geneExpressionFile;
-    private Path geneDrugFile;
-    private Path hpoFile;
-    private Path disgenetFile;
-    private Path genomeSequenceFilePath;
-    private Path gnomadFile;
-    private Path geneOntologyAnnotationFile;
-    private Path miRBaseFile;
-    private Path miRTarBaseFile;
-    private Path cancerGeneCensusFile;
-    private Path cancerHostpotFile;
-    private Path ensemblCanonicalFile;
-    private Path tso500File;
-    private Path eglhHaemOncFile;
-    private boolean flexibleGTFParsing;
+    private EnsemblGeneBuilder ensemblGeneBuilder;
+    private RefSeqGeneBuilder refSeqGeneBuilder;
-    // source for genes is either ensembl or refseq
-    private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key();
-    private SpeciesConfiguration speciesConfiguration;
+    public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing)
+            throws CellBaseException {
+        super(null);
-    private int geneCounter;
-    private ArrayList<String> geneList;
-    private String geneName;
-    private int transcriptCounter;
-    private ArrayList<String> transcriptList;
-    private String transcriptName;
-    private int exonCounter;
-    private String feature;
-    private Gtf nextGtfToReturn;
+        // Create Ensembl gene builder
+        CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA),
ENSEMBL_GENE_BASENAME); + this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, + ensemblGeneSerializer); - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - CellBaseSerializer serializer) throws CellBaseException { - this(geneDirectoryPath, genomeSequenceFastaFile, speciesConfiguration, false, serializer); + // Create RefSeq gene builder + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), + REFSEQ_GENE_BASENAME); + this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, refSeqGeneSerializer); } - public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, SpeciesConfiguration speciesConfiguration, - boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { - this(null, geneDirectoryPath.resolve("description.txt"), - geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), - geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), - geneDirectoryPath.resolve("idmapping_selected.tab.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), - geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz.tbi"), - geneDirectoryPath.resolve("allgenes_updown_in_organism_part.tab.gz"), - geneDirectoryPath.resolve("dgidb.tsv"), - geneDirectoryPath.resolve("phenotype_to_genes.txt"), - geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"), - geneDirectoryPath.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz"), - geneDirectoryPath.resolve("goa_human.gaf.gz"), - geneDirectoryPath.getParent().resolve("regulation/miRNA.xls"), - geneDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"), - geneDirectoryPath.resolve("cancer-gene-census.tsv"), - geneDirectoryPath.resolve("hotspots_v2.xls"), - geneDirectoryPath.resolve("ensembl_canonical.txt"), - geneDirectoryPath.resolve("TSO500_transcripts.txt"), - geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"), - genomeSequenceFastaFile, - speciesConfiguration, flexibleGTFParsing, serializer); + public void check() throws Exception { + // Check Ensembl requirements + ensemblGeneBuilder.check(); - getGtfFileFromGeneDirectoryPath(geneDirectoryPath); - getProteinFastaFileFromGeneDirectoryPath(geneDirectoryPath); - getCDnaFastaFileFromGeneDirectoryPath(geneDirectoryPath); - } - - public GeneBuilder(Path gtfFile, Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, - Path lrgFile, Path uniprotIdMappingFile, Path tfbsFile, Path tabixFile, Path geneExpressionFile, - Path geneDrugFile, Path hpoFile, Path disgenetFile, Path gnomadFile, - Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneCensusFile, - Path cancerHostpotFile, Path ensemblCanonicalFile, Path tso500File, Path eglhHaemOncFile, - Path genomeSequenceFilePath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, - CellBaseSerializer serializer) { - super(serializer); - - this.gtfFile = gtfFile; - this.geneDescriptionFile = geneDescriptionFile; - this.xrefsFile = xrefsFile; - this.hgncFile = hgncFile; - this.maneFile = maneFile; - this.lrgFile = lrgFile; - this.uniprotIdMappingFile = uniprotIdMappingFile; - this.tfbsFile = tfbsFile; - this.tabixFile = tabixFile; - 
this.geneExpressionFile = geneExpressionFile; - this.geneDrugFile = geneDrugFile; - this.hpoFile = hpoFile; - this.disgenetFile = disgenetFile; - this.gnomadFile = gnomadFile; - this.geneOntologyAnnotationFile = geneOntologyAnnotationFile; - this.miRBaseFile = miRBaseFile; - this.miRTarBaseFile = miRTarBaseFile; - this.cancerGeneCensusFile = cancerGeneCensusFile; - this.cancerHostpotFile = cancerHostpotFile; - this.ensemblCanonicalFile = ensemblCanonicalFile; - this.tso500File = tso500File; - this.eglhHaemOncFile = eglhHaemOncFile; - this.genomeSequenceFilePath = genomeSequenceFilePath; - this.speciesConfiguration = speciesConfiguration; - this.flexibleGTFParsing = flexibleGTFParsing; - - transcriptDict = new HashMap<>(250000); - exonDict = new HashMap<>(8000000); + // Check RefSeq requirements + refSeqGeneBuilder.check(); } + @Override public void parse() throws Exception { - Gene gene = null; - Transcript transcript; - Exon exon = null; - int cdna = 1; - int cds = 1; - EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(gtfFile.getParent()); - - try { - // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile, - tso500File, eglhHaemOncFile); - - TabixReader tabixReader = null; - if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { - logger.error("Tfbs or tabix file not found. Download them and try again."); - } else { - tabixReader = new TabixReader(tfbsFile.toAbsolutePath().toString(), tabixFile.toAbsolutePath().toString()); - } - - // Preparing the fasta file for fast accessing -// System.out.println("genomeSequenceFilePath.toString() = " + genomeSequenceFilePath.toString()); - FastaIndex fastaIndex = new FastaIndex(genomeSequenceFilePath); - - // Empty transcript and exon dictionaries - transcriptDict.clear(); - exonDict.clear(); - logger.info("Parsing gtf..."); - GtfReader gtfReader = new GtfReader(gtfFile); - - // Gene->Transcript->Feature->GTF line - Map>> gtfMap = null; - if (flexibleGTFParsing) { - gtfMap = loadGTFMap(gtfReader); - initializePointers(gtfMap); - } - - Gtf gtf; - while ((gtf = getGTFEntry(gtfReader, gtfMap)) != null) { - - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - String geneId = gtf.getAttributes().get("gene_id"); - String transcriptId = gtf.getAttributes().get("transcript_id"); - String geneName = gtf.getAttributes().get("gene_name"); - if (newGene(gene, geneId)) { - // If new geneId is different from the current then we must serialize before data new gene - if (gene != null) { - serializer.serialize(gene); - } - - GeneAnnotation geneAnnotation = new GeneAnnotation(indexer.getExpression(geneId), indexer.getDiseases(geneName), - indexer.getDrugs(geneName), indexer.getConstraints(geneId), indexer.getMirnaTargets(geneName), - indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); - - gene = new Gene(geneId, geneName, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), gtf.getAttributes().get("gene_version"), - gtf.getAttributes().get("gene_biotype"), "KNOWN", SOURCE, indexer.getDescription(geneId), 
- new ArrayList<>(), indexer.getMirnaGene(transcriptId), geneAnnotation); - } - - // Check if Transcript exist in the Gene Set of transcripts - if (!transcriptDict.containsKey(transcriptId)) { - transcript = getTranscript(gene, indexer, tabixReader, gtf, transcriptId); - } else { - transcript = gene.getTranscripts().get(transcriptDict.get(transcriptId)); - } - - // At this point gene and transcript objects are set up - // Update gene and transcript genomic coordinates, start must be the - // lower, and end the higher - updateTranscriptAndGeneCoords(transcript, gene, gtf); - - String transcriptIdWithoutVersion = transcript.getId().split("\\.")[0]; - if (gtf.getFeature().equalsIgnoreCase("exon")) { - // Obtaining the exon sequence - String exonId = gtf.getAttributes().get("exon_id") + "." + gtf.getAttributes().get("exon_version"); - String exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); - - exon = new Exon(exonId, gtf.getSequenceName().replaceFirst("chr", ""), - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), 0, 0, 0, 0, 0, 0, -1, Integer.parseInt(gtf - .getAttributes().get("exon_number")), exonSequence); - transcript.getExons().add(exon); - - exonDict.put(transcriptIdWithoutVersion + "_" + exon.getExonNumber(), exon); - if (gtf.getAttributes().get("exon_number").equals("1")) { - cdna = 1; - cds = 1; - } else { - // with every exon we update cDNA length with the previous exon length - cdna += exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getEnd() - - exonDict.get(transcriptIdWithoutVersion + "_" + (exon.getExonNumber() - 1)).getStart() + 1; - } - } else { - exon = exonDict.get(transcriptIdWithoutVersion + "_" + exon.getExonNumber()); - if (gtf.getFeature().equalsIgnoreCase("CDS")) { - // Protein ID is only present in CDS lines - String proteinId = gtf.getAttributes().get("protein_id") != null - ? gtf.getAttributes().get("protein_id") + "." 
+ gtf.getAttributes().get("protein_version") - : ""; - transcript.setProteinId(proteinId); - transcript.setProteinSequence(indexer.getProteinFasta(proteinId)); - - if (gtf.getStrand().equals("+") || gtf.getStrand().equals("1")) { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - - // cDNA coordinates - exon.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - transcript.setCdnaCodingStart(gtf.getStart() - exon.getStart() + cdna); - } - // strand - - } else { - // CDS states the beginning of coding start - exon.setGenomicCodingStart(gtf.getStart()); - exon.setGenomicCodingEnd(gtf.getEnd()); - // cDNA coordinates - // cdnaCodingStart points to the same base position than genomicCodingEnd - exon.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - // Set cdnaCodingEnd to prevent those cases without stop_codon - transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - exon.setCdsStart(cds); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // increment in the coding length - cds += gtf.getEnd() - gtf.getStart() + 1; - transcript.setCdsLength(cds - 1); // Set cdnaCodingEnd to prevent those cases without stop_codon - exon.setPhase(Integer.parseInt(gtf.getFrame())); - - if (transcript.getGenomicCodingStart() == 0 || transcript.getGenomicCodingStart() > gtf.getStart()) { - transcript.setGenomicCodingStart(gtf.getStart()); - } - if (transcript.getGenomicCodingEnd() == 0 || transcript.getGenomicCodingEnd() < gtf.getEnd()) { - transcript.setGenomicCodingEnd(gtf.getEnd()); - } - // only first time - if (transcript.getCdnaCodingStart() == 0) { - // cdnaCodingStart points to the same base position than genomicCodingEnd - transcript.setCdnaCodingStart(exon.getEnd() - gtf.getEnd() + cdna); - } - } - - } -// if (gtf.getFeature().equalsIgnoreCase("start_codon")) { -// // nothing to do -// System.out.println("Empty block, this should be redesigned"); -// } - if (gtf.getFeature().equalsIgnoreCase("stop_codon")) { - // setCdnaCodingEnd = false; // stop_codon found, cdnaCodingEnd will be set here, - // no need to set it at the beginning of next feature - if (exon.getStrand().equals("+")) { - updateStopCodingDataPositiveExon(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingEnd(gtf.getEnd()); - transcript.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - 
transcript.setCdsLength(cds - 1); - - } else { - updateNegativeExonCodingData(exon, cdna, cds, gtf); - - cds += gtf.getEnd() - gtf.getStart(); - // If stop_codon appears, overwrite values - transcript.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - transcript.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - transcript.setCdsLength(cds - 1); - } - } - } - } - - // last gene must be serialized - serializer.serialize(gene); - - // cleaning - gtfReader.close(); - serializer.close(); - fastaIndex.close(); - indexer.close(); - } catch (Exception e) { - indexer.close(); - throw e; - } - } - - private Transcript getTranscript(Gene gene, EnsemblGeneBuilderIndexer indexer, TabixReader tabixReader, Gtf gtf, String transcriptId) - throws IOException, RocksDBException { - Map gtfAttributes = gtf.getAttributes(); - - // To match Ensembl, we set the ID as transcript+version. This also matches the Ensembl website. - String transcriptIdWithVersion = transcriptId + "." + gtfAttributes.get("transcript_version"); - String biotype = gtfAttributes.get("transcript_biotype") != null ? gtfAttributes.get("transcript_biotype") : ""; - String transcriptChromosome = gtf.getSequenceName().replaceFirst("chr", ""); - List transcriptTfbses = getTranscriptTfbses(gtf, transcriptChromosome, tabixReader); - - List ontologyAnnotations = getOntologyAnnotations(indexer.getXrefs(transcriptId), indexer); - TranscriptAnnotation transcriptAnnotation = new TranscriptAnnotation(ontologyAnnotations, indexer.getConstraints(transcriptId)); - - Transcript transcript = new Transcript(transcriptIdWithVersion, gtfAttributes.get("transcript_name"), transcriptChromosome, - gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, "KNOWN", - 0, 0, 0, 0, 0, - indexer.getCdnaFasta(transcriptIdWithVersion), "", "", "", - gtfAttributes.get("transcript_version"), SOURCE, new ArrayList<>(), indexer.getXrefs(transcriptId), transcriptTfbses, - new HashSet<>(), transcriptAnnotation); - - // Adding Ids appearing in the GTF to the xrefs is required, since for some unknown reason the ENSEMBL - // Perl API often doesn't return all genes resulting in an incomplete xrefs.txt file. We must ensure - // that the xrefs array contains all ids present in the GTF file - addGtfXrefs(transcript, gene, gtfAttributes); - - // Add HGNC ID mappings, with this we can know which Ensembl and Refseq transcripts match to HGNC ID - String hgncId = indexer.getHgncId(gene.getName()); - if (StringUtils.isNotEmpty(hgncId)) { - transcript.getXrefs().add(new Xref(hgncId, "hgnc_id", "HGNC ID")); - } - - // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("refseq", "refseq_protein")) { - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, suffix); - if (StringUtils.isNotEmpty(maneRefSeq)) { - transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, - "MANE Select RefSeq" + (suffix.contains("_") ? " Protein" : ""))); - } - } - - // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(lrgRefSeq)) { - transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_refseq", "LRG RefSeq")); - } - - // Add Flags - // 1. 
GTF tags - String tags = gtf.getAttributes().get("tag"); - if (StringUtils.isNotEmpty(tags)) { - transcript.getFlags().addAll(Arrays.asList(tags.split(","))); - } - // 2. TSL - String supportLevel = gtfAttributes.get("transcript_support_level"); - if (StringUtils.isNotEmpty(supportLevel)) { - // split on space so "5 (assigned to previous version 3)" and "5" both become "TSL:5" - String truncatedSupportLevel = supportLevel.split(" ")[0]; - transcript.getFlags().add("TSL:" + truncatedSupportLevel); - } - // 3. MANE Flag - String maneFlag = indexer.getMane(transcriptIdWithVersion, "flag"); - if (StringUtils.isNotEmpty(maneFlag)) { - transcript.getFlags().add(maneFlag); - } - // 4. LRG Flag - String lrg = indexer.getLrg(transcriptIdWithVersion, "ensembl"); - if (StringUtils.isNotEmpty(lrg)) { - transcript.getFlags().add("LRG"); - } else { - for (Xref xref : transcript.getXrefs()) { - if (xref.getId().startsWith("LRG_") && xref.getId().contains("t")) { - transcript.getFlags().add("LRG"); - } - } - } - // 5. Ensembl Canonical - String canonicalFlag = indexer.getCanonical(transcriptIdWithVersion); - if (StringUtils.isNotEmpty(canonicalFlag)) { - transcript.getFlags().add(canonicalFlag); - } - - // 6. TSO500 and EGLH HaemOnc - String maneRefSeq = indexer.getMane(transcriptIdWithVersion, "refseq"); - if (StringUtils.isNotEmpty(maneRefSeq)) { - String tso500Flag = indexer.getTSO500(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(tso500Flag)) { - transcript.getFlags().add(tso500Flag); - } - - String eglhHaemOncFlag = indexer.getEGLHHaemOnc(maneRefSeq.split("\\.")[0]); - if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { - transcript.getFlags().add(eglhHaemOncFlag); - } - } - - gene.getTranscripts().add(transcript); - - // Do not change order!! size()-1 is the index of the transcript ID - transcriptDict.put(transcriptId, gene.getTranscripts().size() - 1); - return transcript; - } - - private List getOntologyAnnotations(List xrefs, EnsemblGeneBuilderIndexer indexer) - throws IOException, RocksDBException { - if (xrefs == null || indexer == null) { - return null; - } - List annotations = new ArrayList<>(); - for (Xref xref : xrefs) { - if (xref.getDbName().equals("uniprotkb_acc")) { - String key = xref.getId(); - if (key != null && indexer.getOntologyAnnotations(key) != null) { - annotations.addAll(indexer.getOntologyAnnotations(key)); - } - } - } - return annotations; - } - - private void updateNegativeExonCodingData(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. - exon.setGenomicCodingStart(gtf.getStart()); - // cdnaCodingEnd points to the same base position than genomicCodingStart - exon.setCdnaCodingEnd(exon.getEnd() - gtf.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); - - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingEnd() == 0) { - exon.setGenomicCodingEnd(exon.getGenomicCodingStart() + 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void updateStopCodingDataPositiveExon(Exon exon, int cdna, int cds, Gtf gtf) { - // we need to increment 3 nts, the stop_codon length. 
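These stop-codon coordinate updates are easiest to sanity-check with concrete numbers. A rough worked example with assumed values (an exon starting at genomic position 100, running cDNA offset cdna=1, running CDS offset cds=201, a 3-nt stop codon at 301..303; the figures are illustrative only, not taken from any real transcript):

public class StopCodonMath {
    public static void main(String[] args) {
        int exonStart = 100;   // genomic start of the current exon
        int cdna = 1;          // cDNA offset at this point of the parse
        int cds = 201;         // CDS offset at this point of the parse
        int stopStart = 301;   // stop_codon GTF start
        int stopEnd = 303;     // stop_codon GTF end (3 nt total)

        // The same formulas as the assignments in updateStopCodingDataPositiveExon():
        int genomicCodingEnd = stopEnd;                  // 303
        int cdnaCodingEnd = stopEnd - exonStart + cdna;  // 303 - 100 + 1 = 204
        int cdsEnd = stopEnd - stopStart + cds;          // 303 - 301 + 201 = 203

        System.out.printf("genomicCodingEnd=%d, cdnaCodingEnd=%d, cdsEnd=%d%n",
                genomicCodingEnd, cdnaCodingEnd, cdsEnd);
    }
}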
- exon.setGenomicCodingEnd(gtf.getEnd()); - exon.setCdnaCodingEnd(gtf.getEnd() - exon.getStart() + cdna); - exon.setCdsEnd(gtf.getEnd() - gtf.getStart() + cds); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); - // If the STOP codon corresponds to the first three nts of the exon then no CDS will be defined - // in the gtf -as technically the STOP codon is non-coding- and we must manually set coding - // starts - if (exon.getGenomicCodingStart() == 0) { - exon.setGenomicCodingStart(exon.getGenomicCodingEnd() - 2); - } - if (exon.getCdnaCodingStart() == 0) { - exon.setCdnaCodingStart(exon.getCdnaCodingEnd() - 2); - } - if (exon.getCdsStart() == 0) { - exon.setCdsStart(exon.getCdsEnd() - 2); - } - } - - private void addGtfXrefs(Transcript transcript, Gene gene, Map gtfAttributes) { - if (transcript.getXrefs() == null) { - transcript.setXrefs(new ArrayList<>()); - } - - transcript.getXrefs().add(new Xref(gene.getId(), "ensembl_gene", "Ensembl Gene")); - transcript.getXrefs().add(new Xref(transcript.getId(), "ensembl_transcript", "Ensembl Transcript")); - - // Some non-coding genes do not have Gene names - if (StringUtils.isNotEmpty(gene.getName())) { - transcript.getXrefs().add(new Xref(gene.getName(), "hgnc_symbol", "HGNC Symbol")); - transcript.getXrefs().add(new Xref(transcript.getName(), "ensembl_transcript_name", "Ensembl Transcript Name")); - } - - if (gtfAttributes.get("ccds_id") != null) { - transcript.getXrefs().add(new Xref(gtfAttributes.get("ccds_id"), "ccds_id", "CCDS")); - } - } - - private void initializePointers(Map>> gtfMap) { - geneCounter = 0; - geneList = new ArrayList<>(gtfMap.keySet()); - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - transcriptName = transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - - private Gtf getGTFEntry(GtfReader gtfReader, Map>> gtfMap) throws FileFormatException { - // Flexible parsing is deactivated, return next line - if (gtfMap == null) { - return gtfReader.read(); - // Flexible parsing activated, carefully select next line to return - } else { - // No more genes/features to return - if (nextGtfToReturn == null) { - return null; - } - Gtf gtfToReturn = nextGtfToReturn; - if (feature.equals("exon")) { -// gtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - if (gtfMap.get(geneName).get(transcriptName).containsKey("cds")) { - nextGtfToReturn = getExonCDSLine(((Gtf) ((List) gtfMap.get(geneName) - .get(transcriptName).get("exon")).get(exonCounter)).getStart(), - ((Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter)).getEnd(), - (List) gtfMap.get(geneName).get(transcriptName).get("cds")); - if (nextGtfToReturn != null) { - feature = "cds"; - return gtfToReturn; - } - } - // if no cds was found for this exon, get next exon - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if (feature.equals("cds") || feature.equals("stop_codon")) { - getFeatureFollowsExon(gtfMap); - return gtfToReturn; - } - if (feature.equals("start_codon")) { - feature = "stop_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("stop_codon"); - return gtfToReturn; - } - // The only accepted features that should appear in the gtfMap are exon, cds, start_codon and stop_codon - throw new FileFormatException("Execution cannot reach this 
point"); - } - } - - private Gtf getExonCDSLine(Integer exonStart, Integer exonEnd, List cdsList) { - for (Object cdsObject : cdsList) { - int cdsStart = ((Gtf) cdsObject).getStart(); - int cdsEnd = ((Gtf) cdsObject).getEnd(); - if (cdsStart <= exonEnd && cdsEnd >= exonStart) { - return (Gtf) cdsObject; - } - } - return null; - } - - private void getFeatureFollowsExon(Map>> gtfMap) { - exonCounter++; - if (exonCounter == ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).size() - || feature.equals("stop_codon")) { - // If last returned feature was a stop_codon or no start_codon is provided for this transcript, - // next transcript must be selected - if (!feature.equals("stop_codon") && gtfMap.get(geneName).get(transcriptName).containsKey("start_codon")) { - feature = "start_codon"; - nextGtfToReturn = (Gtf) gtfMap.get(geneName).get(transcriptName).get("start_codon"); - } else { - transcriptCounter++; - // No more transcripts in this gene, check if there are more genes - if (transcriptCounter == gtfMap.get(geneName).size()) { - geneCounter++; - // No more genes available, end parsing - if (geneCounter == gtfMap.size()) { - nextGtfToReturn = null; - feature = null; - // Still more genes to parse, select next one - } else { - geneName = geneList.get(geneCounter); - transcriptCounter = 0; - transcriptList = new ArrayList<>(gtfMap.get(geneName).keySet()); - } - } - // Check if a new gene was selected - null would indicate there're no more genes - if (nextGtfToReturn != null) { - transcriptName = transcriptList.get(transcriptCounter); - exonCounter = 0; - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - } else { - feature = "exon"; - nextGtfToReturn = (Gtf) ((List) gtfMap.get(geneName).get(transcriptName).get("exon")).get(exonCounter); - } - } - - private Map>> loadGTFMap(GtfReader gtfReader) throws FileFormatException { - Map>> gtfMap = new HashMap<>(); - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - if (gtf.getFeature().equals("gene") || gtf.getFeature().equals("transcript") - || gtf.getFeature().equals("UTR") || gtf.getFeature().equals("Selenocysteine")) { - continue; - } - - // Get GTF lines associated with this gene - create a new Map of GTF entries if it's a new gene - String geneId = gtf.getAttributes().get("gene_id"); - // Transcript -> feature -> GTF line - Map> gtfMapGeneEntry; - if (gtfMap.containsKey(geneId)) { - gtfMapGeneEntry = gtfMap.get(geneId); - } else { - gtfMapGeneEntry = new HashMap(); - gtfMap.put(geneId, gtfMapGeneEntry); - } - - // Get GTF lines associated with this transcript - create a new Map of GTF entries if it's a new gene - String transcriptId = gtf.getAttributes().get("transcript_id"); - Map gtfMapTranscriptEntry; - if (gtfMapGeneEntry.containsKey(transcriptId)) { - gtfMapTranscriptEntry = gtfMapGeneEntry.get(transcriptId); - } else { - gtfMapTranscriptEntry = new HashMap(); - gtfMapGeneEntry.put(transcriptId, gtfMapTranscriptEntry); - } - - addGTFLineToGTFMap(gtfMapTranscriptEntry, gtf); - - } - - // Exon number is mandatory for the parser to be able to properly generate the gene data model - if (!exonNumberPresent(gtfMap)) { - setExonNumber(gtfMap); - } - - return gtfMap; - } - - private boolean exonNumberPresent(Map>> gtfMap) { - Map> geneGtfMap = gtfMap.get(gtfMap.keySet().iterator().next()); - return ((Gtf) ((List) geneGtfMap.get(geneGtfMap.keySet().iterator().next()).get("exon")).get(0)) - .getAttributes().containsKey("exon_number"); - } - - private 
void setExonNumber(Map>> gtfMap) { - for (String gene : gtfMap.keySet()) { - for (String transcript : gtfMap.get(gene).keySet()) { - List exonList = (List) gtfMap.get(gene).get(transcript).get("exon"); - Collections.sort(exonList, (e1, e2) -> Integer.valueOf(e1.getStart()).compareTo(e2.getStart())); - if (exonList.get(0).getStrand().equals("+")) { - int exonNumber = 1; - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber++; - } - } else { - int exonNumber = exonList.size(); - for (Gtf gtf : exonList) { - gtf.getAttributes().put("exon_number", String.valueOf(exonNumber)); - exonNumber--; - } - } - } - } - } - - private void addGTFLineToGTFMap(Map gtfMapTranscriptEntry, Gtf gtf) { - // Add exon/cds GTF line to the corresponding gene entry in the map - String featureType = gtf.getFeature().toLowerCase(); - if (featureType.equals("exon") || featureType.equals("cds")) { - List gtfList; - // Check if there were exons already stored - if (gtfMapTranscriptEntry.containsKey(featureType)) { - gtfList = (List) gtfMapTranscriptEntry.get(featureType); - } else { - gtfList = new ArrayList<>(); - gtfMapTranscriptEntry.put(featureType, gtfList); - } - gtfList.add(gtf); - // Only one start/stop codon can be stored per transcript - no need to check if the "start_codon"/"stop_codon" - // keys are already there - } else if (featureType.equals("start_codon") || featureType.equals("stop_codon")) { - gtfMapTranscriptEntry.put(featureType, gtf); - } - } + // Check folders and files before building + check(); - private List getTranscriptTfbses(Gtf transcript, String chromosome, TabixReader tabixReader) throws IOException { - if (tabixReader == null) { - return null; - } - List transcriptTfbses = null; - - int transcriptStart = transcript.getStart(); - int transcriptEnd = transcript.getEnd(); - - - String line; - TabixReader.Iterator iter = tabixReader.query(chromosome, transcriptStart, transcriptEnd); - while ((line = iter.next()) != null) { - String[] elements = line.split("\t"); - - String sequenceName = elements[0]; - String source = elements[1]; - String feature = elements[2]; - int start = Integer.parseInt(elements[3]); - int end = Integer.parseInt(elements[4]); - String score = elements[5]; - String strand = elements[6]; - String frame = elements[7]; - String attribute = elements[8]; - - if (strand.equals(transcript.getStrand())) { - continue; - } - - if (transcript.getStrand().equals("+")) { - if (start > transcript.getStart() + 500) { - break; - } else if (end > transcript.getStart() - 2500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } else { - // transcript in negative strand - if (start > transcript.getEnd() + 2500) { - break; - } else if (start > transcript.getEnd() - 500) { - Gff2 tfbs = new Gff2(sequenceName, source, feature, start, end, score, strand, frame, attribute); - transcriptTfbses = addTranscriptTfbstoList(tfbs, transcript, chromosome, transcriptTfbses); - } - } - } - - return transcriptTfbses; - } - - protected List addTranscriptTfbstoList(Gff2 tfbs, Gtf transcript, String chromosome, - List transcriptTfbses) { - if (transcriptTfbses == null) { - transcriptTfbses = new ArrayList<>(); - } - - // binding_matrix_stable_id=ENSPFM0542;epigenomes_with_experimental_evidence=SK-N.%2CMCF-7%2CH1-hESC_3%2CHCT116; - // stable_id=ENSM00208374688;transcription_factor_complex=TEAD4::ESRRB - 
String[] attributes = tfbs.getAttribute().split(";"); - - String id = null; - String pfmId = null; - List transciptionFactors = null; - - for (String attributePair : attributes) { - String[] attributePairArray = attributePair.split("="); - switch(attributePairArray[0]) { - case "binding_matrix_stable_id": - pfmId = attributePairArray[1]; - break; - case "stable_id": - id = attributePairArray[1]; - break; - case "transcription_factor_complex": - transciptionFactors = Arrays.asList(attributePairArray[1].split("(::)|(%2C)")); - break; - default: - break; - } - } - - transcriptTfbses.add(new TranscriptTfbs(id, pfmId, tfbs.getFeature(), transciptionFactors, chromosome, tfbs.getStart(), - tfbs.getEnd(), getRelativeTranscriptTfbsStart(tfbs, transcript), getRelativeTranscriptTfbsEnd(tfbs, transcript), - Float.parseFloat(tfbs.getScore()))); - return transcriptTfbses; - } - - private Integer getRelativeTranscriptTfbsStart(Gff2 tfbs, Gtf transcript) { - Integer relativeStart; - if (transcript.getStrand().equals("+")) { - if (tfbs.getStart() < transcript.getStart()) { - relativeStart = tfbs.getStart() - transcript.getStart(); - } else { - relativeStart = tfbs.getStart() - transcript.getStart() + 1; - } - } else { - // negative strand transcript - if (tfbs.getEnd() > transcript.getEnd()) { - relativeStart = transcript.getEnd() - tfbs.getEnd(); - } else { - relativeStart = transcript.getEnd() - tfbs.getEnd() + 1; - } - } - return relativeStart; - } - - private Integer getRelativeTranscriptTfbsEnd(Gff2 tfbs, Gtf transcript) { - Integer relativeEnd; - if (transcript.getStrand().equals("+")) { - if (tfbs.getEnd() < transcript.getStart()) { - relativeEnd = tfbs.getEnd() - transcript.getStart(); - } else { - relativeEnd = tfbs.getEnd() - transcript.getStart() + 1; - } - } else { - if (tfbs.getStart() > transcript.getEnd()) { - relativeEnd = transcript.getEnd() - tfbs.getStart(); - } else { - relativeEnd = transcript.getEnd() - tfbs.getStart() + 1; - } - } - return relativeEnd; - } - - - - private boolean newGene(Gene previousGene, String newGeneId) { - return previousGene == null || !newGeneId.equals(previousGene.getId()); - } - - private void updateTranscriptAndGeneCoords(Transcript transcript, Gene gene, Gtf gtf) { - if (transcript.getStart() > gtf.getStart()) { - transcript.setStart(gtf.getStart()); - } - if (transcript.getEnd() < gtf.getEnd()) { - transcript.setEnd(gtf.getEnd()); - } - if (gene.getStart() > gtf.getStart()) { - gene.setStart(gtf.getStart()); - } - if (gene.getEnd() < gtf.getEnd()) { - gene.setEnd(gtf.getEnd()); - } - } - - private void getGtfFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } - - private void getProteinFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".pep.all.fa") || fileName.endsWith(".pep.all.fa.gz")) { - proteinFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } - } + // Build Ensembl/RefSeq genes + ensemblGeneBuilder.parse(); + refSeqGeneBuilder.parse(); - private void getCDnaFastaFileFromGeneDirectoryPath(Path geneDirectoryPath) { - for (String fileName : geneDirectoryPath.toFile().list()) { - if (fileName.endsWith(".cdna.all.fa") || fileName.endsWith(".cdna.all.fa.gz")) { - cDnaFastaFile = geneDirectoryPath.resolve(fileName); - break; - } - } + 
logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 285236ba60..b8941cc448 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -24,9 +24,10 @@ import org.opencb.biodata.formats.sequence.fasta.Fasta; import org.opencb.biodata.formats.sequence.fasta.io.FastaReader; import org.opencb.biodata.models.clinical.ClinicalProperty; -import org.opencb.biodata.models.core.CancerHotspot; -import org.opencb.biodata.models.core.CancerHotspotVariant; -import org.opencb.biodata.models.core.GeneCancerAssociation; +import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; +import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; @@ -42,8 +43,14 @@ import java.util.*; import java.util.stream.Collectors; +import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE; + public class GeneBuilderIndexer { + public static final String ROCKSDB_FOLDER = "rocksdb.idx"; + protected RocksDB rocksdb; protected RocksDbManager rocksDbManager; protected Logger logger; @@ -69,7 +76,7 @@ public GeneBuilderIndexer(Path genePath) { private void init(Path genePath) { rocksDbManager = new RocksDbManager(); - dbLocation = genePath.resolve("integration.idx").toString(); + dbLocation = genePath.resolve(ROCKSDB_FOLDER).toString(); rocksdb = rocksDbManager.getDBConnection(dbLocation); dbOption = new Options().setCreateIfMissing(true); @@ -77,18 +84,14 @@ private void init(Path genePath) { } protected void indexCdnaSequences(Path cDnaFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading RefSeq's cDNA sequences..."); - FileUtils.checkPath(cDnaFastaFile); - if (Files.size(cDnaFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(cDnaFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("RefSeq's cDNA sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, cDnaFastaFile); + FastaReader fastaReader = new FastaReader(cDnaFastaFile); + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + CDNA_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, cDnaFastaFile); } public String getCdnaFasta(String id) throws RocksDBException { @@ -96,18 +99,14 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { - logger.info("Loading ENSEMBL's protein sequences..."); - FileUtils.checkPath(proteinFastaFile); - if (Files.size(proteinFastaFile) > 0) { - FastaReader fastaReader = new FastaReader(proteinFastaFile); - Fasta fasta; - while ((fasta = fastaReader.read()) != null) { - 
rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); - } - fastaReader.close(); - } else { - logger.warn("ENSEMBL's protein sequences not loaded"); + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); + FastaReader fastaReader = new FastaReader(proteinFastaFile); + Fasta fasta; + while ((fasta = fastaReader.read()) != null) { + rocksDbManager.update(rocksdb, fasta.getId() + PROTEIN_SEQUENCE_SUFFIX, fasta.getSeq()); } + fastaReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, proteinFastaFile); } protected String getProteinFasta(String id) throws RocksDBException { @@ -115,22 +114,18 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { - // #hgnc_id symbol name locus_group locus_type status location location_sortable ... - logger.info("Indexing HGNC ID mapping data ..."); - - // We only need the first two columns: hgnc_id -> symbol - if (hgncMappingFile != null && Files.exists(hgncMappingFile) && Files.size(hgncMappingFile) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); - line = bufferedReader.readLine(); - } + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { + String line = bufferedReader.readLine(); + // We only need the first two columns: hgnc_id -> symbol + // #hgnc_id symbol name locus_group locus_type status location location_sortable ... + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[1] + HGNC_ID_SUFFIX, fields[0]); + line = bufferedReader.readLine(); } - } else { - logger.warn("HGNC ID mapping file " + hgncMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, hgncMappingFile); } public String getHgncId(String id) throws RocksDBException { @@ -138,29 +133,25 @@ public String getHgncId(String id) throws RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); + int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 7 : 5; + // #NCBI_GeneID Ensembl_Gene HGNC_ID symbol name RefSeq_nuc RefSeq_prot Ensembl_nuc Ensembl_prot // MANE_status GRCh38_chr chr_start chr_end chr_strand - logger.info("Indexing MANE mapping data ..."); - - if (maneMappingFile != null && Files.exists(maneMappingFile) && Files.size(maneMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
7 : 5; -// BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - String[] fields = line.split("\t", -1); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); - rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(maneMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + String[] fields = line.split("\t", -1); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq", fields[5]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_refseq_protein", fields[6]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl", fields[7]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_ensembl_protein", fields[8]); + rocksDbManager.update(rocksdb, fields[idColumn] + MANE_SUFFIX + "_flag", fields[9]); - line = bufferedReader.readLine(); - } + line = bufferedReader.readLine(); } - } else { - logger.warn("MANE mapping file " + maneMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, maneMappingFile); } public String getMane(String id, String field) throws RocksDBException { @@ -168,30 +159,27 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); + // # Last modified: 30-03-2021@22:00:06 // # LRG HGNC_SYMBOL REFSEQ_GENOMIC LRG_TRANSCRIPT REFSEQ_TRANSCRIPT ENSEMBL_TRANSCRIPT CCDS // LRG_1 COL1A1 NG_007400.1 t1 NM_000088.3 ENST00000225964.10 CCDS11561.1 - logger.info("Indexing LRG mapping data ..."); - - if (lrgMappingFile != null && Files.exists(lrgMappingFile) && Files.size(lrgMappingFile) > 0) { - int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 5 : 4; - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - String id = fields[idColumn]; - if (StringUtils.isNotEmpty(id) && !id.equals("-")) { - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); - rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); - } + int idColumn = referenceId.equalsIgnoreCase("ensembl") ? 
5 : 4; + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(lrgMappingFile)) { + String line = bufferedReader.readLine(); + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + String id = fields[idColumn]; + if (StringUtils.isNotEmpty(id) && !id.equals("-")) { + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_refseq", fields[4]); + rocksDbManager.update(rocksdb, id + LRG_SUFFIX + "_ensembl", fields[5]); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("LRG mapping file " + lrgMappingFile + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, lrgMappingFile); } public String getLrg(String id, String field) throws RocksDBException { @@ -199,6 +187,8 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cgcFile); + Map tissuesMap = new HashMap<>(); tissuesMap.put("E", "epithelial"); tissuesMap.put("L", "leukaemia/lymphoma"); @@ -224,10 +214,8 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx mutationTypesMap.put("Mis", "missense"); mutationTypesMap.put("PromoterMis", "missense"); - logger.info("Indexing CANCER GENE CENSUS data ..."); - if (cgcFile != null && Files.exists(cgcFile) && Files.size(cgcFile) > 0) { + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile)) { // Skip the first header line - BufferedReader bufferedReader = FileUtils.newBufferedReader(cgcFile); bufferedReader.readLine(); GeneCancerAssociation cancerGeneAssociation; @@ -237,9 +225,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx // Find Ensembl Gene Id in the last comma-separated column List synonyms = StringUtils.isNotEmpty(fields[19]) ? Arrays.stream(fields[19] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -264,44 +252,44 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx : Collections.emptyList(); List tissues = StringUtils.isNotEmpty(fields[12]) ? Arrays.stream(fields[12] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(tissuesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List modeOfInheritance = StringUtils.isNotEmpty(fields[13]) ? fields[13].equalsIgnoreCase("Dom/Rec") - ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) - : Collections.singletonList(moiMap.get(fields[13])) + ? Arrays.asList(moiMap.get("Dom"), moiMap.get("Rec")) + : Collections.singletonList(moiMap.get(fields[13])) : Collections.emptyList(); List roleInCancer = StringUtils.isNotEmpty(fields[14]) ? Arrays.stream(fields[14] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(roleInCancerMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List mutationTypes = StringUtils.isNotEmpty(fields[15]) ? Arrays.stream(fields[15] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .map(mutationTypesMap::get) .collect(Collectors.toList()) : Collections.emptyList(); List translocationPartners = StringUtils.isNotEmpty(fields[16]) ? 
Arrays.stream(fields[16] - .replaceAll("\"", "") - .replaceAll(" ", "") - .split(",")) + .replaceAll("\"", "") + .replaceAll(" ", "") + .split(",")) .collect(Collectors.toList()) : Collections.emptyList(); List otherSyndromes = StringUtils.isNotEmpty(fields[18]) ? Arrays.stream(fields[18] - .replaceAll("\"", "") - .split("; ")) + .replaceAll("\"", "") + .split("; ")) .collect(Collectors.toList()) : Collections.emptyList(); @@ -312,10 +300,9 @@ protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBEx rocksDbManager.update(rocksdb, fields[0] + CANCER_GENE_CENSUS_SUFFIX, cancerGeneAssociation); } } - bufferedReader.close(); - } else { - logger.warn("CANCER GENE CENSUS file " + cgcFile + " not found"); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cgcFile); } public List getCancerGeneCensus(String geneName) throws RocksDBException, IOException { @@ -324,97 +311,102 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); + // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key Map> visited = new HashMap<>(); - FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile()); - HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); - HSSFSheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - iterator.next(); - while (iterator.hasNext()) { - Row currentRow = iterator.next(); - String geneName = currentRow.getCell(0).toString(); - - if (currentRow.getCell(1).toString().contains("splice")) { - continue; - } - int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - - CancerHotspot ch = null; - // Check if ch object already exist - if (visited.containsKey(geneName)) { - for (CancerHotspot hotspot : visited.get(geneName)) { - if (hotspot.getAminoacidPosition() == aminoAcidPosition) { - ch = hotspot; - break; - } - } - } - // If not exist we create new ch - if (ch == null) { - ch = new CancerHotspot(); - ch.setScores(new HashMap<>()); - ch.setCancerTypeCount(new HashMap<>()); - ch.setOrganCount(new HashMap<>()); - ch.setVariants(new ArrayList<>()); - - // Parse new row - ch.setGeneName(geneName); - ch.setAminoacidPosition(aminoAcidPosition); - ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); - ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); - - String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); - for (String cancerCount : cancerCountSplit) { - String[] split = cancerCount.split(":"); - ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + try (FileInputStream fileInputStream = new FileInputStream(cancerHotspot.toFile())) { + HSSFWorkbook workbook = new HSSFWorkbook(fileInputStream); + HSSFSheet sheet = workbook.getSheetAt(0); + Iterator iterator = sheet.iterator(); + iterator.next(); + while (iterator.hasNext()) { + Row currentRow = iterator.next(); + String geneName = currentRow.getCell(0).toString(); + + if (currentRow.getCell(1).toString().contains("splice")) { + continue; } + int aminoAcidPosition = Integer.parseInt(currentRow.getCell(1).toString()); - String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); - for (String organCount : organCountSplit) { - String[] split = organCount.split(":"); - ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + CancerHotspot ch = null; + // Check if ch object 
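
All of the indexers in this class follow one RocksDB convention: the key is an entity ID concatenated with a per-dataset suffix, and the value is the mapped field, so every lookup is a single point read. Below is a minimal, self-contained sketch of that pattern; the suffix constant, database path, and example IDs are illustrative (not the indexer's actual constants), and a recent rocksdbjni on the classpath is assumed.

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class SuffixKeyIndexSketch {
    // Illustrative suffix; the real indexer defines its own *_SUFFIX constants
    private static final String HGNC_ID_SUFFIX = "_hgncid";

    public static void main(String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options, "/tmp/suffix-index-sketch");

        // Write: one key per (id, dataset); the value is the mapped field
        db.put(("BRCA2" + HGNC_ID_SUFFIX).getBytes(), "HGNC:1101".getBytes());

        // Read: recompose the key; a null result means the id was never indexed
        byte[] value = db.get(("BRCA2" + HGNC_ID_SUFFIX).getBytes());
        System.out.println(value == null ? "not indexed" : new String(value));

        db.close();
    }
}
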
already exist + if (visited.containsKey(geneName)) { + for (CancerHotspot hotspot : visited.get(geneName)) { + if (hotspot.getAminoacidPosition() == aminoAcidPosition) { + ch = hotspot; + break; + } + } } - ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); - ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); - ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); - ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); - ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); - ch.setAminoacidReference(currentRow.getCell(35).toString()); - ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); - ch.setCancerType(currentRow.getCell(37).toString()); + // If not exist we create new ch + if (ch == null) { + ch = new CancerHotspot(); + ch.setScores(new HashMap<>()); + ch.setCancerTypeCount(new HashMap<>()); + ch.setOrganCount(new HashMap<>()); + ch.setVariants(new ArrayList<>()); + + // Parse new row + ch.setGeneName(geneName); + ch.setAminoacidPosition(aminoAcidPosition); + ch.getScores().put("log10Pvalue", Double.parseDouble(currentRow.getCell(2).toString())); + ch.setNumMutations(Integer.parseInt(currentRow.getCell(3).toString())); + + String[] cancerCountSplit = currentRow.getCell(11).toString().split("\\|"); + for (String cancerCount : cancerCountSplit) { + String[] split = cancerCount.split(":"); + ch.getCancerTypeCount().put(split[0], Integer.parseInt(split[2])); + } - if (visited.containsKey(geneName)) { - // Gene exists but no this aminoacid position - visited.get(geneName).add(ch); - } else { - // New gene found - visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + String[] organCountSplit = currentRow.getCell(12).toString().split("\\|"); + for (String organCount : organCountSplit) { + String[] split = organCount.split(":"); + ch.getOrganCount().put(split[0], Integer.parseInt(split[2])); + } + + ch.getScores().put("mutability", Double.parseDouble(currentRow.getCell(14).toString())); + ch.getScores().put("muProtein", Double.parseDouble(currentRow.getCell(15).toString())); + ch.setAnalysis(Arrays.asList(currentRow.getCell(17).toString().split(","))); + ch.getScores().put("qvalue", Double.parseDouble(currentRow.getCell(18).toString())); + ch.getScores().put("qvaluePancan", Double.parseDouble(currentRow.getCell(20).toString())); + ch.setAminoacidReference(currentRow.getCell(35).toString()); + ch.getScores().put("qvalueCancerType", Double.parseDouble(currentRow.getCell(36).toString())); + ch.setCancerType(currentRow.getCell(37).toString()); + + if (visited.containsKey(geneName)) { + // Gene exists but no this aminoacid position + visited.get(geneName).add(ch); + } else { + // New gene found + visited.put(geneName, new ArrayList<>(Collections.singletonList(ch))); + } } - } - // Add cancer hotspot variant information - CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); - cancerHotspotVariant.setSampleCount(new HashMap<>()); + // Add cancer hotspot variant information + CancerHotspotVariant cancerHotspotVariant = new CancerHotspotVariant(); + cancerHotspotVariant.setSampleCount(new HashMap<>()); - String[] alternateCountSplit = currentRow.getCell(8).toString().split(":"); - cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); - cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); + String[] alternateCountSplit = 
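
The hotspot spreadsheet packs several counters into single cells, pipe-separated between entries and colon-separated within each entry, which is why the code here splits on "\\|" and then on ":". A small sketch of that decoding under assumed cell layouts; the example cell values are invented, and countIndex picks which colon-separated token holds the number (2 for the cancer-type and organ cells, 1 for the sample-count cells).

import java.util.HashMap;
import java.util.Map;

public class PackedCountCellSketch {
    // Decodes e.g. "skcm:tcga:37|lusc:tcga:5" into {skcm=37, lusc=5}
    static Map<String, Integer> parseCounts(String cell, int countIndex) {
        Map<String, Integer> counts = new HashMap<>();
        for (String entry : cell.split("\\|")) {
            String[] tokens = entry.split(":");
            counts.put(tokens[0], Integer.parseInt(tokens[countIndex]));
        }
        return counts;
    }

    public static void main(String[] args) {
        System.out.println(parseCounts("skcm:tcga:37|lusc:tcga:5", 2));
        System.out.println(parseCounts("V600E:12|V600K:3", 1));
    }
}
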
currentRow.getCell(8).toString().split(":"); + cancerHotspotVariant.setAminoacidAlternate(alternateCountSplit[0]); + cancerHotspotVariant.setCount(Integer.parseInt(alternateCountSplit[1])); - String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); - for (String sampleCount : sampleSplit) { - String[] sampleCountSplit = sampleCount.split(":"); - cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + String[] sampleSplit = currentRow.getCell(38).toString().split("\\|"); + for (String sampleCount : sampleSplit) { + String[] sampleCountSplit = sampleCount.split(":"); + cancerHotspotVariant.getSampleCount().put(sampleCountSplit[0], Integer.parseInt(sampleCountSplit[1])); + } + ch.getVariants().add(cancerHotspotVariant); } - ch.getVariants().add(cancerHotspotVariant); } - fileInputStream.close(); for (String geneName : visited.keySet()) { rocksDbManager.update(rocksdb, geneName + CANCER_HOTSPOT_SUFFIX, visited.get(geneName)); } + + logger.info(PARSING_DONE_LOG_MESSAGE, cancerHotspot); } public List getCancerHotspot(String geneName) throws RocksDBException, IOException { @@ -422,29 +414,25 @@ public List getCancerHotspot(String geneName) throws RocksDBExcep return rocksDbManager.getCancerHotspot(rocksdb, key); } - protected void indexTSO500(Path tso500Path) throws IOException, RocksDBException { - // Gene Ref Seq - // FAS NM_000043 - // AR NM_000044 - logger.info("Indexing TSO500 data ..."); - - if (tso500Path != null && Files.exists(tso500Path) && Files.size(tso500Path) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); - } + logger.info(PARSING_LOG_MESSAGE, tso500Path); + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(tso500Path)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // FAS NM_000043 + // AR NM_000044 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length == 2) { + rocksDbManager.update(rocksdb, fields[1] + TSO500_SUFFIX, "TSO500"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl TSO500 mapping file " + tso500Path + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, tso500Path); } public String getTSO500(String transcriptId) throws RocksDBException { @@ -456,29 +444,25 @@ public String getTSO500(String transcriptId) throws RocksDBException { return new String(bytes); } - protected void indexEGLHHaemOnc(Path eglhHaemOncPath) throws IOException, RocksDBException { - // Gene Ref Seq - // GNB1 NM_002074.4 - // CSF3R NM_000760.3 - logger.info("Indexing EGLH HaemOnc data ..."); - - if (eglhHaemOncPath != null && Files.exists(eglhHaemOncPath) && Files.size(eglhHaemOncPath) > 0) { - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(eglhHaemOncPath)) { - String line = bufferedReader.readLine(); - while (StringUtils.isNotEmpty(line)) { - if (!line.startsWith("#")) { - String[] fields = line.split("\t", -1); - if (fields.length == 2) { - rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); - } + logger.info(PARSING_LOG_MESSAGE, eglhHaemOncPath); + + try (BufferedReader bufferedReader = 
FileUtils.newBufferedReader(eglhHaemOncPath)) { + String line = bufferedReader.readLine(); + // Gene Ref Seq + // GNB1 NM_002074.4 + // CSF3R NM_000760.3 + while (StringUtils.isNotEmpty(line)) { + if (!line.startsWith("#")) { + String[] fields = line.split("\t", -1); + if (fields.length == 2) { + rocksDbManager.update(rocksdb, fields[1].split("\\.")[0] + EGLH_HAEMONC_SUFFIX, "EGLH_HaemOnc"); } - line = bufferedReader.readLine(); } + line = bufferedReader.readLine(); } - } else { - logger.warn("Ensembl EGLH HaemOnc mapping file " + eglhHaemOncPath + " not found"); } + logger.info(PARSING_DONE_LOG_MESSAGE, eglhHaemOncPath); } public String getEGLHHaemOnc(String transcriptId) throws RocksDBException { @@ -510,4 +494,219 @@ protected void close() throws IOException { rocksDbManager.closeIndex(rocksdb, dbOption, dbLocation); } + protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); + + String currentGene = ""; + List drugs = new ArrayList<>(); + + try (BufferedReader br = FileUtils.newBufferedReader(geneDrugFile)) { + // Skip header + br.readLine(); + + int lineCounter = 1; + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split("\t"); + String geneName = parts[0]; + if (currentGene.equals("")) { + currentGene = geneName; + } else if (!currentGene.equals(geneName)) { + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + drugs = new ArrayList<>(); + currentGene = geneName; + } + + String source = null; + if (parts.length >= 4) { + source = parts[3]; + } + + String interactionType = null; + if (parts.length >= 5) { + interactionType = parts[4]; + } + + String drugName = null; + if (parts.length >= 8) { + // if drug name column is empty, use drug claim name instead + drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; + } + if (StringUtils.isEmpty(drugName)) { + // no drug name + continue; + } + + String chemblId = null; + if (parts.length >= 9) { + chemblId = parts[8]; + } + + List publications = new ArrayList<>(); + if (parts.length >= 10 && parts[9] != null) { + publications = Arrays.asList(parts[9].split(",")); + } + + GeneDrugInteraction drug = new GeneDrugInteraction( + geneName, drugName, source, null, null, interactionType, chemblId, publications); + drugs.add(drug); + lineCounter++; + } + } + // update last gene + rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); + + logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); + } + + protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { + + Map> geneDiseaseAssociationMap = new HashMap<>(50000); + + String line; + + // HPO +// logger.info(PARSING_LOG_MESSAGE, hpoFilePath); +// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { +// // Skip first header line +// bufferedReader.readLine(); +// while ((line = bufferedReader.readLine()) != null) { +// String[] fields = line.split("\t"); +// String omimId = fields[6]; +// String geneSymbol = fields[3]; +// String hpoId = fields[0]; +// String diseaseName = fields[1]; +// GeneTraitAssociation disease = +// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DATA); +// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); +// } +// } +// logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); + + // DisGeNet + logger.info(PARSING_LOG_MESSAGE, disgenetFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { + // Skip first header line + bufferedReader.readLine(); + while ((line = bufferedReader.readLine()) != null) { + String[] fields = line.split("\t"); + String diseaseId = fields[4]; + String diseaseName = fields[5]; + String score = fields[9]; + String numberOfPubmeds = fields[13].trim(); + String numberOfSNPs = fields[14]; + String source = fields[15]; + GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), + Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), DISGENET_DATA); + addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, disgenetFilePath); + + for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); + } + } + + protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException, CellBaseException { + logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile); + + try (BufferedReader reader = Files.newBufferedReader(miRTarBaseFile)) { + String line; + // Skip header line + reader.readLine(); + + String currentMiRTarBaseId = null; + String currentMiRNA = null; + String currentGene = null; + List targetGenes = new ArrayList<>(); + Map> geneToMirna = new HashMap<>(); + + while ((line = reader.readLine()) != null) { + String[] field = line.split("\t", -1); + if (field.length != 9) { + throw new CellBaseException("Invalid number of columns " + field.length + " (expected 9 columns) parsing file " + + miRTarBaseFile + ". 
Line: " + line); + } + + // #0: miRTarBase ID + String miRTarBaseId = field[0]; + if (currentMiRTarBaseId == null) { + currentMiRTarBaseId = miRTarBaseId; + } + + // #1: miRNA + String miRNA = field[1]; + if (currentMiRNA == null) { + currentMiRNA = miRNA; + } + + // #2: Species (miRNA) + + // #3: Target Gene + String geneName = field[3]; + if (currentGene == null) { + currentGene = geneName; + } + + // #4: Target Gene (Entrez ID) + // #5: Species (Target Gene) + + if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { + // new entry, store current one + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + targetGenes = new ArrayList<>(); + currentGene = geneName; + currentMiRTarBaseId = miRTarBaseId; + currentMiRNA = miRNA; + } + + // #6: Experiments + String experiment = field[6]; + + // #7: Support Type + String supportType = field[7]; + + // #8: pubmed + String pubmed = field[8]; + + targetGenes.add(new TargetGene(experiment, supportType, pubmed)); + } + + // parse last entry + MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes); + addValueToMapElement(geneToMirna, currentGene, miRnaTarget); + + for (Map.Entry> entry : geneToMirna.entrySet()) { + rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile); + } + + protected static void addValueToMapElement(Map> map, String key, T value) { + if (map.containsKey(key)) { + map.get(key).add(value); + } else { + List valueList = new ArrayList<>(); + valueList.add(value); + map.put(key, valueList); + } + } + + protected List getDrugs(String id) throws RocksDBException, IOException { + String key = id + DRUGS_SUFFIX; + return rocksDbManager.getDrugs(rocksdb, key); + } + + protected List getDiseases(String id) throws RocksDBException, IOException { + String key = id + DISEASE_SUFFIX; + return rocksDbManager.getDiseases(rocksdb, key); + } + + protected List getMirnaTargets(String geneName) throws RocksDBException, IOException { + String key = geneName + MIRTARBASE_SUFFIX; + return rocksDbManager.getMirnaTargets(rocksdb, key); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..b14d20b54c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -19,60 +19,70 @@ import org.opencb.biodata.formats.obo.OboParser; import org.opencb.biodata.models.core.OntologyTerm; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; import java.nio.file.Path; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class OntologyBuilder extends CellBaseBuilder { - private Path hpoFile; - private Path goFile; - private Path doidFile; - private Path mondoFile; + private Path oboDownloadPath; - public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { + public OntologyBuilder(Path oboDownloadPath, CellBaseSerializer serializer) { 
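
Both indexDrugs and indexMiRTarBase above rely on the same streaming idiom: the input TSV is assumed to be grouped by its leading key, so a batch is flushed whenever the key changes, and once more after the loop for the trailing group (the "parse last entry" step). A self-contained sketch of the idiom, with println standing in for rocksDbManager.update and invented row values:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RunGroupingSketch {
    public static void main(String[] args) {
        List<String[]> rows = Arrays.asList(
                new String[]{"MIRT000002", "interaction-a"},
                new String[]{"MIRT000002", "interaction-b"},
                new String[]{"MIRT000006", "interaction-c"});

        String currentKey = null;
        List<String> batch = new ArrayList<>();
        for (String[] row : rows) {
            if (currentKey != null && !currentKey.equals(row[0])) {
                // Key changed: flush the finished group (stand-in for rocksDbManager.update)
                System.out.println(currentKey + " -> " + batch);
                batch = new ArrayList<>();
            }
            currentKey = row[0];
            batch.add(row[1]);
        }
        if (currentKey != null) {
            // Don't forget the trailing group after the loop ends
            System.out.println(currentKey + " -> " + batch);
        }
    }
}
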
super(serializer); - hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); - goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); - doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); + this.oboDownloadPath = oboDownloadPath; } @Override public void parse() throws Exception { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFile); - OboParser parser = new OboParser(); - List terms = parser.parseOBO(bufferedReader, "Human Phenotype Ontology"); - for (OntologyTerm term : terms) { - term.setSource("HP"); - serializer.serialize(term); - } + logger.info(BUILDING_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); - bufferedReader = FileUtils.newBufferedReader(goFile); - terms = parser.parseOBO(bufferedReader, "Gene Ontology"); - for (OntologyTerm term : terms) { - term.setSource("GO"); - serializer.serialize(term); - } + // Sanity check + checkDirectory(oboDownloadPath, getDataName(REGULATION_DATA)); - bufferedReader = FileUtils.newBufferedReader(doidFile); - terms = parser.parseOBO(bufferedReader, "Human Disease Ontology"); - for (OntologyTerm term : terms) { - term.setSource("DOID"); - serializer.serialize(term); - } + // Check ontology files + List hpoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(HPO_OBO_DATA)), getDataName(HPO_OBO_DATA)); + List goFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(GO_OBO_DATA)), getDataName(GO_OBO_DATA)); + List doidFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(DOID_OBO_DATA)), getDataName(DOID_OBO_DATA)); + List mondoFiles = checkOboFiles(oboDownloadPath.resolve(getDataVersionFilename(MONDO_OBO_DATA)), getDataName(MONDO_OBO_DATA)); - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); - } + // Parse OBO files and build + parseOboFile(hpoFiles.get(0), HPO_OBO_DATA); + parseOboFile(goFiles.get(0), GO_OBO_DATA); + parseOboFile(doidFiles.get(0), DOID_OBO_DATA); + parseOboFile(mondoFiles.get(0), MONDO_OBO_DATA); + // Close serializer serializer.close(); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(ONTOLOGY_DATA)); + } + + private void parseOboFile(File oboFile, String data) throws IOException { + logger.info(PARSING_LOG_MESSAGE, oboFile); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(oboFile.toPath())) { + OboParser parser = new OboParser(); + List terms = parser.parseOBO(bufferedReader, data); + for (OntologyTerm term : terms) { + serializer.serialize(term); + } + } + logger.info(PARSING_DONE_LOG_MESSAGE, oboFile); + } + + private List checkOboFiles(Path versionFilePath, String name) throws IOException, CellBaseException { + List files = checkFiles(dataSourceReader.readValue(versionFilePath.toFile()), oboDownloadPath, getDataName(ONTOLOGY_DATA) + + "/" + name); + if (files.size() != 1) { + throw new CellBaseException("One " + name + " file is expected, but currently there are " + files.size() + " files"); + } + return files; } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java index 1f7a4836ca..1a0ba2e7d3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -23,13 +23,17 @@ 
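
checkOboFiles above enforces a fail-fast invariant: each ontology version file must resolve to exactly one OBO file before parsing starts, with the same error message shape as the builder. A generic stand-in sketch of that check, assuming a plain directory listing instead of CellBase's DataSource metadata; the glob and directory are hypothetical.

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SingleFileCheckSketch {
    // Resolve candidate files and insist on exactly one, failing early with a
    // descriptive message instead of parsing an ambiguous input set
    static Path requireSingleFile(Path dir, String nameRegex, String dataName) throws Exception {
        try (Stream<Path> stream = Files.list(dir)) {
            List<Path> files = stream
                    .filter(p -> p.getFileName().toString().matches(nameRegex))
                    .collect(Collectors.toList());
            if (files.size() != 1) {
                throw new IllegalStateException("One " + dataName + " file is expected, but currently there are "
                        + files.size() + " files");
            }
            return files.get(0);
        }
    }

    public static void main(String[] args) throws Exception {
        System.out.println(requireSingleFile(Paths.get("/tmp/obo"), "hp\\.obo(\\.gz)?", "HPO"));
    }
}
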
import org.opencb.biodata.models.core.Xref; import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.pharma.guideline.BasicObject; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; @@ -37,8 +41,7 @@ public class PharmGKBBuilder extends CellBaseBuilder { - private final Path inputDir; - private final Path pharmGKBDir; + private final Path pharmGkbDownloadPath; private static final String CHEMICALS_BASENAME = "chemicals"; private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv"; @@ -88,21 +91,25 @@ public class PharmGKBBuilder extends CellBaseBuilder { private static final String PHARMGKB_LAST_UPDATE_DATE_KEY = "PHARMGKB_LAST_UPDATE_DATE"; private static final String PHARMGKB_IS_VIP_KEY = "PHARMGKB_IS_VIP"; - public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) { + public PharmGKBBuilder(Path parmGkbDownloadPath, CellBaseFileSerializer serializer) { super(serializer); - - this.inputDir = inputDir; - this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA); + this.pharmGkbDownloadPath = parmGkbDownloadPath; } @Override public void parse() throws Exception { - // Check input folder - FileUtils.checkDirectory(inputDir); + logger.info(BUILDING_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); + + // Sanity check + checkDirectory(pharmGkbDownloadPath, getDataName(PHARMGKB_DATA)); - // PharmGKB - FileUtils.checkDirectory(pharmGKBDir); - logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME); + // Check PharmGKB files + DataSource dataSource = dataSourceReader.readValue(pharmGkbDownloadPath.resolve(getDataVersionFilename(PHARMGKB_DATA)).toFile()); + List pharmGkbFiles = checkFiles(dataSource, pharmGkbDownloadPath, getDataCategory(PHARMGKB_DATA) + "/" + + getDataName(PHARMGKB_DATA)); + + // Unzip downloaded file + unzipDownloadedFiles(pharmGkbFiles); // Parse chemical file Map chemicalsMap = parseChemicalFile(); @@ -113,8 +120,6 @@ public void parse() throws Exception { // Parse gene file parseGeneFile(chemicalsMap); - logger.info("Parsing {} files finished.", PHARMGKB_NAME); - // Generation the pharmacogenomics JSON file logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir()); int counter = 0; @@ -125,11 +130,14 @@ public void parse() throws Exception { } } serializer.close(); - logger.info("Writing {} JSON file done!", PHARMACOGENOMICS_DATA); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PHARMGKB_DATA)); } private Map parseChemicalFile() throws IOException { - Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + Path chemicalsFile = serializer.getOutdir().resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, chemicalsFile); + Map chemicalsMap = new HashMap<>(); try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) { // Skip first line, i.e. 
the header line @@ -146,7 +154,7 @@ private Map parseChemicalFile() throws IOException { // Label Has Dosing Info Has Rx Annotation RxNorm Identifiers ATC Identifiers PubChem Compound Identifiers PharmaChemical pharmaChemical = new PharmaChemical() .setId(fields[0]) - .setSource(PHARMGKB_NAME) + .setSource(PHARMGKB_DATA) .setName(fields[1]) .setSmiles(fields[7]) .setInChI(fields[8]); @@ -177,6 +185,7 @@ private Map parseChemicalFile() throws IOException { } logger.info("Number of Chemical items read {}", chemicalsMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, chemicalsFile); return chemicalsMap; } @@ -192,8 +201,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM Map> variantMap = parseVariantFile(); // clinical_annotations.tsv - try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME) - .resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) { + Path clinAnnotPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, clinAnnotPath); + try (BufferedReader br = FileUtils.newBufferedReader(clinAnnotPath)) { // Skip first line, i.e. the header line String line = br.readLine(); while ((line = br.readLine()) != null) { @@ -278,6 +288,7 @@ private void parseClinicalAnnotationFiles(Map chemicalsM } } } + logger.info(PARSING_DONE_LOG_MESSAGE, clinAnnotPath); // Update the clinical annotation map by parsing the clinical annotation evidences parseClinicalAnnotationEvidenceFile(variantAnnotationMap); @@ -300,7 +311,9 @@ private void parseClinicalAnnotationFiles(Map chemicalsM private Map> parseVariantFile() throws IOException { Map> variantMap = new HashMap<>(); // Parse the variant file (i.e., variants.tsv) - Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + Path varPath = serializer.getOutdir().resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varPath); + try (BufferedReader br = FileUtils.newBufferedReader(varPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -367,6 +380,7 @@ private Map> parseVariantFile() throws IOException { } logger.info("Number of variants = {}", variantMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, varPath); return variantMap; } @@ -385,7 +399,8 @@ private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException { // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) - Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + Path allelesPath = serializer.getOutdir().resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, allelesPath); try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -502,12 +522,14 @@ private void parseClinicalAnnotationAlleleFile(Map variantAssociationMap) throws IOException { // For CellBase, variant association corresponds to PharmGKB variant annotation // Parse the variant annotation file (i.e., var_drug_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -562,6 +584,7 @@ private void parseVariantAnnotationFile(Map va } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private Map parseGuidelineAnnotationFiles() throws IOException { @@ -571,7 +594,7 @@ private Map parseGuidelineAnnotationFiles() t ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class); // Parse the guideline annotations JSON files - Path guidelinesPath = pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME); + Path guidelinesPath = serializer.getOutdir().resolve(GUIDELINE_ANNOTATIONS_BASENAME); FileUtils.checkDirectory(guidelinesPath); for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) { if (file.getName().endsWith("json")) { @@ -593,7 +616,8 @@ private Map parseGuidelineAnnotationFiles() t private Map parseDrugLabelAnnotationFile() throws IOException { Map drugLabelAnnotationMap = new HashMap<>(); // Parse the drug labels annotations file (i.e., drugLabels.tsv) - Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + Path drugLabelPath = serializer.getOutdir().resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, drugLabelPath); try (BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -631,12 +655,15 @@ private Map parseDrugLabelAnnotationFile() th } logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, drugLabelPath); return drugLabelAnnotationMap; } private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_pheno_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); + int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. 
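
The guideline annotations above are one JSON document per file, all read through a single pre-configured Jackson ObjectReader. A sketch of that shape; GuidelineDoc is a hypothetical stand-in POJO for PharmaGuidelineAnnotation, and the directory is illustrative.

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import java.io.File;
import java.util.Objects;

public class JsonDirReaderSketch {
    // Hypothetical document shape; Jackson binds public fields by default
    public static class GuidelineDoc {
        public String id;
        public String summary;
    }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        // One reusable reader avoids re-configuring the mapper per file
        ObjectReader reader = mapper.readerFor(GuidelineDoc.class);
        File dir = new File("/tmp/guidelines");
        for (File file : Objects.requireNonNull(dir.listFiles())) {
            if (file.getName().endsWith("json")) {
                GuidelineDoc doc = reader.readValue(file);
                System.out.println(doc.id + ": " + doc.summary);
            }
        }
    }
}
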
the header line @@ -691,11 +718,13 @@ private void parsePhenotypeAnnotationFile(Map } } logger.info("Number of phenotype annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException { // Parse the variant annotation file (i.e., var_fa_ann.tsv) - Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + Path varDrugPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, varDrugPath); int counter = 0; try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { // Skip first line, i.e. the header line @@ -751,12 +780,14 @@ private void parseFunctionalAnnotationFile(Map } } logger.info("Number of variant annotations = {}", counter); + logger.info(PARSING_DONE_LOG_MESSAGE, varDrugPath); } private void parseStudyParameterFile(Map variantAssociationMap) throws IOException { Map> studyParametersMap = new HashMap<>(); // Parse the study parameters file (i.e., study_parameters.tsv) - Path studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + Path studyParamsPath = serializer.getOutdir().resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, studyParamsPath); try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -807,6 +838,7 @@ private void parseStudyParameterFile(Map varia } } logger.info("Number of study parameters lines = {}", studyParametersMap.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, studyParamsPath); for (Map.Entry> entry : studyParametersMap.entrySet()) { if (variantAssociationMap.containsKey(entry.getKey())) { @@ -861,7 +893,8 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx // Parse the genes file (i.e., genes.tsv) Map geneAnnotationMapByPgkbGeneId = new HashMap<>(); - Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + Path genesPath = serializer.getOutdir().resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, genesPath); try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) { // Skip first line, i.e. the header line String line = br.readLine(); @@ -940,13 +973,15 @@ private void parseGeneFile(Map chemicalsMap) throws IOEx } logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size()); + logger.info(PARSING_DONE_LOG_MESSAGE, genesPath); } private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName, Map geneAnnotationMapByPgkbGeneId) throws IOException { int counter = 0; // Parse the genes file (i.e., relationships.tsv) - Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + Path relationshipsPath = serializer.getOutdir().resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + logger.info(PARSING_LOG_MESSAGE, relationshipsPath); try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) { // Skip first line, i.e. 
the header line String line = br.readLine(); @@ -986,6 +1021,7 @@ private void parseChemicalGeneRelationships(Map> pgkbGeneIdM } } logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter); + logger.info(PARSING_DONE_LOG_MESSAGE, relationshipsPath); } private List stringFieldToList(String field) { @@ -1011,6 +1047,29 @@ private boolean isHaplotype(String value) { } private List getHaplotypeList(String value) { - return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList()); + return Arrays.stream(value.split(",")).map(String::trim).collect(Collectors.toList()); + } + + private void unzipDownloadedFiles(List pharmGkbFiles) throws CellBaseException { + // Unzip + for (File pharmGgkFile : pharmGkbFiles) { + logger.info("Unzip file: {}", pharmGgkFile); + try { + String outPath = serializer.getOutdir().resolve(pharmGgkFile.getName().split("\\.")[0]).toString(); + List params = Arrays.asList("-d", outPath, "-o", pharmGgkFile.toString()); + EtlCommons.runCommandLineProcess(null, "unzip", params, Paths.get(outPath + ".log").toString()); + } catch (CellBaseException e) { + if (pharmGgkFile.getName().contains(GUIDELINE_ANNOTATIONS_BASENAME)) { + // It fails because of long filenames, so it does not raise any exception + logger.warn(e.getMessage()); + } + } catch (IOException e) { + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } catch (InterruptedException e) { + // Restore interrupted state... + Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing unzip in file " + pharmGgkFile, e); + } + } } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java new file mode 100644 index 0000000000..9e326013fc --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PolygenicScoreBuilder.java @@ -0,0 +1,569 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.pgs.CommonPolygenicScore; +import org.opencb.biodata.models.core.pgs.PgsCohort; +import org.opencb.biodata.models.core.pgs.PolygenicScore; +import org.opencb.biodata.models.core.pgs.VariantPolygenicScore; +import org.opencb.biodata.models.variant.avro.OntologyTermAnnotation; +import org.opencb.biodata.models.variant.avro.PubmedReference; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.commons.utils.FileUtils; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.RocksIterator; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +public class PolygenicScoreBuilder extends CellBaseBuilder { + + private String source; + private String version; + + private Path pgsDir; + private CellBaseFileSerializer fileSerializer; + + protected Map rdbConnectionPerChrom = new HashMap<>(); + + protected static ObjectMapper mapper; + protected static ObjectReader varPgsReader; + protected static ObjectWriter jsonObjectWriter; + + public static final String COMMON_POLYGENIC_SCORE_FILENAME = "common_polygenic_score.json.gz"; + public static final String VARIANT_POLYGENIC_SCORE_FILENAME = "variant_polygenic_score.json.gz"; + + private static final String RSID_COL = "rsID"; + private static final String CHR_NAME_COL = "chr_name"; + private static final String EFFECT_ALLELE_COL = "effect_allele"; + private static final String OTHER_ALLELE_COL = "other_allele"; + private static final String EFFECT_WEIGHT_COL = "effect_weight"; + private static final String ALLELEFREQUENCY_EFFECT_COL = "allelefrequency_effect"; + private static final String ODDS_RATIO_COL = "OR"; + private static final String HAZARD_RATIO_COL = "HR"; + private static final String LOCUS_NAME_COL = "locus_name"; + private static final String IS_HAPLOTYPE_COL = "is_haplotype"; + private static final String IS_DIPLOTYPE_COL = "is_diplotype"; + private static final String IMPUTATION_METHOD_COL = "imputation_method"; + private static final String VARIANT_DESCRIPTION_COL = "variant_description"; + private static final String INCLUSION_CRITERIA_COL = "inclusion_criteria"; + private static final String IS_INTERACTION_COL = "is_interaction"; + private static final String IS_DOMINANT_COL = "is_dominant"; + private static final String IS_RECESSIVE_COL = "is_recessive"; + private static final String DOSAGE_0_WEIGHT_COL = "dosage_0_weight"; + private static final String DOSAGE_1_WEIGHT_COL = "dosage_1_weight"; + private static final String DOSAGE_2_WEIGHT_COL = "dosage_2_weight"; + private static final String HM_RSID_COL = "hm_rsID"; + private static final String HM_CHR_COL = "hm_chr"; + private static final String HM_POS_COL = "hm_pos"; + private static final String HM_INFEROTHERALLELE_COL = "hm_inferOtherAllele"; + + public static final String SAMPLE_SET_KEY = "Sample Set"; + public static final String ODDS_RATIO_KEY = "Odds 
ratio";
+    public static final String HAZARD_RATIO_KEY = "Hazard ratio";
+    public static final String BETA_KEY = "Beta";
+    public static final String AUROC_KEY = "AUROC"; // Area Under the Receiver-Operating Characteristic Curve (AUROC)
+    public static final String CINDEX_KEY = "C-index"; // Concordance Statistic (C-index)
+    public static final String OTHER_KEY = "Other metric";
+    private static final String EFFECT_WEIGHT_KEY = "Effect weight";
+    private static final String ALLELE_FREQUENCY_EFFECT_KEY = "Allele frequency effect";
+    private static final String LOCUS_NAME_KEY = "Locus name";
+    private static final String IS_HAPLOTYPE_KEY = "Haplotype";
+    private static final String IS_DIPLOTYPE_KEY = "Diplotype";
+    private static final String IMPUTATION_METHOD_KEY = "Imputation method";
+    private static final String VARIANT_DESCRIPTION_KEY = "Variant description";
+    private static final String INCLUSION_CRITERIA_KEY = "Score inclusion criteria";
+    private static final String IS_INTERACTION_KEY = "Interaction";
+    private static final String IS_DOMINANT_KEY = "Dominant inheritance model";
+    private static final String IS_RECESSIVE_KEY = "Recessive inheritance model";
+    private static final String DOSAGE_0_WEIGHT_KEY = "Effect weight with 0 copies of the effect allele";
+    private static final String DOSAGE_1_WEIGHT_KEY = "Effect weight with 1 copy of the effect allele";
+    private static final String DOSAGE_2_WEIGHT_KEY = "Effect weight with 2 copies of the effect allele";
+
+    private static final Set<String> VALID_CHROMOSOMES = new HashSet<>(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
+            "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT", "M"));
+
+    static {
+        mapper = new ObjectMapper();
+        mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
+        varPgsReader = mapper.readerFor(VariantPolygenicScore.class);
+        jsonObjectWriter = mapper.writer();
+    }
+
+    public PolygenicScoreBuilder(String source, String version, Path pgsDir, CellBaseFileSerializer serializer) {
+        super(serializer);
+
+        this.source = source;
+        this.version = version;
+
+        this.fileSerializer = serializer;
+        this.pgsDir = pgsDir;
+
+        logger = LoggerFactory.getLogger(PolygenicScoreBuilder.class);
+    }
+
+    @Override
+    public void parse() throws Exception {
+        // Check input folder
+        FileUtils.checkPath(pgsDir);
+
+        logger.info("Parsing polygenic score (PGS) files...");
+
+        BufferedWriter bw = FileUtils.newBufferedWriter(serializer.getOutdir().resolve(COMMON_POLYGENIC_SCORE_FILENAME));
+
+        for (File file : pgsDir.toFile().listFiles()) {
+            if (file.isFile()) {
+                if (file.getName().endsWith(".txt.gz")) {
+                    logger.info("Processing PGS file: {}", file.getName());
+
+                    String pgsId = null;
+                    Map<String, Integer> columnPos = new HashMap<>();
+
+                    BufferedReader br = FileUtils.newBufferedReader(file.toPath());
+                    String line;
+                    while ((line = br.readLine()) != null) {
+                        if (line.startsWith("#")) {
+                            if (line.startsWith("#pgs_id=")) {
+                                pgsId = line.split("=")[1].trim();
+                                // Sanity check
+                                if (!file.getName().startsWith(pgsId)) {
+                                    throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id mismatch");
+                                }
+                            }
+                        } else if (line.startsWith(RSID_COL) || line.startsWith(CHR_NAME_COL)) {
+                            String[] fields = line.split("\t");
+                            for (int i = 0; i < fields.length; i++) {
+                                columnPos.put(fields[i], i);
+                            }
+                        } else {
+                            // Sanity check
+                            if (pgsId == null) {
+                                throw new CellBaseException("Error parsing file " + file.getName() + ": pgs_id is null");
+                            }
+                            saveVariantPolygenicScore(line, columnPos,
pgsId); + } + } + br.close(); + } else if (file.getName().endsWith("_metadata.tar.gz")) { + processPgsMetadataFile(file, bw); + } + } + } + + // Serialize/write the saved variant polygenic scores in the RocksDB + serializeRDB(); + serializer.close(); + + // Close PGS file (with common attributes) + bw.close(); + + logger.info("Parsing PGS files finished."); + } + + private void processPgsMetadataFile(File metadataFile, BufferedWriter bw) throws IOException, CellBaseException { + String pgsId = metadataFile.getName().split("_")[0]; + + Path tmp = pgsDir.resolve("tmp"); + if (!tmp.toFile().exists()) { + tmp.toFile().mkdirs(); + } + + String command = "tar -xzf " + metadataFile.getAbsolutePath() + " -C " + tmp.toAbsolutePath(); + Process process = Runtime.getRuntime().exec(command); + + // Wait for the process to complete + int exitCode; + try { + exitCode = process.waitFor(); + } catch (InterruptedException e) { + throw new IOException("Error waiting for the process to complete.", e); + } + + // Check the exit code + if (exitCode != 0) { + throw new IOException("Error executing the command. Exit code: " + exitCode); + } + + // Create PGS object, with the common fields + CommonPolygenicScore pgs = new CommonPolygenicScore(); + pgs.setId(pgsId); + pgs.setSource(source); + pgs.setVersion(version); + + String line; + String[] field; + BufferedReader br; + // PGSxxxxx_metadata_publications.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_publications.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 5 6 + // PGS Publication/Study (PGP) ID First Author Title Journal Name Publication Date Release Date Authors + // 7 8 + // digital object identifier (doi) PubMed ID (PMID) + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getPubmedRefs().add(new PubmedReference(strings.get(8), strings.get(2), strings.get(3), strings.get(4), null)); + } + + // PGSxxxxx_metadata_efo_traits.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_efo_traits.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 + // Ontology Trait ID Ontology Trait Label Ontology Trait Description Ontology URL + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getTraits().add(new OntologyTermAnnotation(strings.get(0), strings.get(1), strings.get(2), "EFO", strings.get(3), + new HashMap<>())); + } + + // PGSxxxxx_metadata_scores.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_scores.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // Polygenic Score (PGS) ID PGS Name Reported Trait Mapped Trait(s) (EFO label) Mapped Trait(s) (EFO ID) + // 5 6 7 8 + // PGS Development Method PGS Development Details/Relevant Parameters Original Genome Build Number of Variants + // 9 10 11 12 13 + // Number of Interaction Terms Type of Variant Weight PGS Publication (PGP) ID Publication (PMID) Publication (doi) + // 14 15 + // Score and results match the original publication Ancestry Distribution (%) - Source of Variant Associations (GWAS) + // 16 17 18 19 + // Ancestry Distribution (%) - Score Development/Training Ancestry Distribution (%) - PGS Evaluation FTP link Release Date + 
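
These metadata CSVs quote fields that themselves contain commas (for example "Ancestry (e.g. French, Chinese)"), so each line is handed to commons-csv through a StringReader instead of a bare String.split(","). A minimal sketch of that per-line parse; the example line is invented.

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import java.io.StringReader;

public class CsvLineSketch {
    public static void main(String[] args) throws Exception {
        // A naive split(",") would shear the quoted author field apart
        String line = "PGS000001,Breast cancer,\"Mavaddat N, et al.\",25855707";
        try (CSVParser parser = CSVFormat.DEFAULT.parse(new StringReader(line))) {
            CSVRecord record = parser.getRecords().get(0);
            System.out.println(record.get(2)); // -> Mavaddat N, et al.
        }
    }
}
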
// 19 + // License/Terms of Use + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + // Sanity check + if (!pgsId.equals(strings.get(0))) { + throw new CellBaseException("Mismatch PGS ID when parsing file " + pgsId + "_metadata_scores.csv"); + } + if (StringUtils.isNotEmpty(pgs.getName())) { + throw new CellBaseException("More than one PGS in file " + pgsId + "_metadata_scores.csv"); + } + pgs.setName(strings.get(1)); + } + + // TODO: PGSxxxxx_metadata_score_development_samples.csv + // 0 1 2 3 4 + // Polygenic Score (PGS) ID Stage of PGS Development Number of Individuals Number of Cases Number of Controls + // 5 6 7 8 + // Percent of Participants Who are Male Sample Age Broad Ancestry Category "Ancestry (e.g. French, Chinese)" + // 9 10 11 12 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 13 13 14 15 16 + // GWAS Catalog Study ID (GCST...) Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_performance_metrics.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_performance_metrics.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 3 4 + // PGS Performance Metric (PPM) ID Evaluated Score PGS Sample Set (PSS) PGS Publication (PGP) ID Reported Trait + // 5 6 7 8 + // Covariates Included in the Model PGS Performance: Other Relevant Information Publication (PMID) Publication (doi) + // 9 10 11 12 + // Hazard Ratio (HR) Odds Ratio (OR) Beta Area Under the Receiver-Operating Characteristic Curve (AUROC) + // 13 14 + // Concordance Statistic (C-index) Other Metric(s) + + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + + // Sanity check + if (!pgsId.equals(strings.get(1))) { + continue; + } + + Map values = new HashMap<>(); + if (StringUtils.isNotEmpty(strings.get(2))) { + values.put(SAMPLE_SET_KEY, strings.get(2)); + } + if (StringUtils.isNotEmpty(strings.get(9))) { + values.put(HAZARD_RATIO_KEY, strings.get(9)); + } + if (StringUtils.isNotEmpty(strings.get(10))) { + values.put(ODDS_RATIO_KEY, strings.get(10)); + } + if (StringUtils.isNotEmpty(strings.get(11))) { + values.put(BETA_KEY, strings.get(11)); + } + if (StringUtils.isNotEmpty(strings.get(12))) { + values.put(AUROC_KEY, strings.get(12)); + } + if (StringUtils.isNotEmpty(strings.get(13))) { + values.put(CINDEX_KEY, strings.get(13)); + } + if (StringUtils.isNotEmpty(strings.get(14))) { + values.put(OTHER_KEY, strings.get(14)); + } + pgs.getValues().add(values); + } + + // TODO: PGSxxxxx_metadata_evaluation_sample_sets.csv + // 0 1 2 3 4 + // PGS Sample Set (PSS) Polygenic Score (PGS) ID Number of Individuals Number of Cases Number of Controls + // 5 6 7 + // Percent of Participants Who are Male Sample Age,Broad Ancestry Category "Ancestry (e.g.French, Chinese)" + // 8 9 10 11 + // Country of Recruitment Additional Ancestry Description Phenotype Definitions and Methods Followup Time + // 12 13 14 15 16 + // GWAS Catalog Study ID (GCST...) 
Source PubMed ID (PMID) Source DOI Cohort(s) Additional Sample/Cohort Information + + // PGSxxxxx_metadata_cohorts.csv + br = FileUtils.newBufferedReader(tmp.resolve(pgsId + "_metadata_cohorts.csv")); + // Skip first line + line = br.readLine(); + while ((line = br.readLine()) != null) { + // 0 1 2 + // Cohort ID Cohort Name Previous/other/additional names + StringReader stringReader = new StringReader(line); + CSVParser csvParser = CSVFormat.DEFAULT.parse(stringReader); + CSVRecord strings = csvParser.getRecords().get(0); + pgs.getCohorts().add(new PgsCohort(strings.get(0), strings.get(1), strings.get(2))); + } + + // Create PGS object, with the common fields + bw.write(jsonObjectWriter.writeValueAsString(pgs)); + bw.write("\n"); + + // Clean tmp folder + for (File tmpFile : tmp.toFile().listFiles()) { + tmpFile.delete(); + } + } + + private void saveVariantPolygenicScore(String line, Map columnPos, String pgsId) + throws RocksDBException, IOException { + String chrom; + int position; + String effectAllele; + String otherAllele; + + String[] field = line.split("\t", -1); + + if (columnPos.containsKey(HM_CHR_COL)) { + chrom = field[columnPos.get(HM_CHR_COL)]; + if (!VALID_CHROMOSOMES.contains(chrom)) { + // Only chromosomes are processed; no contigs, e.g.: 8_KI270821v1_alt, 11_KI270927v1_alt, 12_GL877875v1_alt,... + return; + } + } else { + logger.warn("Missing field '{}', skipping line: {}", HM_CHR_COL, line); + return; + } + if (columnPos.containsKey(HM_POS_COL)) { + try { + position = Integer.parseInt(field[columnPos.get(HM_POS_COL)]); + } catch (NumberFormatException e) { + logger.warn("Invalid field '{}' (value = {}), skipping line: {}", HM_POS_COL, field[columnPos.get(HM_POS_COL)], line); + return; + } + } else { + logger.warn("Missing field '{}', skipping line: {}", HM_POS_COL, line); + return; + } + if (columnPos.containsKey(EFFECT_ALLELE_COL)) { + effectAllele = field[columnPos.get(EFFECT_ALLELE_COL)]; + } else { + logger.warn("Missing field '{}', skipping line: {}", EFFECT_ALLELE_COL, line); + return; + } + if (columnPos.containsKey(HM_INFEROTHERALLELE_COL) && StringUtils.isNotEmpty(field[columnPos.get(HM_INFEROTHERALLELE_COL)])) { + otherAllele = field[columnPos.get(HM_INFEROTHERALLELE_COL)]; + } else if (columnPos.containsKey(OTHER_ALLELE_COL)) { + otherAllele = field[columnPos.get(OTHER_ALLELE_COL)]; + } else { + logger.warn("Missing fields '{}' and '{}' (at least one is mandatory), skipping line: {}", HM_INFEROTHERALLELE_COL, + OTHER_ALLELE_COL, line); + return; + } + + // Create polygenic score + Map values = new HashMap<>(); + if (columnPos.containsKey(EFFECT_WEIGHT_COL)) { + values.put(EFFECT_WEIGHT_KEY, field[columnPos.get(EFFECT_WEIGHT_COL)]); + } + if (columnPos.containsKey(ALLELEFREQUENCY_EFFECT_COL)) { + values.put(ALLELE_FREQUENCY_EFFECT_KEY, field[columnPos.get(ALLELEFREQUENCY_EFFECT_COL)]); + } + if (columnPos.containsKey(ODDS_RATIO_COL)) { + values.put(ODDS_RATIO_KEY, field[columnPos.get(ODDS_RATIO_COL)]); + } + if (columnPos.containsKey(HAZARD_RATIO_COL)) { + values.put(HAZARD_RATIO_KEY, field[columnPos.get(HAZARD_RATIO_COL)]); + } + if (columnPos.containsKey(LOCUS_NAME_COL)) { + values.put(LOCUS_NAME_KEY, field[columnPos.get(LOCUS_NAME_COL)]); + } + if (columnPos.containsKey(IS_HAPLOTYPE_COL)) { + values.put(IS_HAPLOTYPE_KEY, field[columnPos.get(IS_HAPLOTYPE_COL)]); + } + if (columnPos.containsKey(IS_DIPLOTYPE_COL)) { + values.put(IS_DIPLOTYPE_KEY, field[columnPos.get(IS_DIPLOTYPE_COL)]); + } + if (columnPos.containsKey(IMPUTATION_METHOD_COL)) { + 
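
The scoring files do not all carry the same columns, which is why the header was turned into a name-to-index map earlier in parse() and every optional column is guarded with containsKey here. A compact sketch of that header-driven lookup, with an invented header and row:

import java.util.HashMap;
import java.util.Map;

public class HeaderColumnMapSketch {
    public static void main(String[] args) {
        String header = "chr_name\tchr_position\teffect_allele\teffect_weight";
        String row = "1\t5743196\tT\t-0.0024";

        // Build name -> index once from the header line
        Map<String, Integer> columnPos = new HashMap<>();
        String[] headerFields = header.split("\t");
        for (int i = 0; i < headerFields.length; i++) {
            columnPos.put(headerFields[i], i);
        }

        // Guard every optional column, so files with different column subsets share one code path
        String[] fields = row.split("\t", -1);
        if (columnPos.containsKey("effect_weight")) {
            System.out.println("effect_weight = " + fields[columnPos.get("effect_weight")]);
        }
        if (columnPos.containsKey("OR")) { // absent here, silently skipped
            System.out.println("OR = " + fields[columnPos.get("OR")]);
        }
    }
}
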
values.put(IMPUTATION_METHOD_KEY, field[columnPos.get(IMPUTATION_METHOD_COL)]); + } + if (columnPos.containsKey(VARIANT_DESCRIPTION_COL)) { + values.put(VARIANT_DESCRIPTION_KEY, field[columnPos.get(VARIANT_DESCRIPTION_COL)]); + } + if (columnPos.containsKey(INCLUSION_CRITERIA_COL)) { + values.put(INCLUSION_CRITERIA_KEY, field[columnPos.get(INCLUSION_CRITERIA_COL)]); + } + if (columnPos.containsKey(IS_INTERACTION_COL)) { + values.put(IS_INTERACTION_KEY, field[columnPos.get(IS_INTERACTION_COL)]); + } + if (columnPos.containsKey(IS_DOMINANT_COL)) { + values.put(IS_DOMINANT_KEY, field[columnPos.get(IS_DOMINANT_COL)]); + } + if (columnPos.containsKey(IS_RECESSIVE_COL)) { + values.put(IS_RECESSIVE_KEY, field[columnPos.get(IS_RECESSIVE_COL)]); + } + if (columnPos.containsKey(DOSAGE_0_WEIGHT_COL)) { + values.put(DOSAGE_0_WEIGHT_KEY, field[columnPos.get(DOSAGE_0_WEIGHT_COL)]); + } + if (columnPos.containsKey(DOSAGE_1_WEIGHT_COL)) { + values.put(DOSAGE_1_WEIGHT_KEY, field[columnPos.get(DOSAGE_1_WEIGHT_COL)]); + } + if (columnPos.containsKey(DOSAGE_2_WEIGHT_COL)) { + values.put(DOSAGE_2_WEIGHT_KEY, field[columnPos.get(DOSAGE_2_WEIGHT_COL)]); + } + + // Creating and/or updating variant polygenic score + VariantPolygenicScore varPgs; + RocksDB rdb = getRocksDB(chrom); + String key = chrom + ":" + position + ":" + otherAllele + ":" + effectAllele; + byte[] dbContent = rdb.get(key.getBytes()); + if (dbContent == null) { + varPgs = new VariantPolygenicScore(chrom, position, otherAllele, effectAllele, + Collections.singletonList(new PolygenicScore(pgsId, values))); + } else { + varPgs = varPgsReader.readValue(dbContent); + varPgs.getPolygenicScores().add(new PolygenicScore(pgsId, values)); + } + rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(varPgs)); + } + + private void serializeRDB() throws IOException { + for (Map.Entry entry : rdbConnectionPerChrom.entrySet()) { + RocksDB rdb = (RocksDB) entry.getValue()[0]; + Options dbOption = (Options) entry.getValue()[1]; + String dbLocation = (String) entry.getValue()[2]; + + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + RocksIterator rocksIterator = rdb.newIterator(); + + logger.info("Reading from RocksDB index ({}) and serializing to {}.json.gz", dbLocation, + serializer.getOutdir().resolve(serializer.getFileName())); + int counter = 0; + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + VariantPolygenicScore varPgs = varPgsReader.readValue(rocksIterator.value()); + serializer.serialize(varPgs); + counter++; + if (counter % 10000 == 0) { + logger.info("{} written", counter); + } + } + closeIndex(rdb, dbOption, dbLocation); + } + } + + private void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { + if (rdb != null) { + rdb.close(); + } + if (dbOption != null) { + dbOption.dispose(); + } + if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { + org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); + } + } + + private Object[] getDBConnection(String dbLocation, boolean forceCreate) { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. + RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. 
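+        // Note: each RocksDB index acts as an on-disk map keyed by
+        // "chrom:pos:otherAllele:effectAllele" (one index per chromosome, see getRocksDBConnection
+        // below), so scores from many large PGS scoring files can be merged per variant without
+        // holding them all in memory; an existing index is reopened read-only unless forceCreate is set.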
+ Options options = new Options().setCreateIfMissing(true); + +// options.setMaxBackgroundCompactions(4); +// options.setMaxBackgroundFlushes(1); +// options.setCompressionType(CompressionType.NO_COMPRESSION); +// options.setMaxOpenFiles(-1); +// options.setIncreaseParallelism(4); +// options.setCompactionStyle(CompactionStyle.LEVEL); +// options.setLevelCompactionDynamicLevelBytes(true); + + RocksDB db = null; + try { + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + // do something + } catch (RocksDBException e) { + // do some error handling + e.printStackTrace(); + System.exit(1); + } + + return new Object[]{db, options, dbLocation, indexingNeeded}; + } + + private Object[] getRocksDBConnection(String chrom) { + if (!rdbConnectionPerChrom.containsKey(chrom) || rdbConnectionPerChrom.get(chrom) == null) { + Object[] dbConnection = getDBConnection(pgsDir.resolve("rdb-" + chrom + ".idx").toString(), true); + rdbConnectionPerChrom.put(chrom, dbConnection); + } + return rdbConnectionPerChrom.get(chrom); + } + + private RocksDB getRocksDB(String chrom) { + return (RocksDB) getRocksDBConnection(chrom)[0]; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java index 0369a0e6aa..d8246241e4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ProteinBuilder.java @@ -21,6 +21,8 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.opencb.biodata.formats.protein.uniprot.UniProtParser; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.*; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; @@ -34,54 +36,71 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.math.BigInteger; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class ProteinBuilder extends CellBaseBuilder { - private Path uniprotFilesDir; - private Path interproFilePath; + private Path proteinPath; private String species; - private Map proteinMap; - protected Logger logger = LoggerFactory.getLogger(this.getClass()); - public ProteinBuilder(Path uniprotFilesDir, String species, CellBaseSerializer serializer) { - this(uniprotFilesDir, null, species, serializer); - } - - public ProteinBuilder(Path uniprotFilesDir, Path interproFilePath, String species, CellBaseSerializer serializer) { + public ProteinBuilder(Path proteinPath, String species, CellBaseSerializer serializer) { super(serializer); - this.uniprotFilesDir = uniprotFilesDir; - this.interproFilePath = interproFilePath; + this.proteinPath = proteinPath; this.species = species; } @Override - public void parse() throws IOException { + public void parse() throws CellBaseException, IOException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(PROTEIN_DATA)); + + // Sanity check + checkDirectory(proteinPath, getDataName(PROTEIN_DATA)); + + // Check UniProt 
file + DataSource dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(UNIPROT_DATA)).toFile()); + List uniProtFiles = checkFiles(dataSource, proteinPath, getDataCategory(UNIPROT_DATA) + "/" + getDataName(UNIPROT_DATA)); + if (uniProtFiles.size() != 1) { + throw new CellBaseException("Only one " + getDataName(UNIPROT_DATA) + " file is expected, but currently there are " + + uniProtFiles.size() + " files"); + } - if (uniprotFilesDir == null || !Files.exists(uniprotFilesDir)) { - throw new IOException("File '" + uniprotFilesDir + "' not valid"); + // Check InterPro file + dataSource = dataSourceReader.readValue(proteinPath.resolve(getDataVersionFilename(INTERPRO_DATA)).toFile()); + List interProFiles = checkFiles(dataSource, proteinPath, getDataCategory(INTERPRO_DATA) + "/" + getDataName(INTERPRO_DATA)); + if (interProFiles.size() != 1) { + throw new CellBaseException("Only one " + getDataName(INTERPRO_DATA) + " file is expected, but currently there are " + + interProFiles.size() + " files"); } - RocksDB rocksDb = getDBConnection(); + // Prepare UniProt data by splitting data in chunks + Path uniProtChunksPath = serializer.getOutdir().resolve(UNIPROT_CHUNKS_SUBDIRECTORY); + logger.info("Split {} file {} into chunks at {}", getDataName(UNIPROT_DATA), uniProtFiles.get(0).getName(), uniProtChunksPath); + Files.createDirectories(uniProtChunksPath); + splitUniprot(proteinPath.resolve(uniProtFiles.get(0).getName()), uniProtChunksPath); + + // Prepare RocksDB + RocksDB rocksDb = getDBConnection(uniProtChunksPath); ObjectMapper mapper = new ObjectMapper(); mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); ObjectWriter jsonObjectWriter = mapper.writerFor(Entry.class); - proteinMap = new HashMap<>(30000); -// UniProtParser up = new UniProtParser(); + Map proteinMap = new HashMap<>(30000); + + // Parsing files try { - File[] files = uniprotFilesDir.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); + File[] files = uniProtChunksPath.toFile().listFiles((dir, name) -> name.endsWith(".xml") || name.endsWith(".xml.gz")); for (File file : files) { + logger.info(PARSING_LOG_MESSAGE, file); Uniprot uniprot = (Uniprot) UniProtParser.loadXMLInfo(file.toString(), UniProtParser.UNIPROT_CONTEXT); for (Entry entry : uniprot.getEntry()) { @@ -89,16 +108,16 @@ public void parse() throws IOException { for (OrganismNameType organismNameType : entry.getOrganism().getName()) { entryOrganism = organismNameType.getValue(); if (entryOrganism.equals(species)) { -// proteinMap.put(entry.getAccession().get(0), entry); rocksDb.put(entry.getAccession().get(0).getBytes(), jsonObjectWriter.writeValueAsBytes(entry)); } } } + logger.info(PARSING_DONE_LOG_MESSAGE, file); } logger.debug("Number of proteins stored in map: '{}'", proteinMap.size()); - if (interproFilePath != null && Files.exists(interproFilePath)) { - BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interproFilePath); + logger.info(PARSING_LOG_MESSAGE, interProFiles.get(0)); + try (BufferedReader interproBuffereReader = FileUtils.newBufferedReader(interProFiles.get(0).toPath())) { Set hashSet = new HashSet<>(proteinMap.keySet()); Set visited = new HashSet<>(30000); @@ -114,7 +133,6 @@ public void parse() throws IOException { iprAdded = false; BigInteger start = BigInteger.valueOf(Integer.parseInt(fields[4])); BigInteger end = BigInteger.valueOf(Integer.parseInt(fields[5])); -// for (FeatureType featureType : proteinMap.get(fields[0]).getFeature()) { byte[] bytes = 
rocksDb.get(fields[0].getBytes()); Entry entry = mapper.readValue(bytes, Entry.class); for (FeatureType featureType : entry.getFeature()) { @@ -145,7 +163,6 @@ public void parse() throws IOException { locationType.setEnd(positionType2); featureType.setLocation(locationType); -// proteinMap.get(fields[0]).getFeature().add(featureType); bytes = rocksDb.get(fields[0].getBytes()); entry = mapper.readValue(bytes, Entry.class); entry.getFeature().add(featureType); @@ -158,11 +175,13 @@ public void parse() throws IOException { } if (++numInterProLinesProcessed % 10000000 == 0) { - logger.debug("{} InterPro lines processed. {} unique proteins processed", - numInterProLinesProcessed, numUniqueProteinsProcessed); + logger.debug("{} {} lines processed. {} unique proteins processed", numInterProLinesProcessed, + getDataName(INTERPRO_DATA), numUniqueProteinsProcessed); } } - interproBuffereReader.close(); + logger.info(PARSING_DONE_LOG_MESSAGE, interProFiles.get(0)); + } catch (IOException e) { + throw new CellBaseException("Error parsing " + getDataName(INTERPRO_DATA) + " file: " + interProFiles.get(0), e); } // Serialize and save results @@ -173,24 +192,70 @@ public void parse() throws IOException { } rocksDb.close(); - } catch (JAXBException | RocksDBException e) { - e.printStackTrace(); + } catch (JAXBException | RocksDBException | IOException e) { + throw new CellBaseException("Error parsing " + getDataName(PROTEIN_DATA) + " files", e); } + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PROTEIN_DATA)); } - private RocksDB getDBConnection() { - // a static method that loads the RocksDB C++ library. + private RocksDB getDBConnection(Path uniProtChunksPath) throws CellBaseException { + // A static method that loads the RocksDB C++ library RocksDB.loadLibrary(); - // the Options class contains a set of configurable DB options - // that determines the behavior of a database. 
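+        // As in the polygenic score builder above, RocksDB serves as a temporary on-disk
+        // key-value index; here it maps each UniProt accession to its parsed Entry object,
+        // so InterPro annotations can be attached without keeping every entry in memory.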
+        // The Options class contains a set of configurable DB options that determines the behavior of a database
         Options options = new Options().setCreateIfMissing(true);
         try {
-            return RocksDB.open(options, uniprotFilesDir.resolve("integration.idx").toString());
+            return RocksDB.open(options, uniProtChunksPath.resolve("integration.idx").toString());
         } catch (RocksDBException e) {
-            // do some error handling
-            e.printStackTrace();
-            System.exit(1);
+            throw new CellBaseException("Error preparing RocksDB", e);
+        }
+    }
+
+    private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException {
+        PrintWriter pw = null;
+        try (BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath)) {
+            StringBuilder header = new StringBuilder();
+            boolean beforeEntry = true;
+            boolean inEntry = false;
+            int count = 0;
+            int chunk = 0;
+            String line;
+            while ((line = br.readLine()) != null) {
+                if (line.trim().startsWith("<entry ")) {
+                    inEntry = true;
+                    beforeEntry = false;
+                    if (count % 10000 == 0) {
+                        // Start a new chunk file, repeating the original XML header
+                        pw = new PrintWriter(Files.newBufferedWriter(splitOutdirPath.resolve("chunk_" + chunk + ".xml")));
+                        pw.println(header.toString().trim());
+                    }
+                    count++;
+                }
+                if (beforeEntry) {
+                    header.append(line).append("\n");
+                }
+                if (inEntry) {
+                    pw.println(line);
+                }
+                if (line.trim().startsWith("</entry>")) {
+                    inEntry = false;
+                    if (count % 10000 == 0) {
+                        // Close the current chunk every 10000 entries
+                        if (pw != null) {
+                            pw.print("</uniprot>");
+                            pw.close();
+                        }
+                        chunk++;
+                    }
+                }
+            }
+            pw.print("</uniprot>");
+            pw.close();
+        } finally {
+            if (pw != null) {
+                pw.close();
+            }
         }
-        return null;
     }
 }
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java
index 8aba7c9dda..348d22a07d 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PubMedBuilder.java
@@ -16,63 +16,71 @@
 package org.opencb.cellbase.lib.builders;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.ObjectWriter;
 import org.opencb.biodata.formats.pubmed.PubMedParser;
 import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticle;
 import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticleSet;
+import org.opencb.cellbase.core.config.CellBaseConfiguration;
+import org.opencb.cellbase.core.exception.CellBaseException;
 import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
+import org.opencb.cellbase.lib.download.PubMedDownloadManager;
 import org.opencb.commons.utils.FileUtils;
-import org.slf4j.LoggerFactory;
 
-import java.io.File;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 
+import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA;
+import static org.opencb.cellbase.lib.EtlCommons.getDataName;
+
 public class PubMedBuilder extends CellBaseBuilder {
 
-    private Path pubmedDir;
-    private CellBaseFileSerializer fileSerializer;
+    private Path pubMedDownloadPath;
+    private CellBaseConfiguration configuration;
 
-    public PubMedBuilder(Path pubmedDir, CellBaseFileSerializer serializer) {
+    public PubMedBuilder(Path pubMedDownloadPath, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) {
         super(serializer);
-
-        this.fileSerializer = serializer;
-        this.pubmedDir = pubmedDir;
-
-        logger = LoggerFactory.getLogger(PubMedBuilder.class);
+        this.pubMedDownloadPath = pubMedDownloadPath;
+        this.configuration = configuration;
     }
 
     @Override
     public void parse() throws Exception {
-        // Check input folder
-        FileUtils.checkPath(pubmedDir);
+        logger.info(BUILDING_LOG_MESSAGE, getDataName(PUBMED_DATA));
 
-        logger.info("Parsing PubMed files...");
+        // Check input folder
+        FileUtils.checkPath(pubMedDownloadPath);
 
-        for (File file : pubmedDir.toFile().listFiles()) {
-            if (file.isFile() && (file.getName().endsWith("gz") || file.getName().endsWith("xml"))) {
-                String name = 
file.getName().split("\\.")[0]; + // Check PubMed files before parsing them + List pubMedFilenames = PubMedDownloadManager.getPubMedFilenames(configuration.getDownload().getPubmed()); + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + if (!Files.exists(pubMedPath)) { + throw new CellBaseException("Expected PubMed file " + pubMedFilename + ", but it was not found at " + pubMedDownloadPath); + } + } + for (String pubMedFilename : pubMedFilenames) { + Path pubMedPath = pubMedDownloadPath.resolve(pubMedFilename); + String basename = pubMedFilename.split("\\.")[0]; - ObjectWriter objectWriter = new ObjectMapper().writerFor(PubmedArticle.class); - PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(file.getAbsolutePath()); + PubmedArticleSet pubmedArticleSet = (PubmedArticleSet) PubMedParser.loadXMLInfo(pubMedPath.toAbsolutePath().toString()); - List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); - logger.info("Parsing PubMed file {} of {} articles ...", file.getName(), objects.size()); - int counter = 0; - for (Object object : objects) { - PubmedArticle pubmedArticle = (PubmedArticle) object; - fileSerializer.serialize(pubmedArticle, name); - if (++counter % 2000 == 0) { - logger.info("\t\t" + counter + " articles"); - } + List objects = pubmedArticleSet.getPubmedArticleOrPubmedBookArticle(); + logger.info(PARSING_LOG_MESSAGE, pubMedPath); + int counter = 0; + for (Object object : objects) { + PubmedArticle pubmedArticle = (PubmedArticle) object; + ((CellBaseFileSerializer) serializer).serialize(pubmedArticle, basename); + if (++counter % 2000 == 0) { + logger.info("{} articles", counter); } - fileSerializer.close(); - logger.info("\t\tDone: " + counter + " articles."); } + serializer.close(); + + String logMsg = pubMedPath + " (" + counter + " articles)"; + logger.info(PARSING_DONE_LOG_MESSAGE, logMsg); } - logger.info("Parsing PubMed files finished."); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(PUBMED_DATA)); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 48b0cd1d0d..8f03a801f2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -24,26 +24,43 @@ import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.rocksdb.RocksDBException; +import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RefSeqGeneBuilder extends CellBaseBuilder { + private Path downloadPath; + private Map transcriptDict; private Map exonDict; private Path gtfFile; private Path fastaFile; - private Path proteinFastaFile, cdnaFastaFile; - private Path maneFile, lrgFile, disgenetFile, hpoFile, geneDrugFile, miRTarBaseFile; - private Path cancerGeneCensus, cancerHotspot; - private Path tso500File, eglhHaemOncFile; + private Path proteinFastaFile; + private Path cdnaFastaFile; + private Path maneFile; + 
private Path lrgFile; + private Path disgenetFile; + private Path hpoFile; + private Path geneDrugFile; + private Path miRTarBaseFile; + private Path cancerGeneCensus; + private Path cancerHotspot; + private Path tso500File; + private Path eglhHaemOncFile; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); - private final String status = "KNOWN"; + private static final String KNOWN_STATUS = "KNOWN"; private static final String SOURCE = ParamConstants.QueryParams.REFSEQ.key(); private Gene gene = null; private Transcript transcript = null; @@ -52,85 +69,95 @@ public class RefSeqGeneBuilder extends CellBaseBuilder { // sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; - - public RefSeqGeneBuilder(Path refSeqDirectoryPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseSerializer serializer) { super(serializer); + this.downloadPath = downloadPath; this.speciesConfiguration = speciesConfiguration; - getGtfFileFromDirectoryPath(refSeqDirectoryPath); - getFastaFileFromDirectoryPath(refSeqDirectoryPath); - getProteinFastaFileFromDirectoryPath(refSeqDirectoryPath); - getCdnaFastaFileFromDirectoryPath(refSeqDirectoryPath); - setAnnotationFiles(refSeqDirectoryPath); - transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); } - private void setAnnotationFiles(Path refSeqDirectoryPath) { - Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); - maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); - lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); - geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); - disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); - hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); - cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); - cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); - tso500File = geneDirectoryPath.resolve("TSO500_transcripts.txt"); - eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); - miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); - } - - private void getGtfFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".gtf") || fileName.endsWith(".gtf.gz")) { - gtfFile = refSeqDirectoryPath.resolve(fileName); - break; - } + public void check() throws Exception { + if (checked) { + return; } - } - private void getFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith("genomic.fna") || fileName.endsWith("genomic.fna.gz")) { - fastaFile = refSeqDirectoryPath.resolve(fileName); - break; - } - } - } + String refSeqGeneLabel = getDataName(REFSEQ_DATA) + " " + getDataName(GENE_DATA); + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); - private void getProteinFastaFileFromDirectoryPath(Path refSeqDirectoryPath) { - for (String fileName : refSeqDirectoryPath.toFile().list()) { - if (fileName.endsWith(".faa") || fileName.endsWith(".faa.gz")) { - proteinFastaFile = refSeqDirectoryPath.resolve(fileName); - break; + // Sanity check + checkDirectory(downloadPath, refSeqGeneLabel); + if 
(!Files.exists(serializer.getOutdir())) {
+            try {
+                Files.createDirectories(serializer.getOutdir());
+            } catch (IOException e) {
+                throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e);
             }
         }
-    }
-
-    private void getCdnaFastaFileFromDirectoryPath(Path refSeqDirectoryPath) {
-        for (String fileName : refSeqDirectoryPath.toFile().list()) {
-            if (fileName.endsWith("cdna.fna") || fileName.endsWith("cdna.fna.gz")) {
-                cdnaFastaFile = refSeqDirectoryPath.resolve(fileName);
-                break;
-            }
-        }
+        // Check RefSeq files
+        List<File> files = checkFiles(refSeqGeneLabel, REFSEQ_DATA, downloadPath, 4);
+        gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath();
+        proteinFastaFile = files.stream().filter(f -> f.getName().contains("_protein")).findFirst().get().toPath();
+        cdnaFastaFile = files.stream().filter(f -> f.getName().contains("_rna")).findFirst().get().toPath();
+        fastaFile = files.stream().filter(f -> f.getName().contains("_genomic.fna")).findFirst().get().toPath();
+
+        // Check common files
+        maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath();
+        lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath();
+        cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath();
+        geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath();
+        // hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1);
+        disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath();
+        // cancerGeneCensus = ;
+        // tso500File = ;
+        // eglhHaemOncFile = ;
+
+        // Check regulation files
+        // miRTarBase
+        // The downloaded .xlsx file contains errors and has to be fixed manually
+        logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA));
+        Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA);
+        List<String> mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve(
+                getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString())
+                .collect(Collectors.toList());
+        if (mirTarBaseFiles.size() != 1) {
+            throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath
+                    + ", but currently there are " + mirTarBaseFiles.size() + " files");
+        }
+        // The hsa_MTI.xlsx is fixed and converted to hsa_MTI.csv manually
+        if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) {
+            throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at "
+                    + downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0));
+        }
+        miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION));
+        if (!Files.exists(miRTarBaseFile)) {
+            throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist");
+        }
+
+        logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel);
+        checked = true;
     }
 
     public void parse() throws Exception {
+        check();
+
         // Preparing the fasta file for fast accessing
         FastaIndex fastaIndex = null;
         if (fastaFile != null) {
             fastaIndex = new FastaIndex(fastaFile);
         }
 
-        // index protein sequences for later
+        // Index protein sequences for later
+        logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA));
        RefSeqGeneBuilderIndexer indexer = new 
RefSeqGeneBuilderIndexer(gtfFile.getParent()); indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, cancerGeneCensus, cancerHotspot, tso500File, eglhHaemOncFile); + logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); - logger.info("Parsing RefSeq gtf..."); + logger.info(PARSING_LOG_MESSAGE, gtfFile); GtfReader gtfReader = new GtfReader(gtfFile); Gtf gtf; @@ -164,22 +191,24 @@ public void parse() throws Exception { } } - // add xrefs to last transcript + // Add xrefs to last transcript addXrefs(transcript, geneDbxrefs, exonDbxrefs); - // last gene must be serialized + // Last gene must be serialized store(); - // cleaning + // Close gtfReader.close(); serializer.close(); if (fastaIndex != null) { fastaIndex.close(); } indexer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, gtfFile); } - // store right before parsing the previous gene, or the very last gene. + // Store right before parsing the previous gene, or the very last gene. private void store() { serializer.serialize(gene); reset(); @@ -235,7 +264,7 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde null, indexer.getMirnaTargets(geneName), indexer.getCancerGeneCensus(geneName), indexer.getCancerHotspot(geneName)); gene = new Gene(geneId, geneName, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), "1", geneBiotype, - status, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); + KNOWN_STATUS, SOURCE, geneDescription, new ArrayList<>(), null, geneAnnotation); geneDbxrefs = parseXrefs(gtf); } @@ -567,7 +596,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if ("mRNA".equals(biotype)) { biotype = "protein_coding"; } - transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, status, + transcript = new Transcript(transcriptId, name, chromosome, gtf.getStart(), gtf.getEnd(), gtf.getStrand(), biotype, KNOWN_STATUS, 0, 0, 0, 0, 0, indexer.getCdnaFasta(transcriptId), "", "", "", version, SOURCE, new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); @@ -644,6 +673,20 @@ private String getSequenceName(String fullSequenceName) { return fullSequenceName; } +// private void setAnnotationFiles(Path refSeqDirectoryPath) { +// Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); +// maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); +// lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); +// geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); +// disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); +// hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); +// cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); +// cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); +// tso500File = geneDirectoryPath.resolve("TSO500_transcripts.txt"); +// eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); +// miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); +// } + static { REFSEQ_CHROMOSOMES.put("NC_000001", "1"); REFSEQ_CHROMOSOMES.put("NC_000002", "2"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 45520161f5..9aae170ce2 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -16,25 +16,16 @@ package org.opencb.cellbase.lib.builders; -import org.apache.commons.lang.StringUtils; -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.opencb.biodata.formats.io.FileFormatException; -import org.opencb.biodata.models.core.MirnaTarget; -import org.opencb.biodata.models.core.TargetGene; -import org.opencb.biodata.models.variant.avro.GeneDrugInteraction; -import org.opencb.biodata.models.variant.avro.GeneTraitAssociation; -import org.opencb.commons.utils.FileUtils; +import org.opencb.cellbase.core.exception.CellBaseException; import org.rocksdb.RocksDBException; -import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; -public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer{ +import static org.opencb.cellbase.lib.EtlCommons.REFSEQ_DATA; + +public class RefSeqGeneBuilderIndexer extends GeneBuilderIndexer { public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { super(refSeqDirectoryPath); @@ -42,249 +33,17 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot, Path tso500File, - Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException { - indexManeMapping(maneFile, "refseq"); - indexLrgMapping(lrgFile, "refseq"); + Path eglhHaemOncFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { + indexManeMapping(maneFile, REFSEQ_DATA); + indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); indexDiseases(hpoFilePath, disgenetFile); indexMiRTarBase(miRTarBaseFile); - indexCancerGeneCensus(cancerGeneGensus); +// indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); - indexTSO500(tso500File); - indexEGLHHaemOnc(eglhHaemOncFile); - } - - private void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { - if (geneDrugFile != null && Files.exists(geneDrugFile) && Files.size(geneDrugFile) > 0) { - logger.info("Loading gene-drug interaction data from '{}'", geneDrugFile); - BufferedReader br = FileUtils.newBufferedReader(geneDrugFile); - - // Skip header - br.readLine(); - - int lineCounter = 1; - String line; - String currentGene = ""; - List drugs = new ArrayList<>(); - while ((line = br.readLine()) != null) { - String[] parts = line.split("\t"); - String geneName = parts[0]; - if (currentGene.equals("")) { - currentGene = geneName; - } else if (!currentGene.equals(geneName)) { - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - drugs = new ArrayList<>(); - currentGene = geneName; - } - - String source = null; - if (parts.length >= 4) { - source = parts[3]; - } - - String interactionType = null; - if (parts.length >= 5) { - interactionType = parts[4]; - } - - String drugName = null; - if (parts.length >= 8) { - // if drug name column is empty, use drug claim name instead - drugName = StringUtils.isEmpty(parts[7]) ? 
parts[6] : parts[7]; - } - if (StringUtils.isEmpty(drugName)) { - // no drug name - continue; - } - - String chemblId = null; - if (parts.length >= 9) { - chemblId = parts[8]; - } - - List publications = new ArrayList<>(); - if (parts.length >= 10 && parts[9] != null) { - publications = Arrays.asList(parts[9].split(",")); - } - - GeneDrugInteraction drug = new GeneDrugInteraction( - geneName, drugName, source, null, null, interactionType, chemblId, publications); - drugs.add(drug); - lineCounter++; - } - br.close(); - // update last gene - rocksDbManager.update(rocksdb, currentGene + DRUGS_SUFFIX, drugs); - } else { - logger.warn("Gene drug file " + geneDrugFile + " not found"); - logger.warn("Ignoring " + geneDrugFile); - } - } - - public List getDrugs(String id) throws RocksDBException, IOException { - String key = id + DRUGS_SUFFIX; - return rocksDbManager.getDrugs(rocksdb, key); - } - - private void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { - Map> geneDiseaseAssociationMap = new HashMap<>(50000); - - String line; - if (hpoFilePath != null && hpoFilePath.toFile().exists() && Files.size(hpoFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath); - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String omimId = fields[6]; - String geneSymbol = fields[3]; - String hpoId = fields[0]; - String diseaseName = fields[1]; - GeneTraitAssociation disease = - new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), "hpo"); - addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); - } - bufferedReader.close(); - } - - if (disgenetFilePath != null && disgenetFilePath.toFile().exists() && Files.size(disgenetFilePath) > 0) { - BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath); - // skip first header line - bufferedReader.readLine(); - while ((line = bufferedReader.readLine()) != null) { - String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), "disgenet"); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); - } - bufferedReader.close(); - } - - for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + DISEASE_SUFFIX, entry.getValue()); - } - } - - public List getDiseases(String id) throws RocksDBException, IOException { - String key = id + DISEASE_SUFFIX; - return rocksDbManager.getDiseases(rocksdb, key); - } - - private void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { - if (miRTarBaseFile != null && Files.exists(miRTarBaseFile) && Files.size(miRTarBaseFile) > 0) { - logger.info("Loading mirna targets from '{}'", miRTarBaseFile); - FileInputStream file = new FileInputStream(miRTarBaseFile.toFile()); - Workbook workbook = new XSSFWorkbook(file); - Sheet sheet = workbook.getSheetAt(0); - Iterator iterator = sheet.iterator(); - String currentMiRTarBaseId = null; - String currentMiRNA = null; - String currentGene = null; - List targetGenes = new 
ArrayList(); - Map> geneToMirna = new HashMap(); - while (iterator.hasNext()) { - - Row currentRow = iterator.next(); - Iterator cellIterator = currentRow.iterator(); - - Cell cell = cellIterator.next(); - String miRTarBaseId = cell.getStringCellValue(); - - // skip header - if (miRTarBaseId.startsWith("miRTarBase")) { - continue; - } - - if (currentMiRTarBaseId == null) { - currentMiRTarBaseId = miRTarBaseId; - } - - cell = cellIterator.next(); - String miRNA = cell.getStringCellValue(); - if (currentMiRNA == null) { - currentMiRNA = miRNA; - } - - // species - cellIterator.next(); - - cell = cellIterator.next(); - String geneName = cell.getStringCellValue(); - if (currentGene == null) { - currentGene = geneName; - } - - // entrez - cellIterator.next(); - // species - cellIterator.next(); - - if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) { - // new entry, store current one - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - targetGenes = new ArrayList(); - currentGene = geneName; - currentMiRTarBaseId = miRTarBaseId; - currentMiRNA = miRNA; - } - - // experiment - cell = cellIterator.next(); - String experiment = cell.getStringCellValue(); - - // support type - cell = cellIterator.next(); - String supportType = cell.getStringCellValue(); - - // pubmeds - cell = cellIterator.next(); - String pubmed = null; - // seems to vary, so check both - if (cell.getCellType().equals(CellType.NUMERIC)) { - pubmed = String.valueOf(cell.getNumericCellValue()); - } else { - pubmed = cell.getStringCellValue(); - } - - targetGenes.add(new TargetGene(experiment, supportType, pubmed)); - } - - // parse last entry - MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, - targetGenes); - addValueToMapElement(geneToMirna, currentGene, miRnaTarget); - - for (Map.Entry> entry : geneToMirna.entrySet()) { - rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue()); - } - } else { - logger.error("mirtarbase file not found"); - } +// indexTSO500(tso500File); +// indexEGLHHaemOnc(eglhHaemOncFile); } - - public List getMirnaTargets(String geneName) throws RocksDBException, IOException { - String key = geneName + MIRTARBASE_SUFFIX; - return rocksDbManager.getMirnaTargets(rocksdb, key); - } - - private static void addValueToMapElement(Map> map, String key, T value) { - if (map.containsKey(key)) { - map.get(key).add(value); - } else { - List valueList = new ArrayList<>(); - valueList.add(value); - map.put(key, valueList); - } - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java index 03fc3a1cd6..83eccb9885 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryFeatureBuilder.java @@ -16,60 +16,152 @@ package org.opencb.cellbase.lib.builders; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.feature.gff.Gff2; import org.opencb.biodata.formats.feature.gff.io.Gff2Reader; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.core.RegulatoryFeature; +import org.opencb.biodata.models.core.RegulatoryPfm; +import 
org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; +import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; +import java.io.File; import java.io.IOException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.opencb.cellbase.lib.EtlCommons.*; public class RegulatoryFeatureBuilder extends CellBaseBuilder { - private final Path gffFile; - protected Set regulatoryFeatureSet; + private Path regulationPath; + + private Set regulatoryFeatureSet; - public RegulatoryFeatureBuilder(Path regulatoryDirectoryPath, CellBaseSerializer serializer) { + public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) { super(serializer); - gffFile = regulatoryDirectoryPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); + this.regulationPath = regulationPath; } @Override public void parse() throws Exception { - logger.info("Parsing regulatory features..."); - if (Files.exists(gffFile)) { - parseGffFile(gffFile); - } else { - logger.warn("No regulatory features GFF file found {}", EtlCommons.REGULATORY_FEATURES_FILE); - logger.warn("Skipping regulatory features GFF file parsing. Regulatory feature data models will not be built."); + logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA)); + + // Sanity check + checkDirectory(regulationPath, getDataName(REGULATION_DATA)); + + // Check build regulatory files + DataSource dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile()); + List regulatoryFiles = checkFiles(dataSource, regulationPath, getDataCategory(REGULATORY_BUILD_DATA) + "/" + + getDataName(REGULATORY_BUILD_DATA)); + if (regulatoryFiles.size() != 1) { + throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are " + + regulatoryFiles.size() + " files"); } + + // Check motif features files + dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile()); + List motifFeaturesFiles = checkFiles(dataSource, regulationPath, getDataCategory(MOTIF_FEATURES_DATA) + "/" + + getDataName(MOTIF_FEATURES_DATA)); + if (motifFeaturesFiles.size() != 2) { + throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are " + + motifFeaturesFiles.size() + " files"); + } + + // Downloading and building pfm matrices + File motifFile = motifFeaturesFiles.get(0).getName().endsWith("tbi") ? 
motifFeaturesFiles.get(1) : motifFeaturesFiles.get(0); + loadPfmMatrices(motifFile.toPath(), serializer.getOutdir()); + + // Parse regulatory build features + parseGffFile(regulatoryFiles.get(0).toPath()); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA)); } protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException { + logger.info(PARSING_LOG_MESSAGE, regulatoryFeatureFile); + + // Create and populate regulatory feature set regulatoryFeatureSet = new HashSet<>(); - if (regulatoryFeatureFile != null && Files.exists(regulatoryFeatureFile) && !Files.isDirectory(regulatoryFeatureFile) - && Files.size(regulatoryFeatureFile) > 0) { - Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile); + try (Gff2Reader regulatoryFeatureReader = new Gff2Reader(regulatoryFeatureFile)) { Gff2 feature; while ((feature = regulatoryFeatureReader.read()) != null) { regulatoryFeatureSet.add(feature); } - regulatoryFeatureReader.close(); } - int i = 0; // Serialize and save results for (Gff2 feature : regulatoryFeatureSet) { - // ID=TF_binding_site:ENSR00000243312; + // In order to get the ID we split the attribute format: ID=TF_binding_site:ENSR00000243312; .... String id = feature.getAttribute().split(";")[0].split(":")[1]; RegulatoryFeature regulatoryFeature = new RegulatoryFeature(id, feature.getSequenceName(), feature.getFeature(), feature.getStart(), feature.getEnd()); serializer.serialize(regulatoryFeature); } serializer.close(); + + logger.info(PARSING_DONE_LOG_MESSAGE, regulatoryFeatureFile); + } + + private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException, + InterruptedException { + Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_BASENAME + ".json.gz"); + logger.info("Downloading and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile); + if (Files.exists(regulatoryPfmPath)) { + logger.info("{} is already built", regulatoryPfmPath); + return; + } + + Set motifIds = new HashSet<>(); + try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { + Gff2 tfbsMotifFeature; + Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); + while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { + String pfmId = getMatrixId(filePattern, tfbsMotifFeature); + if (StringUtils.isNotEmpty(pfmId)) { + motifIds.add(pfmId); + } + } + } + + ObjectMapper mapper = new ObjectMapper(); + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true); + if (logger.isInfoEnabled()) { + logger.info("Looking up {} PFMs", motifIds.size()); + } + for (String pfmId : motifIds) { + String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId + + "?unit=frequencies;content-type=application/json"; + URL url = new URL(urlString); + RegulatoryPfm regulatoryPfm = mapper.readValue(url, RegulatoryPfm.class); + serializer.serialize(regulatoryPfm); + // https://github.com/Ensembl/ensembl-rest/wiki/Rate-Limits + TimeUnit.MILLISECONDS.sleep(250); + } + serializer.close(); + + logger.info("Downloading and building PFM matrices at {} done.", regulatoryPfmPath); + } + + private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { + Matcher matcher = pattern.matcher(tfbsMotifFeature.getAttribute()); + if (matcher.find()) { + return matcher.group(0); + } + return null; + } + + public Set getRegulatoryFeatureSet() { + return regulatoryFeatureSet; } } 
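The fixed 250 ms pause in loadPfmMatrices() keeps the PFM downloads under the Ensembl REST rate
limit in the normal case. A hypothetical hardening (sketched below with invented names, not part
of this patch) would also honour the Retry-After header that rest.ensembl.org sends with HTTP 429
responses, as described in the rate-limits page linked in the code above:

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;

/** Minimal sketch: fetch a REST URL, retrying after the server-suggested delay on HTTP 429. */
public final class PoliteFetch {

    public static String get(String urlString) throws IOException, InterruptedException {
        for (int attempt = 0; attempt < 3; attempt++) {
            HttpURLConnection conn = (HttpURLConnection) new URL(urlString).openConnection();
            conn.setRequestProperty("Accept", "application/json");
            if (conn.getResponseCode() == 429) {
                // Retry-After is expressed in seconds; fall back to 1 s if the header is missing
                String retryAfter = conn.getHeaderField("Retry-After");
                TimeUnit.SECONDS.sleep(retryAfter != null ? Long.parseLong(retryAfter.trim()) : 1L);
                continue;
            }
            try (InputStream is = conn.getInputStream()) {
                return new String(is.readAllBytes(), StandardCharsets.UTF_8);
            }
        }
        throw new IOException("Rate-limit retries exhausted for " + urlString);
    }
}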
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java deleted file mode 100644 index 3727ac4a69..0000000000 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RegulatoryRegionBuilder.java +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Copyright 2015-2020 OpenCB - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.opencb.cellbase.lib.builders; - -import org.opencb.biodata.models.core.RegulatoryFeature; -import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; -import org.opencb.commons.utils.FileUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.sql.*; -import java.util.*; - -/** - * User: fsalavert. - * Date: 4/10/13 - * Time: 10:14 AM - */ -@Deprecated -public class RegulatoryRegionBuilder extends CellBaseBuilder { - - private static final int CHUNK_SIZE = 2000; - private static final String REGULATORY_FEATURES = "regulatory_features"; - @Deprecated - private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features"; - private static final String MOTIF_FEATURES = "motif_features"; - private static final String FEATURE_TYPE = "feature_type"; - private static final String ID = "id"; - private static final String BINDING_MATRIX = "binding_matrix"; - private static final String MOTIF_FEATURE_TYPE = "motif_feature_type"; - private Path regulatoryRegionPath; - - public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) { - super(serializer); - - this.regulatoryRegionPath = regulatoryRegionFilesDir; - - } - - public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath) - throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group"); - List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT"); - - // Path regulatoryRegionPath = regulationDir.toPath(); - - Path filePath; - - filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes); - - filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE); - createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes); - - // TODO: REMOVE - // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED - filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes); - - - filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz"); - createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes); 
- - - filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz"); - createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes); - // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED - - - -// GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame"); -// GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT"); - filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz"); - if (Files.exists(filePath)) { - createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes); - } - - } - - @Override - public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException { - if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) { - throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read"); - } - - // Create the SQLite databases - createSQLiteRegulatoryFiles(regulatoryRegionPath); - - String chunkIdSuffix = CHUNK_SIZE / 1000 + "k"; - - Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db"); - Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db"); - Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db"); - Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db"); - Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db"); - Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db"); - - List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath, - deprecatedMotifFilePath, deprecatedRegulatoryFilePath); - List tableNames = Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features", - DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell"); - - if (Files.exists(mirnaFilePath)) { - filePaths.add(mirnaFilePath); - tableNames.add("mirna_uniq"); - } - - // Fetching and joining all chromosomes found in the different databases - Set setChr = new HashSet<>(); - setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES)); - setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features")); - setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES)); - setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell")); - if (Files.exists(mirnaFilePath)) { - setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq")); - } - - List chromosomes = new ArrayList<>(setChr); - List regulatoryFeatures; - HashSet chunksHash; - for (String chromosome : chromosomes) { - for (int i = 0; i < tableNames.size(); i++) { - chunksHash = new HashSet<>(); - regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome); - for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) { - int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE); - int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE); - - List chunkIds = new ArrayList<>(); - String chunkId; - for (int j = firstChunkId; j <= lastChunkId; j++) { - chunkId = chromosome + "_" + j + "_" + chunkIdSuffix; - chunkIds.add(chunkId); - //count chunks - if (!chunksHash.contains(j)) { - 
chunksHash.add(j); - } - } -// regulatoryFeature.setChunkIds(chunkIds); - - // remove 'chr' prefix -// if (genericFeature.getChromosome() != null) { -// genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", "")); -// } - serializer.serialize(regulatoryFeature); - } - } - } - } - - - public void createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes) - throws ClassNotFoundException, IOException, SQLException { - int limitRows = 100000; - int batchCount = 0; - - if (!Files.exists(filePath) || Files.size(filePath) == 0) { - return; - } - - Path dbPath = Paths.get(filePath.toString() + ".db"); - if (Files.exists(dbPath) && Files.size(dbPath) > 0) { - return; - } - - BufferedReader br = FileUtils.newBufferedReader(filePath); - - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion - - //Create table query - Statement createTables = conn.createStatement(); - - StringBuilder sbQuery = new StringBuilder(); - sbQuery.append("CREATE TABLE if not exists " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { //columnNames and columnTypes must have the same size - sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ","); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - - System.out.println(sbQuery.toString()); - createTables.executeUpdate(sbQuery.toString()); - - //Prepare insert query - sbQuery = new StringBuilder(); - sbQuery.append("INSERT INTO " + tableName + "("); - for (int i = 0; i < columnNames.size(); i++) { - sbQuery.append("'" + columnNames.get(i) + "',"); - } - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(") values ("); - sbQuery.append(repeat("?,", columnNames.size())); - sbQuery.deleteCharAt(sbQuery.length() - 1); - sbQuery.append(")"); - System.out.println(sbQuery.toString()); - - PreparedStatement ps = conn.prepareStatement(sbQuery.toString()); - - //Read file - String line = null; - while ((line = br.readLine()) != null) { - - insertByType(ps, getFields(line, tableName), columnTypes); - ps.addBatch(); - batchCount++; - - //commit batch - if (batchCount % limitRows == 0 && batchCount != 0) { - ps.executeBatch(); - conn.commit(); - } - - } - br.close(); - - //Execute last Batch - ps.executeBatch(); - conn.commit(); - - //Create index - System.out.println("creating indices..."); - createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")"); - System.out.println("indices created."); - - conn.commit(); - conn.close(); - } - - public List getChromosomesList(Path dbPath, String tableName) throws IOException { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - List chromosomes = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName); -// ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'"); - - while (rs.next()) { - chromosomes.add(rs.getString(1)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return 
chromosomes; - } - - public List<RegulatoryFeature> queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) { - - try { - FileUtils.checkFile(dbPath); - } catch (IOException e) { - logger.warn(e.getMessage()); - return Collections.emptyList(); - } - - Connection conn; - List<RegulatoryFeature> regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'"); -// ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'"); - while (rs.next()) { - regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - public static List<RegulatoryFeature> queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) { - Connection conn = null; - List<RegulatoryFeature> regulatoryFeatures = new ArrayList<>(); - try { - Class.forName("org.sqlite.JDBC"); - conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString()); - - Statement query = conn.createStatement(); - ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start); - - while (rs.next()) { - regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName)); - } - conn.close(); - - } catch (ClassNotFoundException | SQLException e) { - e.printStackTrace(); - } - return regulatoryFeatures; - } - - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException { - RegulatoryFeature regulatoryFeature = null; - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - regulatoryFeature = getRegulatoryFeature(rs); - break; - case MOTIF_FEATURES: - regulatoryFeature = getMotifFeature(rs); - break; - case "annotated_features": - regulatoryFeature = getAnnotatedFeature(rs); - break; - case "regulatory_features_multicell": - regulatoryFeature = getDeprecatedRegulatoryFeature(rs); - break; - case DEPRECATED_MOTIF_FEATURES: - regulatoryFeature = getDeprecatedMotifFeature(rs); - break; - case "mirna_uniq": - regulatoryFeature = getMirnaFeature(rs); - break; - default: - break; - } - return regulatoryFeature; - } - - private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - // Seems weird that the motif_feature_type property is used to fill the Name field. 
However, this is how - // it was being done for the previous ENSEMBL files - regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE)); - - regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setId(groupFields.get(ID)); - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_")); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - - return regulatoryFeature; - } - - private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setName(groupFields.get("name")); - regulatoryFeature.setAlias(groupFields.get("alias")); - regulatoryFeature.setFeatureClass(groupFields.get("class")); - regulatoryFeature.getCellTypes().add(groupFields.get("cell_type")); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - regulatoryFeature.setFrame(rs.getString(9)); - - return regulatoryFeature; - } - - @Deprecated - private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3) + "_motif"); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - String[] split = groupFields.get("name").split(":"); - regulatoryFeature.setName(split[0]); - regulatoryFeature.setMatrix(split[1]); - - return regulatoryFeature; - } - - private static RegulatoryFeature 
getMirnaFeature(ResultSet rs) throws SQLException { - // GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3 - RegulatoryFeature regulatoryFeature = new RegulatoryFeature(); - Map<String, String> groupFields = getGroupFields(rs.getString(9)); - - regulatoryFeature.setChromosome(rs.getString(1)); - regulatoryFeature.setSource(rs.getString(2)); - regulatoryFeature.setFeatureType(rs.getString(3)); - regulatoryFeature.setStart(rs.getInt(4)); - regulatoryFeature.setEnd(rs.getInt(5)); - regulatoryFeature.setScore(rs.getString(6)); - regulatoryFeature.setStrand(rs.getString(7)); - regulatoryFeature.setFrame(rs.getString(8)); - - regulatoryFeature.setFeatureClass("microRNA"); - regulatoryFeature.setName(groupFields.get("name")); - - return regulatoryFeature; - } - - private static Map<String, String> getGroupFields(String group) { - //process group column - Map<String, String> groupFields = new HashMap<>(); - String[] attributeFields = group.split(";"); - String[] attributeKeyValue; - for (String attributeField : attributeFields) { - attributeKeyValue = attributeField.trim().split("="); - groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]); - } - return groupFields; - } - - - public static List<String> getFields(String line, String tableName) { - List<String> fields = new ArrayList<>(); - switch (tableName.toLowerCase()) { - case REGULATORY_FEATURES: - fields = getRegulatoryFeaturesFields(line); - break; - case MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "annotated_features": - fields = getAnnotatedFeaturesFields(line); - break; - case "regulatory_features_multicell": - fields = getRegulatoryFeaturesFields(line); - break; - case DEPRECATED_MOTIF_FEATURES: - fields = getMotifFeaturesFields(line); - break; - case "mirna_uniq": - fields = getMirnaFeaturesFields(line); - break; - default: - break; - } - return fields; - } - - @Deprecated - public static List<String> getAnnotatedFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List<String> getRegulatoryFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List<String> getMotifFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static List<String> getMirnaFeaturesFields(String line) { - String[] fields = line.split("\t"); - fields[0] = fields[0].replace("chr", ""); - return Arrays.asList(fields); - } - - public static void insertByType(PreparedStatement ps, List<String> fields, List<String> types) throws SQLException { - //Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html - String raw; - String type; - if (types.size() == fields.size()) { - for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size - int sqliteIndex = i + 1; - raw = fields.get(i); - type = types.get(i); - - switch (type) { - case "INTEGER": - case "INT": - ps.setInt(sqliteIndex, Integer.parseInt(raw)); - break; - case "REAL": - ps.setFloat(sqliteIndex, Float.parseFloat(raw)); - break; - case "TEXT": - ps.setString(sqliteIndex, raw); - break; - default: - ps.setString(sqliteIndex, raw); - break; - } - } - } - - } - - public String repeat(String s, int n) { - if (s == null) { - return null; - } - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < n; i++) { - sb.append(s); - } - return sb.toString(); - } - - private int getChunkId(int position, int 
chunksize) { - if (chunksize <= 0) { - return position / CHUNK_SIZE; - } else { - return position / chunksize; - } - } - - private int getChunkStart(int id, int chunksize) { - if (chunksize <= 0) { - return (id == 0) ? 1 : id * CHUNK_SIZE; - } else { - return (id == 0) ? 1 : id * chunksize; - } - } - - private int getChunkEnd(int id, int chunksize) { - if (chunksize <= 0) { - return (id * CHUNK_SIZE) + CHUNK_SIZE - 1; - } else { - return (id * chunksize) + chunksize - 1; - } - } -} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java index d37765e0b6..5ffabf747b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RepeatsBuilder.java @@ -18,8 +18,10 @@ import org.opencb.biodata.models.core.Region; import org.opencb.biodata.models.variant.avro.Repeat; -import org.opencb.cellbase.lib.EtlCommons; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.commons.ProgressLogger; import org.opencb.commons.utils.FileUtils; @@ -27,56 +29,78 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; + +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by fjlopez on 05/05/17. */ public class RepeatsBuilder extends CellBaseBuilder { - private static final String TRF = "trf"; - private static final String GSD = "genomicSuperDup"; - private static final String WM = "windowMasker"; + + private CellBaseConfiguration configuration; + private final Path filesDir; - public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer) { + public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) { super(serializer); this.filesDir = filesDir; + this.configuration = configuration; } @Override public void parse() throws Exception { + logger.info(BUILDING_LOG_MESSAGE, getDataName(REPEATS_DATA)); + + // Sanity check + checkDirectory(filesDir, getDataName(REPEATS_DATA)); - logger.info("Parsing repeats..."); - if (Files.exists(filesDir.resolve(EtlCommons.TRF_FILE))) { - parseTrfFile(filesDir.resolve(EtlCommons.TRF_FILE)); - } else { - logger.warn("No TRF file found {}", EtlCommons.TRF_FILE); - logger.warn("Skipping TRF file parsing. TRF data models will not be built."); + // Check Simple Repeats (TRF) filename + String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(trfFilename))) { + throw new CellBaseException(getMessageMissingFile(TRF_DATA, trfFilename, filesDir)); } - if (Files.exists(filesDir.resolve(EtlCommons.GSD_FILE))) { - parseGsdFile(filesDir.resolve(EtlCommons.GSD_FILE)); - } else { - logger.warn("No Genomic Super Duplications file found {}", EtlCommons.GSD_FILE); - logger.warn("Skipping Genomic Super Duplications file parsing. 
" - + "Genomic Super Duplications data models will not be built."); + // Check Genomic Super Duplications (GSD) file + String gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID)) + .getFileName().toString(); + if (!Files.exists(filesDir.resolve(gsdFilename))) { + throw new CellBaseException(getMessageMissingFile(GSD_DATA, gsdFilename, filesDir)); } - if (Files.exists(filesDir.resolve(EtlCommons.WM_FILE))) { - parseWmFile(filesDir.resolve(EtlCommons.WM_FILE)); - } else { - logger.warn("No WindowMasker file found {}", EtlCommons.WM_FILE); - logger.warn("Skipping WindowMasker file parsing. WindowMasker data models will not be built."); + // Check Window Masker (WM) file + String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName() + .toString(); + if (!Files.exists(filesDir.resolve(wmFilename))) { + throw new CellBaseException(getMessageMissingFile(WM_DATA, wmFilename, filesDir)); } - logger.info("Done."); + + // Parse TRF file + logger.info(BUILDING_LOG_MESSAGE, getDataName(TRF_DATA)); + parseTrfFile(filesDir.resolve(trfFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(TRF_DATA)); + + // Parse GSD file + logger.info(BUILDING_LOG_MESSAGE, getDataName(GSD_DATA)); + parseGsdFile(filesDir.resolve(gsdFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GSD_DATA)); + + // Parse WM file + logger.info(BUILDING_LOG_MESSAGE, getDataName(WM_DATA)); + parseWmFile(filesDir.resolve(wmFilename)); + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(WM_DATA)); + + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REPEATS_DATA)); } - private void parseTrfFile(Path filePath) throws IOException { + private void parseTrfFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed TRF lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(TRF_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseTrfLine(line)); line = bufferedReader.readLine(); @@ -90,15 +114,15 @@ private Repeat parseTrfLine(String line) { return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]), - Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF); + Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_DATA); } - private void parseGsdFile(Path filePath) throws IOException { + private void parseGsdFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed GSD lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(GSD_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseGSDLine(line)); line = bufferedReader.readLine(); @@ -112,16 +136,16 @@ private Repeat parseGSDLine(String line) { return 
new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null, - null, GSD); + null, GSD_DATA); } - private void parseWmFile(Path filePath) throws IOException { + private void parseWmFile(Path filePath) throws IOException, CellBaseException { try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) { String line = bufferedReader.readLine(); - ProgressLogger progressLogger = new ProgressLogger("Parsed WM lines:", - () -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000); + ProgressLogger progressLogger = new ProgressLogger(getMessageParsedLines(WM_DATA), () -> EtlCommons.countFileLines(filePath), + 200).setBatchSize(10000); while (line != null) { serializer.serialize(parseWmLine(line)); line = bufferedReader.readLine(); @@ -134,6 +158,16 @@ private Repeat parseWmLine(String line) { String[] parts = line.split("\t"); return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]), - Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM); + Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_DATA); + } + + private String getMessageMissingFile(String data, String filename, Path folder) throws CellBaseException { + return getDataName(data) + " file " + filename + " does not exist at " + folder; } + + private String getMessageParsedLines(String data) throws CellBaseException { + return "Parsed " + getDataName(data) + " lines:"; + } + } + diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java index 2ccf0cb2a1..06f38f28f0 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RevelScoreBuilder.java @@ -19,8 +19,8 @@ import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.Path; @@ -30,75 +30,95 @@ import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class RevelScoreBuilder extends CellBaseBuilder { - private Path revelFilePath = null; - private static final String SOURCE = "revel"; + private Path revelDownloadPath = null; - public RevelScoreBuilder(Path revelDirectoryPath, CellBaseSerializer serializer) { + public RevelScoreBuilder(Path revelDownloadPath, CellBaseSerializer serializer) { super(serializer); - this.revelFilePath = revelDirectoryPath.resolve("revel-v1.3_all_chromosomes.zip"); - logger = LoggerFactory.getLogger(ConservationBuilder.class); - + this.revelDownloadPath = revelDownloadPath; } @Override - public void parse() throws IOException { - logger.error("processing Revel file at " + revelFilePath.toAbsolutePath()); - ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFilePath))); + public void parse() throws IOException, CellBaseException { + String dataName = getDataName(REVEL_DATA); + String dataCategory = getDataCategory(REVEL_DATA); + + logger.info(CATEGORY_BUILDING_LOG_MESSAGE, dataCategory, dataName); + + // Sanity check + 
checkDirectory(revelDownloadPath, dataName); + + // Check REVEL files + List<File> revelFiles = checkFiles(dataSourceReader.readValue(revelDownloadPath.resolve(getDataVersionFilename(REVEL_DATA)) + .toFile()), revelDownloadPath, dataName); + if (revelFiles.size() != 1) { + throw new CellBaseException("One " + dataName + " file is expected, but currently there are " + revelFiles.size() + " files"); + } + + logger.info(PARSING_LOG_MESSAGE, revelFiles.get(0)); + + ZipInputStream zis = new ZipInputStream(new FileInputStream(String.valueOf(revelFiles.get(0)))); ZipEntry zipEntry = zis.getNextEntry(); - ZipFile zipFile = new ZipFile(String.valueOf(revelFilePath)); + ZipFile zipFile = new ZipFile(revelFiles.get(0).toString()); InputStream inputStream = zipFile.getInputStream(zipEntry); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); - - // skip header - String line = bufferedReader.readLine(); - String[] fields = null; - String lastEntry = null; - String currentEntry = null; - List<TranscriptMissenseVariantFunctionalScore> scores = new ArrayList<>(); - MissenseVariantFunctionalScore predictions = null; - while ((line = bufferedReader.readLine()) != null) { - fields = line.split(","); - String chromosome = fields[0]; - if (".".equalsIgnoreCase(fields[2])) { - // 1,12855835,.,C,A,A,D,0.175 - // skip if invalid position - continue; - } - int position = Integer.parseInt(fields[2]); - String reference = fields[3]; - String alternate = fields[4]; - String aaReference = fields[5]; - String aaAlternate = fields[6]; - double score = Double.parseDouble(fields[7]); - - currentEntry = chromosome + position; - - // new chromosome + position, store previous entry - if (lastEntry != null && !currentEntry.equals(lastEntry)) { - serializer.serialize(predictions); - scores = new ArrayList<>(); - predictions = null; + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) { + // Skip header + bufferedReader.readLine(); + String[] fields; + String lastEntry = null; + String currentEntry; + List<TranscriptMissenseVariantFunctionalScore> scores = new ArrayList<>(); + MissenseVariantFunctionalScore predictions = null; + String line; + while ((line = bufferedReader.readLine()) != null) { + fields = line.split(","); + String chromosome = fields[0]; + if (".".equalsIgnoreCase(fields[2])) { + // 1,12855835,.,C,A,A,D,0.175 + // skip if invalid position + continue; + } + int position = Integer.parseInt(fields[2]); + String reference = fields[3]; + String alternate = fields[4]; + String aaReference = fields[5]; + String aaAlternate = fields[6]; + double score = Double.parseDouble(fields[7]); + + currentEntry = chromosome + position; + + // new chromosome + position, store previous entry + if (lastEntry != null && !currentEntry.equals(lastEntry)) { + serializer.serialize(predictions); + scores = new ArrayList<>(); + predictions = null; + } + + if (predictions == null) { + predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, REVEL_DATA, scores); + } + + TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", alternate, + aaReference, aaAlternate, score); + scores.add(predictedScore); + lastEntry = chromosome + position; } - if (predictions == null) { - predictions = new MissenseVariantFunctionalScore(chromosome, position, reference, SOURCE, scores); - } - - TranscriptMissenseVariantFunctionalScore predictedScore = new TranscriptMissenseVariantFunctionalScore("", - alternate, aaReference, aaAlternate, score); - scores.add(predictedScore); - lastEntry = 
chromosome + position; + // Serialise last entry + serializer.serialize(predictions); } - // serialise last entry - serializer.serialize(predictions); + logger.info(PARSING_DONE_LOG_MESSAGE, revelFiles.get(0)); + // Close zis.close(); zipFile.close(); inputStream.close(); - bufferedReader.close(); + + logger.info(CATEGORY_BUILDING_DONE_LOG_MESSAGE, dataCategory, dataName); } }
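The REVEL parser above relies on the input CSV being sorted by chromosome and position: consecutive rows for the same locus are collected into a single MissenseVariantFunctionalScore and the group is serialized as soon as the locus key changes. A minimal, self-contained sketch of that grouping pattern (hypothetical Row record and made-up values, not the builder's actual API):

    import java.util.ArrayList;
    import java.util.List;

    public class GroupByLocus {
        // Hypothetical stand-in for one REVEL CSV row: chromosome, position, alternate allele, score
        record Row(String chromosome, int position, String alternate, double score) {}

        public static void main(String[] args) {
            List<Row> rows = List.of(
                    new Row("1", 12855835, "A", 0.175),
                    new Row("1", 12855835, "G", 0.230),  // same locus: accumulate
                    new Row("1", 12855897, "T", 0.051)); // new locus: flush previous group

            String lastKey = null;
            List<Row> group = new ArrayList<>();
            for (Row row : rows) {
                String key = row.chromosome() + ":" + row.position();
                if (lastKey != null && !key.equals(lastKey)) {
                    System.out.println(lastKey + " -> " + group.size() + " alternate(s)"); // serialize group
                    group = new ArrayList<>();
                }
                group.add(row);
                lastKey = key;
            }
            if (!group.isEmpty()) {
                System.out.println(lastKey + " -> " + group.size() + " alternate(s)"); // serialize last group
            }
        }
    }

Note that the builder keys each group by plain concatenation (chromosome + position), so "1" + 12855835 and "11" + 2855835 yield the same string; this is safe only because the file is sorted and keys are compared between consecutive rows. A separator, as in the sketch, makes the key unambiguous.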
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java index cf8351cc54..3a178b4828 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RocksDbManager.java @@ -60,8 +60,11 @@ public RocksDB getDBConnection(String dbLocation) { Options options = new Options().setCreateIfMissing(true); RocksDB db = null; try { + if (!Files.exists(Paths.get(dbLocation))) { + Files.createDirectories(Paths.get(dbLocation)); + } return RocksDB.open(options, dbLocation); - } catch (RocksDBException e) { + } catch (RocksDBException | IOException e) { // do some error handling e.printStackTrace(); System.exit(1); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index a31bd8d5e6..951ea5c530 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -41,11 +41,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE; -import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION; - -//import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*; - /** * Created by fjlopez on 28/09/16. */ @@ -78,11 +73,15 @@ public class ClinVarIndexer extends ClinicalIndexer { private static final String DIPLOTYPE = "Diplotype"; private static final String VARIANT = "Variant"; private static final char CLINICAL_SIGNIFICANCE_SEPARATOR = '/'; + private final Path clinvarXMLFiles; private final Path clinvarSummaryFile; private final Path clinvarVariationAlleleFile; private final Path clinvarEFOFile; + + private final String version; private final String assembly; + private int numberSomaticRecords = 0; private int numberGermlineRecords = 0; private int numberNoDiseaseTrait = 0; @@ -94,15 +93,15 @@ public class ClinVarIndexer extends ClinicalIndexer { private static final Set<ModeOfInheritance> RECESSIVE_TERM_SET = new HashSet<>(Arrays.asList(ModeOfInheritance.biallelic)); - public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, - Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly, - RocksDB rdb) throws IOException { + public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, Path clinvarEFOFile, + String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); this.rdb = rdb; this.clinvarXMLFiles = clinvarXMLFiles; this.clinvarSummaryFile = clinvarSummaryFile; this.clinvarVariationAlleleFile = clinvarVariationAlleleFile; this.clinvarEFOFile = clinvarEFOFile; + this.version = version; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; @@ -310,7 +309,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation String mateVariantString, String clinicalHaplotypeString, Map<String, EFO> traitsToEfoTermsMap) { - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // Create a set to avoid situations like germline;germline;germline List<AlleleOrigin> alleleOrigin = null; if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) { @@ -391,7 +390,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu throws JsonProcessingException { List<Property> additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, null); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..3f6e87b89c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -83,7 +83,7 @@ public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { .setDecomposeMNVs(true); if (genomeSequenceFilePath != null) { - logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); + logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath); 
variantNormalizerConfig.enableLeftAlign(genomeSequenceFilePath.toString()); } else { logger.info("Left alignment is NOT enabled."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index f574133ad7..e3c7ab3ff8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -19,165 +19,159 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.builders.CellBaseBuilder; +import org.opencb.commons.utils.FileUtils; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import static org.opencb.cellbase.lib.EtlCommons.*; + /** * Created by fjlopez on 26/09/16. */ public class ClinicalVariantBuilder extends CellBaseBuilder { - private final Path clinvarXMLFile; - private final Path clinvarSummaryFile; - private final Path clinvarVariationAlleleFile; - private final Path clinvarEFOFile; - private final Path cosmicFile; - private final Path gwasFile; - private final Path dbsnpFile; + private final Path clinicalVariantPath; private final String assembly; - private final Path iarctp53GermlineFile; - private final Path iarctp53SomaticFile; - private final Path iarctp53GermlineReferencesFile; - private final Path iarctp53SomaticReferencesFile; private final Path genomeSequenceFilePath; - private final Path docmFile; - private final Path hgmdFile; - private boolean normalize = true; + private boolean normalize; - public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, - String assembly, CellBaseSerializer serializer) { - this(clinicalVariantFolder.resolve(EtlCommons.CLINVAR_XML_FILE), - clinicalVariantFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE), - clinicalVariantFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE), - clinicalVariantFolder.resolve(EtlCommons.CLINVAR_EFO_FILE), - clinicalVariantFolder.resolve(EtlCommons.COSMIC_FILE), - clinicalVariantFolder.resolve(EtlCommons.GWAS_FILE), - clinicalVariantFolder.resolve(EtlCommons.DBSNP_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_GERMLINE_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_GERMLINE_REFERENCES_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_SOMATIC_FILE), - clinicalVariantFolder.resolve("datasets/" + EtlCommons.IARCTP53_SOMATIC_REFERENCES_FILE), - clinicalVariantFolder.resolve(EtlCommons.DOCM_FILE), - clinicalVariantFolder.resolve(EtlCommons.HGMD_FILE), - normalize, - genomeSequenceFilePath, assembly, serializer); - } + private Path clinvarFullReleaseFilePath; + private Path clinvarSummaryFilePath; + private Path clinvarVariationAlleleFilePath; + private Path clinvarEFOFilePath; 
+ private Path cosmicFilePath; + private Path hgmdFilePath; + private Path gwasFilePath; + private Path gwasDbSnpFilePath; + + private final CellBaseConfiguration configuration; - public ClinicalVariantBuilder(Path clinvarXMLFile, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, - Path clinvarEFOFile, Path cosmicFile, Path gwasFile, Path dbsnpFile, - Path iarctp53GermlineFile, Path iarctp53GermlineReferencesFile, - Path iarctp53SomaticFile, Path iarctp53SomaticReferencesFile, Path docmFile, Path hgmdFile, - boolean normalize, Path genomeSequenceFilePath, String assembly, - CellBaseSerializer serializer) { + public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath, + String assembly, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); - this.clinvarXMLFile = clinvarXMLFile; - this.clinvarSummaryFile = clinvarSummaryFile; - this.clinvarVariationAlleleFile = clinvarVariationAlleleFile; - this.clinvarEFOFile = clinvarEFOFile; - this.cosmicFile = cosmicFile; - this.gwasFile = gwasFile; - this.dbsnpFile = dbsnpFile; - this.iarctp53GermlineFile = iarctp53GermlineFile; - this.iarctp53GermlineReferencesFile = iarctp53GermlineReferencesFile; - this.iarctp53SomaticFile = iarctp53SomaticFile; - this.iarctp53SomaticReferencesFile = iarctp53SomaticReferencesFile; - this.docmFile = docmFile; - this.hgmdFile = hgmdFile; + this.clinicalVariantPath = clinicalVariantFolder; this.normalize = normalize; this.genomeSequenceFilePath = genomeSequenceFilePath; this.assembly = assembly; + this.configuration = configuration; } - public void parse() throws IOException, RocksDBException { + public void check() throws CellBaseException, IOException { + if (checked) { + return; + } + + logger.info(CHECKING_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + + // Sanity check + checkDirectory(clinicalVariantPath, getDataName(CLINICAL_VARIANT_DATA)); + if (!Files.exists(serializer.getOutdir())) { + try { + Files.createDirectories(serializer.getOutdir()); + } catch (IOException e) { + throw new CellBaseException("Error creating folder " + serializer.getOutdir(), e); + } + } + + // Check genome file + logger.info("Checking genome FASTA file ..."); + if (!Files.exists(genomeSequenceFilePath)) { + throw new CellBaseException("Genome file path does not exist " + genomeSequenceFilePath); + } + logger.info(OK_LOG_MESSAGE); + logger.info("Checking index for genome FASTA file ..."); + getIndexFastaReferenceGenome(genomeSequenceFilePath); + logger.info(OK_LOG_MESSAGE); + + // Check ClinVar files + clinvarFullReleaseFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_FULL_RELEASE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarSummaryFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_SUMMARY_FILE_ID, + clinicalVariantPath).toPath(); + clinvarVariationAlleleFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_ALLELE_FILE_ID, + clinicalVariantPath).toPath(); + clinvarEFOFilePath = checkFile(CLINVAR_DATA, configuration.getDownload().getClinvar(), CLINVAR_EFO_TERMS_FILE_ID, + clinicalVariantPath).toPath(); + + // Check COSMIC file + cosmicFilePath = checkFiles(COSMIC_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check HGMD file + hgmdFilePath = checkFiles(HGMD_DATA, clinicalVariantPath, 1).get(0).toPath(); + + // Check GWAS files + gwasFilePath = checkFiles(GWAS_DATA, clinicalVariantPath, 1).get(0).toPath(); + String 
dbSnpFilename = Paths.get(configuration.getDownload().getGwasCatalog().getFiles().get(GWAS_DBSNP_FILE_ID)).getFileName() + .toString(); + gwasDbSnpFilePath = clinicalVariantPath.resolve(dbSnpFilename); + if (!Files.exists(gwasDbSnpFilePath)) { + throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbSnpFilename + " is missing at " + + clinicalVariantPath); + } + if (!Files.exists(clinicalVariantPath.resolve(dbSnpFilename + TBI_EXTENSION))) { + throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + dbSnpFilename + TBI_EXTENSION + + " is missing at " + clinicalVariantPath); + } + + logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA)); + checked = true; + } + + public void parse() throws IOException, RocksDBException, CellBaseException { + check(); + + // Prepare ClinVar chunk files before building (if necessary) + Path chunksPath = serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY); + if (Files.notExists(chunksPath)) { + Files.createDirectories(chunksPath); + logger.info("Splitting ClinVar file {} into {} ...", clinvarFullReleaseFilePath, chunksPath); + splitClinvar(clinvarFullReleaseFilePath, chunksPath); + logger.info(OK_LOG_MESSAGE); + } RocksDB rdb = null; Options dbOption = null; String dbLocation = null; try { - Object[] dbConnection = getDBConnection(clinvarXMLFile.getParent().toString() + "/integration.idx", true); + Object[] dbConnection = getDBConnection(clinicalVariantPath.toString() + "/integration.idx", true); rdb = (RocksDB) dbConnection[0]; dbOption = (Options) dbConnection[1]; dbLocation = (String) dbConnection[2]; // COSMIC - // IMPORTANT: COSMIC must be indexed first (before ClinVar, IARC TP53, DOCM, HGMD,...)!!! - if (this.cosmicFile != null && Files.exists(this.cosmicFile)) { - CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, normalize, genomeSequenceFilePath, assembly, rdb); - cosmicIndexer.index(); - } else { - logger.warn("Cosmic file {} missing. Skipping Cosmic data", cosmicFile); - } + // IMPORTANT: COSMIC must be indexed first (before ClinVar, HGMD,...)!!! + CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFilePath, configuration.getDownload().getCosmic().getVersion(), + normalize, genomeSequenceFilePath, assembly, rdb); + cosmicIndexer.index(); // ClinVar - if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null - && this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile) - && Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) { - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, - clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb); - clinvarIndexer.index(); - } else { - logger.warn("One or more of required ClinVar files are missing. 
Skipping ClinVar data.\n" - + "Please, ensure that these two files exist:\n" - + "{}\n" - + "{}", this.clinvarXMLFile.toString(), this.clinvarSummaryFile.toString()); - } - - // IARC TP53 - if (this.iarctp53GermlineFile != null && this.iarctp53SomaticFile != null - && Files.exists(iarctp53GermlineFile) && Files.exists(iarctp53SomaticFile)) { - IARCTP53Indexer iarctp53Indexer = new IARCTP53Indexer(iarctp53GermlineFile, - iarctp53GermlineReferencesFile, iarctp53SomaticFile, iarctp53SomaticReferencesFile, - normalize, genomeSequenceFilePath, assembly, rdb); - iarctp53Indexer.index(); - } else { - logger.warn("One or more of required IARCTP53 files are missing. Skipping IARCTP53 data."); - } - - // DOCM - if (this.docmFile != null && Files.exists(docmFile)) { - DOCMIndexer docmIndexer = new DOCMIndexer(docmFile, normalize, genomeSequenceFilePath, assembly, rdb); - docmIndexer.index(); - } else { - logger.warn("The DOCM file {} is missing. Skipping DOCM data.", docmFile); - } + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(serializer.getOutdir().resolve(CLINVAR_CHUNKS_SUBDIRECTORY), + clinvarSummaryFilePath, clinvarVariationAlleleFilePath, clinvarEFOFilePath, configuration.getDownload().getClinvar() + .getVersion(), normalize, genomeSequenceFilePath, assembly, rdb); + clinvarIndexer.index(); // HGMD - if (this.hgmdFile != null && Files.exists(hgmdFile)) { - HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, normalize, genomeSequenceFilePath, assembly, rdb); - hgmdIndexer.index(); - } else { - logger.warn("The HGMD file {} is missing. Skipping HGMD data.", hgmdFile); - } + HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFilePath, configuration.getDownload().getHgmd().getVersion(), normalize, + genomeSequenceFilePath, assembly, rdb); + hgmdIndexer.index(); // GWAS catalog - if (gwasFile != null && Files.exists(gwasFile)) { - if (dbsnpFile != null && Files.exists(dbsnpFile)) { - Path tabixFile = Paths.get(dbsnpFile.toAbsolutePath() + ".tbi"); - if (tabixFile != null && Files.exists(tabixFile)) { - GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbsnpFile, genomeSequenceFilePath, assembly, rdb); - gwasIndexer.index(); - } else { - logger.warn("The dbSNP tabix file {} is missing. Skipping GWAS catalog data.", tabixFile); - } - } else { - logger.warn("The dbSNP file {} is missing. Skipping GWAS catalog data.", dbsnpFile); - } - } else { - logger.warn("The GWAS catalog file {} is missing. Skipping GWAS catalog data.", gwasFile); - } + GwasIndexer gwasIndexer = new GwasIndexer(gwasFilePath, gwasDbSnpFilePath, genomeSequenceFilePath, assembly, rdb); + gwasIndexer.index(); + // Serialize serializeRDB(rdb); closeIndex(rdb, dbOption, dbLocation); serializer.close(); @@ -186,7 +180,6 @@ public void parse() throws IOException, RocksDBException { serializer.close(); throw e; } - } private void serializeRDB(RocksDB rdb) throws IOException { @@ -223,7 +216,7 @@ private Variant parseVariantFromVariantId(String variantId) { return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]); } } catch (Exception e) { - logger.warn(e.getMessage() + ". Impossible to create the variant object from the variant ID: " + variantId); + logger.warn("{}. 
Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId); return null; } } @@ -275,4 +268,53 @@ private Object[] getDBConnection(String dbLocation, boolean forceCreate) { } + private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException { + PrintWriter pw = null; + try (BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath)) { + StringBuilder header = new StringBuilder(); + boolean beforeEntry = true; + boolean inEntry = false; + int count = 0; + int chunk = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.trim().startsWith("<ClinVarSet ")) { + inEntry = true; + beforeEntry = false; + if (count % 10000 == 0) { + pw = new PrintWriter(new BufferedWriter(new FileWriter(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()))); + pw.println(header.toString().trim()); + } + count++; + } else if (beforeEntry) { + header.append(line).append("\n"); + } + if (inEntry) { + pw.println(line); + } + if (line.trim().startsWith("</ClinVarSet>")) { + inEntry = false; + if (count % 10000 == 0) { + if (pw != null) { + pw.print("</ReleaseSet>"); + pw.close(); + } + chunk++; + } + } + } + if (pw != null) { + pw.print("</ReleaseSet>"); + pw.close(); + } + } finally { + if (pw != null) { + pw.close(); + } + } + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index f8d2f16d15..51be2b6f31 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -37,12 +37,12 @@ public class CosmicIndexer extends ClinicalIndexer { private final Path cosmicFile; + private final String version; private final String assembly; + private Pattern mutationGRCh37GenomePositionPattern; private Pattern snvPattern; - private static final String COSMIC_VERSION = "v95"; - private static final int GENE_NAMES_COLUMN = 0; private static final int HGNC_COLUMN = 3; private static final int PRIMARY_SITE_COLUMN = 7; @@ -84,10 +84,12 @@ public class CosmicIndexer extends ClinicalIndexer { private int rocksDBNewVariants = 0; private int rocksDBUpdateVariants = 0; - public CosmicIndexer(Path cosmicFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { + public CosmicIndexer(Path cosmicFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) + throws IOException { super(genomeSequenceFilePath); this.cosmicFile = cosmicFile; + this.version = version; this.normalize = normalize; this.assembly = assembly; this.rdb = rdb; @@ -469,7 +471,7 @@ private EvidenceEntry buildCosmic(String[] fields) { String id = fields[ID_COLUMN]; String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id; - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, COSMIC_VERSION, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, version, null); SomaticInformation somaticInformation = getSomaticInformation(fields); List<GenomicFeature> genomicFeatureList = getGenomicFeature(fields); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java index b77f238432..a150e042dd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/DOCMIndexer.java @@ -178,7 +178,7 @@ private VariantAnnotation parseVariantAnnotation(Map<String, Object> map) { List<String> bibliography = getBibliography(evidenceEntry); bibliography.add(PMID + diseaseMap.get(SOURCE_PUBMED_ID)); } else { - EvidenceSource evidenceSource = new 
EvidenceSource(EtlCommons.DOCM_DATA, null, null); + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.DOCM_NAME, null, null); HeritableTrait heritableTrait = new HeritableTrait((String) diseaseMap.get(DISEASE), null); List genomicFeatureList = getGenomicFeature(map); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java index d2ce12dee8..f132f4b9e8 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java @@ -36,15 +36,17 @@ */ public class HGMDIndexer extends ClinicalIndexer { private final Path hgmdFile; + private final String version; private final String assembly; - public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) + public HGMDIndexer(Path hgmdFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); - this.rdb = rdb; - this.assembly = assembly; this.hgmdFile = hgmdFile; + this.version = version; this.normalize = normalize; + this.assembly = assembly; + this.rdb = rdb; } public void index() throws RocksDBException, IOException { @@ -93,7 +95,7 @@ private void parseHgmdInfo(Variant variant) { } // Source - entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, "2020.3", "2020")); + entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, null)); // Assembly entry.setAssembly(assembly); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java new file mode 100644 index 0000000000..f6183e3040 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java @@ -0,0 +1,68 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders.utils; + +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class RocksDBUtils { + + public static void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { + if (rdb != null) { + rdb.close(); + } + if (dbOption != null) { + dbOption.dispose(); + } + if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { + org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); + } + } + + public static Object[] getDBConnection(String dbLocation, boolean forceCreate) throws RocksDBException { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. 
+ RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. + Options options = new Options().setCreateIfMissing(true); + +// options.setMaxBackgroundCompactions(4); +// options.setMaxBackgroundFlushes(1); +// options.setCompressionType(CompressionType.NO_COMPRESSION); +// options.setMaxOpenFiles(-1); +// options.setIncreaseParallelism(4); +// options.setCompactionStyle(CompactionStyle.LEVEL); +// options.setLevelCompactionDynamicLevelBytes(true); + + RocksDB db; + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + + return new Object[]{db, options, dbLocation, indexingNeeded}; + } +}
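The new RocksDBUtils helper returns an untyped Object[] holding, in order, the RocksDB handle, the Options object, the database location and the indexingNeeded flag, so callers have to unpack it by position. A short usage sketch under that assumption (hypothetical path, error handling elided):

    import org.opencb.cellbase.lib.builders.utils.RocksDBUtils;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    import java.io.IOException;

    public class RocksDbUsageExample {
        public static void main(String[] args) throws RocksDBException, IOException {
            // Positions follow the array built in getDBConnection:
            // [0] RocksDB handle, [1] Options, [2] dbLocation, [3] indexingNeeded
            Object[] connection = RocksDBUtils.getDBConnection("/tmp/integration.idx", true);
            RocksDB rdb = (RocksDB) connection[0];
            Options options = (Options) connection[1];
            String dbLocation = (String) connection[2];
            boolean indexingNeeded = (Boolean) connection[3];
            try {
                if (indexingNeeded) {
                    rdb.put("key".getBytes(), "value".getBytes());
                }
                byte[] value = rdb.get("key".getBytes());
                System.out.println(value == null ? "missing" : new String(value));
            } finally {
                // Closes the handle, disposes the options and deletes the index directory
                RocksDBUtils.closeIndex(rdb, options, dbLocation);
            }
        }
    }

A small holder class would avoid the positional casts; the Object[] shape is presumably kept so the utility stays call-compatible with the private getDBConnection that ClinicalVariantBuilder already uses.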
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index a4ade6603e..7ac8bcf800 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -24,10 +24,11 @@ import com.fasterxml.jackson.databind.ObjectWriter; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.Logger; @@ -46,11 +47,15 @@ import java.time.LocalDateTime; import java.util.*; -public class AbstractDownloadManager { +import static org.opencb.cellbase.lib.EtlCommons.*; - private static final String DGV_NAME = "DGV"; +public abstract class AbstractDownloadManager { - private static final String GNOMAD_NAME = "gnomAD"; + protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} ..."; + protected static final String DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}"; + protected static final String CATEGORY_DOWNLOADING_LOG_MESSAGE = "Downloading {}/{} ..."; + protected static final String CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE = "Ok. {}/{}"; + protected static final String DOWNLOADING_FROM_TO_LOG_MESSAGE = "Downloading {} to {} ..."; protected String species; protected String assembly; @@ -66,15 +71,23 @@ public class AbstractDownloadManager { protected Path downloadFolder; protected Path downloadLogFolder; // /download/log protected Path buildFolder; // /_/generated-json + + protected ObjectReader dataSourceReader; + protected ObjectWriter dataSourceWriter; + protected Logger logger; - public AbstractDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) + protected AbstractDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) throws IOException, CellBaseException { this.species = species; this.assembly = assembly; this.outdir = outdir; this.configuration = configuration; + ObjectMapper jsonObjectMapper = new ObjectMapper(); + this.dataSourceReader = jsonObjectMapper.readerFor(DataSource.class); + this.dataSourceWriter = jsonObjectMapper.writerFor(DataSource.class); + this.init(); } @@ -104,47 +117,22 @@ private void init() throws CellBaseException, IOException { // Prepare outdir Path speciesFolder = outdir.resolve(speciesShortName + "_" + assemblyConfiguration.getName().toLowerCase()); downloadFolder = outdir.resolve(speciesFolder + "/download"); - logger.info("Creating download dir " + downloadFolder.toString()); + logger.info("Creating download dir {}", downloadFolder); Files.createDirectories(downloadFolder); downloadLogFolder = outdir.resolve(speciesFolder + "/download/log"); - logger.info("Creating download log dir " + downloadLogFolder.toString()); + logger.info("Creating download log dir {}", downloadLogFolder); Files.createDirectories(downloadLogFolder); // /_/generated_json buildFolder = outdir.resolve(speciesFolder + "/generated_json"); - logger.info("Creating build dir " + buildFolder.toString()); + logger.info("Creating build dir {}", buildFolder); Files.createDirectories(buildFolder); - logger.info("Processing species " + speciesConfiguration.getScientificName()); + logger.info("Processing species {}", speciesConfiguration.getScientificName()); } - public List<DownloadFile> download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { - return null; - } - -// public DownloadFile downloadStructuralVariants() throws IOException, InterruptedException { -// if (!speciesHasInfoToDownload(speciesConfiguration, "svs")) { -// return null; -// } -// if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { -// logger.info("Downloading DGV data ..."); -// -// Path structuralVariantsFolder = downloadFolder.resolve(EtlCommons.STRUCTURAL_VARIANTS_FOLDER); -// Files.createDirectories(structuralVariantsFolder); -// String sourceFilename = (assemblyConfiguration.getName().equalsIgnoreCase("grch37") ? 
"GRCh37_hg19" : "GRCh38_hg38") -// + "_variants_2016-05-15.txt"; -// String url = configuration.getDownload().getDgv().getHost() + "/" + sourceFilename; -// saveVersionData(EtlCommons.STRUCTURAL_VARIANTS_DATA, DGV_NAME, getDGVVersion(sourceFilename), getTimeStamp(), -// Collections.singletonList(url), structuralVariantsFolder.resolve(EtlCommons.DGV_VERSION_FILE)); -// return downloadFile(url, structuralVariantsFolder.resolve(EtlCommons.DGV_FILE).toString()); -// } -// return null; -// } - -// private String getDGVVersion(String sourceFilename) { -// return sourceFilename.split("\\.")[0].split("_")[3]; -// } + public abstract List download() throws IOException, InterruptedException, CellBaseException; protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) { boolean hasInfo = true; @@ -155,32 +143,148 @@ protected boolean speciesHasInfoToDownload(SpeciesConfiguration sp, String info) return hasInfo; } - protected String getTimeStamp() { - return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime()); + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String data, String chromosome, + Path outPath) throws IOException, InterruptedException, CellBaseException { + String versionFilename = getDataVersionFilename(data); + + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); + + // Save data source + saveDataSource(data, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; + } + + @Deprecated + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveDataSource(props, fileId, name, category, null, versionFilename, outPath); + } + + @Deprecated + protected DownloadFile downloadAndSaveDataSource(DownloadProperties.URLProperties props, String fileId, String name, String category, + String chromosome, String versionFilename, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadDataSource(props, fileId, chromosome, outPath); + + // Save data source + saveDataSource(name, category, props.getVersion(), getTimeStamp(), Collections.singletonList(downloadFile.getUrl()), + outPath.resolve(versionFilename)); + + return downloadFile; + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + Path outPath) throws IOException, InterruptedException, CellBaseException { + return downloadAndSaveEnsemblDataSource(ensemblProps, fileId, data, null, outPath); + } + + protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String data, + String chromosome, Path outPath) + throws IOException, InterruptedException, CellBaseException { + // Download file + DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath); + + // Save data 
+        saveDataSource(data, "(" + getDataName(ENSEMBL_DATA) + " " + ensemblVersion + ")", getTimeStamp(),
+                Collections.singletonList(downloadFile.getUrl()), outPath.resolve(getDataVersionFilename(data)));
+
+        return downloadFile;
+    }
+
+    @Deprecated
+    protected DownloadFile downloadAndSaveEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String name,
+                                                            String category, String chromosome, String versionFilename, Path outPath)
+            throws IOException, InterruptedException, CellBaseException {
+        // Download file
+        DownloadFile downloadFile = downloadEnsemblDataSource(ensemblProps, fileId, chromosome, outPath);
+
+        // Save data source
+        saveDataSource(name, category, "(Ensembl " + ensemblVersion + ")", getTimeStamp(), Collections.singletonList(downloadFile.getUrl()),
+                outPath.resolve(versionFilename));
+
+        return downloadFile;
+    }
+
+    protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId, Path outPath)
+            throws IOException, InterruptedException, CellBaseException {
+        return downloadDataSource(props, fileId, null, outPath);
+    }
+
+    protected DownloadFile downloadDataSource(DownloadProperties.URLProperties props, String fileId,
+                                              String chromosome, Path outPath)
+            throws IOException, InterruptedException, CellBaseException {
+        String url = EtlCommons.getUrl(props, fileId, species, assembly, chromosome);
+        File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile();
+        logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile);
+        DownloadFile downloadFile = downloadFile(url, outFile.toString());
+        logger.info(OK_LOG_MESSAGE);
+        return downloadFile;
     }
 
-    protected void saveVersionData(String data, String name, String version, String date, List<String> url, Path outputFilePath)
+    protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, Path outPath)
+            throws IOException, InterruptedException, CellBaseException {
+        return downloadEnsemblDataSource(ensemblProps, fileId, null, outPath);
+    }
+
+    protected DownloadFile downloadEnsemblDataSource(DownloadProperties.EnsemblProperties ensemblProps, String fileId, String chromosome,
+                                                     Path outPath) throws IOException, InterruptedException, CellBaseException {
+        String url = EtlCommons.getEnsemblUrl(ensemblProps, ensemblRelease, fileId, speciesShortName, assemblyConfiguration.getName(),
+                chromosome);
+        File outFile = outPath.resolve(getFilenameFromUrl(url)).toFile();
+        logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, outFile);
+        DownloadFile downloadFile = downloadFile(url, outFile.toString());
+        logger.info(OK_LOG_MESSAGE);
+        return downloadFile;
+    }
+
+    protected void saveDataSource(String data, String version, String date, List<String> urls, Path versionFilePath)
+            throws IOException, CellBaseException {
+        String name = getDataName(data);
+        String category = getDataCategory(data);
+        DataSource dataSource = new DataSource(name, category, version, date, urls);
+
+        if (StringUtils.isEmpty(version)) {
+            logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date);
+            dataSource.setVersion(date);
+        }
+
+        dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource);
+    }
+
+    @Deprecated
+    protected void saveDataSource(String name, String category, String version, String date, List<String> urls, Path versionFilePath)
             throws IOException {
-        Map<String, Object> versionDataMap = new HashMap<>();
-        versionDataMap.put("data", data);
-        versionDataMap.put("name", name);
-        versionDataMap.put("version", version);
-        versionDataMap.put("date", date);
-        versionDataMap.put("url", url);
+        DataSource dataSource = new DataSource(name, category, version, date, urls);
 
-        ObjectMapper jsonObjectMapper = new ObjectMapper();
-        jsonObjectMapper.writeValue(outputFilePath.toFile(), versionDataMap);
+        if (StringUtils.isEmpty(version)) {
+            logger.warn("Version missing for data source {}/{}, using the date as version: {}", category, name, date);
+            dataSource.setVersion(date);
+        }
+
+        dataSourceWriter.writeValue(versionFilePath.toFile(), dataSource);
+    }
+
+    protected String getTimeStamp() {
+        return new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime());
     }
 
     protected String getLine(Path readmePath, int lineNumber) {
         Files.exists(readmePath);
-        try {
-            BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset());
+        try (BufferedReader reader = Files.newBufferedReader(readmePath, Charset.defaultCharset())) {
             String line = null;
             for (int i = 0; i < lineNumber; i++) {
                 line = reader.readLine();
             }
-            reader.close();
             return line;
         } catch (IOException e) {
             e.printStackTrace();
@@ -216,14 +320,12 @@ protected String getPhylo(SpeciesConfiguration sp) {
         }
     }
 
-
-
-    protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException {
+    protected DownloadFile downloadFile(String url, String outputFileName) throws IOException, InterruptedException, CellBaseException {
        return downloadFile(url, outputFileName, null);
     }
 
     protected DownloadFile downloadFile(String url, String outputFileName, List<String> wgetAdditionalArgs)
-            throws IOException, InterruptedException {
+            throws IOException, InterruptedException, CellBaseException {
         DownloadFile downloadFileInfo = new DownloadFile(url, outputFileName, Timestamp.valueOf(LocalDateTime.now()).toString());
         Long startTime = System.currentTimeMillis();
         if (Paths.get(outputFileName).toFile().exists()) {
@@ -251,7 +353,7 @@ private void setDownloadStatusAndMessage(String outputFileName, DownloadFile dow
             } else {
                 downloadFile.setStatus(DownloadFile.Status.ERROR);
                 downloadFile.setMessage("Expected downloaded file size " + downloadFile.getExpectedFileSize()
-                    + ", Actual file size " + downloadFile.getActualFileSize());
+                        + ", Actual file size " + downloadFile.getActualFileSize());
             }
         } else {
             downloadFile.setMessage("See full error message in " + outputLog);
@@ -277,54 +379,42 @@ private boolean validateDownloadFile(DownloadFile downloadFile, String outputFil
     private long getExpectedFileSize(String outputFileLog) {
         try (BufferedReader reader = new BufferedReader(new FileReader(outputFileLog))) {
-            String line = null;
+            String line;
             while ((line = reader.readLine()) != null) {
                 // looking for: Length: 13846591 (13M)
                 if (line.startsWith("Length:")) {
                     String[] parts = line.split("\\s");
-                    return Long.valueOf(parts[1]);
+                    return Long.parseLong(parts[1]);
                 }
             }
         } catch (Exception e) {
-            logger.info("Error getting expected file size " + e.getMessage());
+            logger.info("Error getting expected file size {}", e.getMessage());
         }
         return -1;
     }
 
-    protected String getVersionFromVersionLine(Path path, String tag) {
-        Files.exists(path);
-        try {
-            BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset());
-            String line = reader.readLine();
-            // There shall be a line at the README.txt containing the version.
-            // e.g. The files in the current directory contain the data corresponding to the latest release
-            // (version 4.0, April 2016). ...
-            while (line != null) {
-                // tag specifies a certain string that must be found within the line supposed to contain the version
-                // info
-                if (line.contains(tag)) {
-                    String version = line.split("\\(")[1].split("\\)")[0];
-                    reader.close();
-                    return version;
-                }
-                line = reader.readLine();
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-        return null;
-    }
-
     private String getEnsemblURL(SpeciesConfiguration sp) {
         // We need to find which is the correct Ensembl host URL.
         // This can differ depending on whether the species is a vertebrate.
-        String ensemblHostUrl;
         if (configuration.getSpecies().getVertebrates().contains(sp)) {
-            ensemblHostUrl = configuration.getDownload().getEnsembl().getUrl().getHost();
+            return configuration.getDownload().getEnsembl().getUrl().getHost();
+        } else {
+            return configuration.getDownload().getEnsemblGenomes().getUrl().getHost();
+        }
+    }
+
+    @Deprecated
+    protected String getUrl(DownloadProperties.URLProperties props, String fileId) throws CellBaseException {
+        if (!props.getFiles().containsKey(fileId)) {
+            throw new CellBaseException("File ID " + fileId + " is missing in the DownloadProperties.URLProperties within the CellBase"
+                    + " configuration file");
+        }
+        String filesValue = props.getFiles().get(fileId);
+        if (filesValue.startsWith("https://") || filesValue.startsWith("http://") || filesValue.startsWith("ftp://")) {
+            return filesValue;
         } else {
-            ensemblHostUrl = configuration.getDownload().getEnsemblGenomes().getUrl().getHost();
+            return props.getHost() + filesValue;
         }
-        return ensemblHostUrl;
     }
 }
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java
new file mode 100644
index 0000000000..721a02b599
--- /dev/null
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AlphaMissenseDownloadManager.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opencb.cellbase.lib.download;
+
+import org.opencb.cellbase.core.config.CellBaseConfiguration;
+import org.opencb.cellbase.core.exception.CellBaseException;
+import org.opencb.cellbase.lib.EtlCommons;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.List;
+
+import static org.opencb.cellbase.lib.EtlCommons.*;
+
+public class AlphaMissenseDownloadManager extends AbstractDownloadManager {
+
+    public AlphaMissenseDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
+            throws IOException, CellBaseException {
+        super(species, assembly, targetDirectory, configuration);
+    }
+
+    @Override
+    public List<DownloadFile> download() throws IOException, InterruptedException, CellBaseException {
+        logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ALPHAMISSENSE_DATA));
+
+        Path alphaMissensePath = downloadFolder.resolve(EtlCommons.PROTEIN_SUBSTITUTION_PREDICTION_DATA);
+        Files.createDirectories(alphaMissensePath);
+
+        // Download AlphaMissense file
+        DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getAlphaMissense(), ALPHAMISSENSE_FILE_ID,
+                ALPHAMISSENSE_DATA, alphaMissensePath);
+
+        logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ALPHAMISSENSE_DATA));
+
+        return Collections.singletonList(downloadFile);
+    }
+}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java
index e0cae1250e..0b0d09f412 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/CaddDownloadManager.java
@@ -18,7 +18,6 @@
 
 import org.opencb.cellbase.core.config.CellBaseConfiguration;
 import org.opencb.cellbase.core.exception.CellBaseException;
-import org.opencb.cellbase.lib.EtlCommons;
 
 import java.io.IOException;
 import java.nio.file.Files;
@@ -26,36 +25,36 @@
 import java.util.Collections;
 import java.util.List;
 
+import static org.opencb.cellbase.lib.EtlCommons.*;
+
 public class CaddDownloadManager extends AbstractDownloadManager {
 
-    private static final String CADD_NAME = "CADD";
 
     public CaddDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
             throws IOException, CellBaseException {
         super(species, assembly, targetDirectory, configuration);
     }
 
     @Override
-    public List<DownloadFile> download() throws IOException, InterruptedException {
-        return Collections.singletonList(downloadCaddScores());
-    }
-
-    public DownloadFile downloadCaddScores() throws IOException, InterruptedException {
-        if (!speciesHasInfoToDownload(speciesConfiguration, "variation_functional_score")) {
-            return null;
+    public List<DownloadFile> download() throws IOException, InterruptedException, CellBaseException {
+        logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA));
+
+        if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_FUNCTIONAL_SCORE_DATA)
+                || !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
+            logger.info("{}/{} not supported for species {}", getDataCategory(CADD_DATA), getDataName(CADD_DATA),
+                    speciesConfiguration.getScientificName());
+            return Collections.emptyList();
         }
-        if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
-            logger.info("Downloading CADD scores information ...");
-            Path variationFunctionalScoreFolder = downloadFolder.resolve("variation_functional_score");
-            Files.createDirectories(variationFunctionalScoreFolder);
+        // Create the CADD download path
+        Path caddDownloadPath = downloadFolder.resolve(VARIATION_FUNCTIONAL_SCORE_DATA).resolve(CADD_DATA);
+        Files.createDirectories(caddDownloadPath);
 
-            // Downloads CADD scores
-            String url = configuration.getDownload().getCadd().getHost();
+        // Download CADD and save data source
+        DownloadFile downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCadd(), CADD_FILE_ID, CADD_DATA,
+                caddDownloadPath);
 
-            saveVersionData(EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, CADD_NAME, url.split("/")[5], getTimeStamp(),
-                    Collections.singletonList(url), variationFunctionalScoreFolder.resolve("caddVersion.json"));
-            return downloadFile(url, variationFunctionalScoreFolder.resolve("whole_genome_SNVs.tsv.gz").toString());
-        }
-        return null;
+        logger.info(CATEGORY_DOWNLOADING_DONE_LOG_MESSAGE, getDataCategory(CADD_DATA), getDataName(CADD_DATA));
+
+        return Collections.singletonList(downloadFile);
     }
 }
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
index 580a855a19..9fd0e7562c 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
@@ -20,26 +20,18 @@
 import org.opencb.cellbase.core.config.DownloadProperties;
 import org.opencb.cellbase.core.exception.CellBaseException;
 import org.opencb.cellbase.lib.EtlCommons;
-import org.opencb.commons.utils.FileUtils;
 
-import javax.ws.rs.client.Client;
-import javax.ws.rs.client.ClientBuilder;
-import javax.ws.rs.client.WebTarget;
-import java.io.*;
-import java.net.URI;
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
-import java.util.Map;
 
-public class ClinicalDownloadManager extends AbstractDownloadManager {
-
-    private static final String CLINVAR_NAME = "ClinVar";
-    private static final String GWAS_NAME = "GWAS catalog";
-    private static final String IARCTP53_NAME = "IARC TP53 Database";
+import static org.opencb.cellbase.lib.EtlCommons.*;
 
+public class ClinicalDownloadManager extends AbstractDownloadManager {
 
     public ClinicalDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration)
             throws IOException, CellBaseException {
@@ -47,204 +39,63 @@ public ClinicalDownloadManager(String species, String assembly, Path outdir, Cel
     }
 
     @Override
-    public List<DownloadFile> download() throws IOException, InterruptedException {
+    public List<DownloadFile> download() throws IOException, InterruptedException, CellBaseException {
         List<DownloadFile> downloadFiles = new ArrayList<>();
         downloadFiles.addAll(downloadClinical());
         return downloadFiles;
     }
 
-    public List<DownloadFile> downloadClinical() throws IOException, InterruptedException {
-        if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
-//            if (assemblyConfiguration.getName() == null) {
-//                throw new ParameterException("Assembly must be provided for downloading clinical variants data."
-//                        + " Please, specify either --assembly GRCh37 or --assembly GRCh38");
-//            }
-
-            logger.info("Downloading clinical information ...");
-
-            String url;
-            List<DownloadFile> downloadFiles = new ArrayList<>();
-
-            Path clinicalFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER);
-            Files.createDirectories(clinicalFolder);
-            logger.info("\t\tDownloading ClinVar files ...");
-
-            List<String> clinvarUrls = new ArrayList<>(3);
-            url = configuration.getDownload().getClinvar().getHost();
-
-            downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE).toString()));
-            clinvarUrls.add(url);
-
-            url = configuration.getDownload().getClinvarEfoTerms().getHost();
-            downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_EFO_FILE).toString()));
-            clinvarUrls.add(url);
-
-            url = configuration.getDownload().getClinvarSummary().getHost();
-            downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE).toString()));
-            clinvarUrls.add(url);
-
-            url = configuration.getDownload().getClinvarVariationAllele().getHost();
-            downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString()));
-            clinvarUrls.add(url);
-            saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls,
-                    clinicalFolder.resolve("clinvarVersion.json"));
-
-            // Gwas catalog
-            logger.info("\t\tDownloading GWAS catalog file ...");
-            DownloadProperties.URLProperties gwasCatalog = configuration.getDownload().getGwasCatalog();
-            url = gwasCatalog.getHost();
-            downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString()));
-            saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(),
-                    Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json"));
-
-//            List<String> hgvsList = getDocmHgvsList();
-//            if (!hgvsList.isEmpty()) {
-//                downloadDocm(hgvsList, clinicalFolder.resolve(EtlCommons.DOCM_FILE));
-//                downloadFiles.add(downloadFile(configuration.getDownload().getDocmVersion().getHost(),
-//                        clinicalFolder.resolve("docmIndex.html").toString()));
-//                saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.DOCM_NAME,
-//                        getDocmVersion(clinicalFolder.resolve("docmIndex.html")), getTimeStamp(),
-//                        Arrays.asList(configuration.getDownload().getDocm().getHost() + "v1/variants.json",
-//                                configuration.getDownload().getDocm().getHost() + "v1/variants/{hgvs}.json"),
-//                        clinicalFolder.resolve("docmVersion.json"));
-//            } else {
-//                logger.warn("No DOCM variants found for assembly {}. Please double-check that this is the correct "
-//                        + "assembly", assemblyConfiguration.getName());
-//            }
-
-            // I am only able to download these files manually
-//            if (assemblyConfiguration.getName().equalsIgnoreCase("grch38")) {
-//                url = configuration.getDownload().getIarctp53().getHost();
-//                downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.IARCTP53_FILE).toString()));
-//
-//                ZipFile zipFile = new ZipFile(clinicalFolder.resolve(EtlCommons.IARCTP53_FILE).toString());
-//                Enumeration<? extends ZipEntry> entries = zipFile.entries();
-//                while (entries.hasMoreElements()) {
-//                    ZipEntry entry = entries.nextElement();
-//                    File entryDestination = new File(clinicalFolder.toFile(), entry.getName());
-//                    if (entry.isDirectory()) {
-//                        entryDestination.mkdirs();
-//                    } else {
-//                        entryDestination.getParentFile().mkdirs();
-//                        InputStream in = zipFile.getInputStream(entry);
-//                        OutputStream out = new FileOutputStream(entryDestination);
-//                        IOUtils.copy(in, out);
-//                        IOUtils.closeQuietly(in);
-//                        out.close();
-//                    }
-//                }
-//                saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, IARCTP53_NAME,
-//                        getVersionFromVersionLine(clinicalFolder.resolve("Disclaimer.txt"),
-//                                "The version of the database should be identified"), getTimeStamp(),
-//                        Collections.singletonList(url), clinicalFolder.resolve("iarctp53Version.json"));
-//            }
-
-            if (Files.notExists(clinicalFolder.resolve("clinvar_chunks"))) {
-                Files.createDirectories(clinicalFolder.resolve("clinvar_chunks"));
-                splitClinvar(clinicalFolder.resolve(EtlCommons.CLINVAR_XML_FILE), clinicalFolder.resolve("clinvar_chunks"));
-            }
-
-            return downloadFiles;
-        }
-        return null;
-    }
-
-    private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException {
-        BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath);
-        PrintWriter pw = null;
-        StringBuilder header = new StringBuilder();
-        boolean beforeEntry = true;
-        boolean inEntry = false;
-        int count = 0;
-        int chunk = 0;
-        String line;
-        while ((line = br.readLine()) != null) {
-            if (line.trim().startsWith("<ClinVarSet ")) {
-                inEntry = true;
-                beforeEntry = false;
-                if (count % 10000 == 0) {
-                    pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()));
-                    pw.println(header.toString().trim());
-                }
-                count++;
-            }
-            if (beforeEntry) {
-                header.append(line).append("\n");
-            }
-            if (inEntry) {
-                pw.println(line);
-            }
-            if (line.trim().startsWith("</ClinVarSet>")) {
-                inEntry = false;
-                if (count % 10000 == 0) {
-                    pw.print("</ReleaseSet>");
-                    pw.close();
-                    chunk++;
-                }
-            }
+    public List<DownloadFile> downloadClinical() throws IOException, InterruptedException, CellBaseException {
+        logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CLINICAL_VARIANT_DATA));
+        if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
+            logger.info("{} not supported for the species {}", getDataName(CLINICAL_VARIANT_DATA),
+                    speciesConfiguration.getScientificName());
+            return Collections.emptyList();
         }
-        pw.print("</ReleaseSet>");
-        pw.close();
-        br.close();
-    }
-
-    private String getDocmVersion(Path docmIndexHtml) {
-        return getVersionFromVersionLine(docmIndexHtml, "