diff --git a/conf/test.config b/conf/test.config index 8878bf57..c826b407 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,7 +28,7 @@ params { input = "${projectDir}/assets/test/samplesheet_s3.csv" // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.phiXspike.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz" accession = "GCA_922984935.2" taxon = "Meles meles" diff --git a/conf/test_full.config b/conf/test_full.config index a86e0050..d31328dc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -20,7 +20,7 @@ params { input = "${projectDir}/assets/test_full/full_samplesheet.csv" // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz" accession = "GCA_927399515.1" taxon = "Laetiporus sulphureus" diff --git a/conf/test_raw.config b/conf/test_raw.config index 14325e44..b93811a0 100644 --- a/conf/test_raw.config +++ b/conf/test_raw.config @@ -29,7 +29,7 @@ params { align = true // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz" accession = "GCA_922984935.2" taxon = "Meles meles" diff --git a/docs/usage.md b/docs/usage.md index 0bd4a94a..b09fb91a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -101,6 +101,43 @@ For instance: --busco path-to-databases/busco/ --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10 ``` +### BUSCO database path format + +**Important**: The `--busco` parameter must point to the directory containing the `lineages/` subdirectory, **NOT** to the `lineages/` directory itself. 
BUSCO databases are always directories, never individual files. + +```bash +# ✅ Correct - points to the parent directory +--busco /path/to/busco_downloads/ + +# ❌ Common mistake - includes /lineages at the end +--busco /path/to/busco_downloads/lineages/ + +# ❌ Another common mistake - points to a specific lineage +--busco /path/to/busco_downloads/lineages/eukaryota_odb10/ +``` + +The pipeline will automatically detect and correct paths ending with `/lineages` or pointing to specific lineage directories (e.g., `eukaryota_odb10`) to prevent common errors where BUSCO tries to access incorrect paths. + +### BLAST database path formats + +The `--blastn` parameter accepts two formats: + +1. **Directory path** (for backwards compatibility): + + ```bash + --blastn /path/to/databases/nt_2024_10/ --ntdb_prefix nt + ``` + + A directory must be combined with `--ntdb_prefix` (the database basename without the `.nal` extension) so that the pipeline knows which database in the directory to use. + +2. **Direct file path** (recommended for clarity): + ```bash + --blastn /path/to/databases/nt_2024_10/nt.nal + ``` + This is the most explicit way to select a database when your database directory contains multiple BLAST databases. Note: When you specify a direct `.nal` file path, the pipeline automatically uses the parent directory to ensure all associated database files are available. + +If a directory is given without `--ntdb_prefix`, or no matching `.nal` file exists in it, the pipeline will fail with a clear error message explaining which option to provide. + ### Getting databases ready for the pipeline The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases: @@ -152,17 +189,37 @@ wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ && for file in $NT/*.tar.gz; do tar xf $file -C $NT && rm $file; done + wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" && tar xf taxdb.tar.gz -C $NT && rm taxdb.tar.gz # Compress and cleanup + cd .. 
tar -cvzf $NT_TAR $NT rm -r $NT + +``` + +##### Important: Handling directories with multiple BLAST databases + +If your database directory contains multiple BLAST databases (e.g., both `nt` and `nr` databases), you must either specify the exact path to the `.nal` file or pass `--ntdb_prefix` alongside the directory to avoid ambiguity: + +```bash +# ❌ A bare directory is rejected: the pipeline cannot tell which database to use +--blastn /path/to/databases/ + +# ✅ Specify the exact database file +--blastn /path/to/databases/nt.nal ``` +The pipeline supports two formats for the `--blastn` parameter: + +- **Directory path**: `/path/to/databases/nt_2024_10/` (must be combined with `--ntdb_prefix`, e.g. `--ntdb_prefix nt`) +- **Direct file path**: `/path/to/databases/nt_2024_10/nt.nal` (recommended for directories with multiple databases). Note: When you specify a direct `.nal` file path, the pipeline automatically uses the parent directory to ensure all associated database files are available. + #### 3. UniProt reference proteomes database You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step. @@ -177,7 +234,7 @@ UNIPROT=/path/to/databases/uniprot_${DATE} UNIPROT_TAR=/path/to/databases/uniprot_${DATE}.tar.gz mkdir -p $UNIPROT cd $UNIPROT -``` +```` The UniProt `Refseq_Proteomes_YYYY_MM.tar.gz` file is very large (close to 200 GB) and will take a long time to download. The command below looks complex because it needs to get around the problem of using wildcards with wget and curl. 
diff --git a/nextflow.config b/nextflow.config index 2db7458f..8e322df8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,7 @@ params { busco_lineages = null precomputed_busco = null busco_gene_predictor = null + ntdb_prefix = null // Reference options fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index e83c29b1..8f4f5498 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -108,8 +108,8 @@ "format": "file-path", "exists": true, "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", + "pattern": "^\\S+\\.fa(\\.gz)?$", + "description": "Path to FASTA genome file (name must end in .fa or .fa.gz).", "fa_icon": "far fa-file-code" } } @@ -124,7 +124,8 @@ "busco": { "type": "string", "format": "path", - "description": "Local directory where clade-specific BUSCO lineage datasets are stored", + "description": "Local directory where clade-specific BUSCO lineage datasets are stored. Must be a directory containing the 'lineages/' subdirectory.", + "help_text": "BUSCO databases are always directories, never individual files. Do NOT include '/lineages' at the end of the path or point to specific lineage directories. The pipeline will automatically correct paths ending with '/lineages' or pointing to specific lineages. Example: '/path/to/busco_downloads/' not '/path/to/busco_downloads/lineages/' or '/path/to/busco_downloads/lineages/eukaryota_odb10/'", "fa_icon": "fas fa-folder-open" }, "lineage_tax_ids": { @@ -155,9 +156,16 @@ "type": "string", "format": "path", "exists": true, - "description": "Path to the nucleotide BLAST database", + "description": "Path to the nucleotide BLAST database. Can be either a directory (combined with --ntdb_prefix to select the database) or a direct path to a .nal file.", + "help_text": "For directories with multiple databases, specify the exact .nal file path (or set --ntdb_prefix) to avoid ambiguity. 
When a .nal file is specified, the parent directory is used to ensure all database files are available. Example: '/path/to/databases/nt.nal' instead of '/path/to/databases/'", "fa_icon": "fas fa-file-archive" }, + "ntdb_prefix": { + "type": "string", + "description": "Database basename (prefix) to select when --blastn is a directory. Provide the name without extension, e.g. 'nt' for 'nt.nal'.", + "help_text": "When supplying a directory to --blastn, set --ntdb_prefix to identify the database basename (without the .nal extension). The pipeline will look for a file named after that prefix with the .nal extension (e.g. 'nt.nal' for --ntdb_prefix nt) inside that directory and use it. Example: --blastn /path/to/databases/ --ntdb_prefix nt.", + "fa_icon": "fas fa-tag" + }, "taxdump": { "type": "string", "format": "path", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 32507e01..70a5ceb7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,8 +3,6 @@ // include { samplesheetToList } from 'plugin/nf-schema' - - include { UNTAR } from '../../modules/nf-core/untar/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' @@ -45,6 +43,14 @@ workflow INPUT_CHECK { .map { db_meta, db_path -> if (db_meta.type in ["blastp", "blastx"] && db_path.isDirectory()) { [db_meta, file(db_path.toString() + "/${db_path.name}", checkIfExists: true)] + } else if (db_meta.type == "blastn") { + // Special handling for BLAST nucleotide databases + def (resolved_path, db_name) = validateBlastnDatabase(db_path) + [db_meta, resolved_path] + } else if (db_meta.type == "busco") { + // Special handling for BUSCO databases + def resolved_path = validateBuscoDatabase(db_path) + [db_meta, resolved_path] } else { [db_meta, db_path] } @@ -272,3 +278,180 @@ def get_read_counts ( stats ) { return read_count_meta } + +/* + * Function to validate and resolve BUSCO database paths + * Handles the common user error 
of including '/lineages' at the end of the path
+ *
+ * Behaviour, by the shape of the given path (which must be an existing directory):
+ *   - a directory named 'lineages'            -> its parent directory is returned
+ *   - 'lineages/NAME_odbNN' (e.g. _odb10)     -> the parent of 'lineages' is returned
+ *   - 'NAME_odbNN' whose parent is not called
+ *     'lineages'                              -> the run is aborted with error()
+ *   - any other directory                     -> returned unchanged (content is not inspected)
+ * A path that is not a directory aborts the run with error().
+ *
+ * @param db_path  BUSCO database location (anything accepted by file())
+ * @return         the (possibly corrected) directory path
+ */
+def validateBuscoDatabase(db_path) {
+    def path_file = file(db_path)
+
+    // Anything that is not an existing directory cannot be a BUSCO database.
+    if (!path_file.isDirectory()) {
+        error """
+        ERROR: Invalid BUSCO database path: ${path_file}
+        BUSCO databases must be directories containing the 'lineages/' subdirectory.
+        Please ensure the path points to a valid BUSCO database directory.
+        Common issues:
+        - Path should point to the directory containing 'lineages/' subdirectory
+        - Do NOT include '/lineages' at the end of the path
+        - Do NOT point to a specific lineage directory (e.g., eukaryota_odb10)
+        - BUSCO databases cannot be individual files
+        Example: --busco /path/to/busco_downloads/
+        NOT: --busco /path/to/busco_downloads/lineages/
+        NOT: --busco /path/to/busco_downloads/lineages/eukaryota_odb10/
+        """
+    }
+
+    // Case 1: the path ends with '/lineages' -> step up one level.
+    if (path_file.name == 'lineages' && path_file.parent != null) {
+        def parent_dir = file(path_file.parent)
+        log.info "BUSCO path correction: Detected '/lineages' suffix in path"
+        log.info " Original path: ${path_file}"
+        log.info " Corrected path: ${parent_dir}"
+        log.info "This prevents the common error where BUSCO tries to use '${path_file}/lineages/lineage_name' instead of '${parent_dir}/lineages/lineage_name'"
+        return parent_dir
+    }
+
+    // Case 2: the path points to one lineage directory (e.g. eukaryota_odb10).
+    // Any '_odb' + digits suffix is recognised so that datasets from other
+    // OrthoDB releases (e.g. _odb12) are handled the same way as _odb10.
+    def is_lineage_dir = path_file.name ==~ /.*_odb\d+/
+    if (is_lineage_dir && path_file.parent != null) {
+        def parent_dir = file(path_file.parent)
+        // The lineage must sit directly inside 'lineages'; go up two levels to the root.
+        if (parent_dir.name == 'lineages' && parent_dir.parent != null) {
+            def busco_root = file(parent_dir.parent)
+            log.info "BUSCO path correction: Detected specific lineage directory in path"
+            log.info " Original path: ${path_file} (specific lineage: ${path_file.name})"
+            log.info " Corrected path: ${busco_root}"
+            log.info "This prevents the error where BUSCO tries to use a specific lineage directory instead of the root BUSCO database directory"
+            log.warn "Use `--busco_lineages ${path_file.name}` to control the lineage"
+            return busco_root
+        }
+        error """
+        ERROR: Invalid BUSCO lineage directory structure: ${path_file}
+        It appears you're pointing to a specific BUSCO lineage directory (${path_file.name}),
+        but the expected directory structure is:
+        /path/to/busco_downloads/lineages/${path_file.name}/
+        Please provide the path to the root BUSCO database directory.
+        Example: --busco /path/to/busco_downloads/
+        """
+    }
+
+    // Path looks correct, return as-is
+    log.info "Using BUSCO database path: ${path_file}"
+    return path_file
+}
+
+/*
+ * Helper for validateBlastnDatabase: build a directory that exposes exactly one
+ * BLAST database.
+ *
+ * A new directory '.btk_isolated_DBNAME_UUID' is created under the JVM's current
+ * directory (system property 'user.dir') and filled with symbolic links to every
+ * file of source_dir whose name starts with 'DBNAME.' plus the shared taxonomy
+ * files (taxdb.btd, taxdb.bti, taxonomy4blast.sqlite3) when present.
+ * Nothing is written into source_dir itself.
+ *
+ * @param source_dir  directory holding the BLAST database files
+ * @param db_name     database basename, i.e. the .nal file name without extension
+ * @return            path of the newly created directory of links
+ */
+def isolateBlastnDatabase(source_dir, db_name) {
+    // The UUID keeps the directory name unique between invocations.
+    def uuid = java.util.UUID.randomUUID().toString()
+    def isolated_dir = file("${System.getProperty('user.dir')}/.btk_isolated_${db_name}_${uuid}")
+    if (!isolated_dir.exists()) {
+        isolated_dir.mkdirs()
+    }
+
+    // Files belonging to the chosen database, plus taxonomy files shared by all databases
+    def shared_taxonomy_files = ['taxdb.btd', 'taxdb.bti', 'taxonomy4blast.sqlite3']
+    def db_files = source_dir.listFiles().findAll { candidate ->
+        candidate.name.startsWith("${db_name}.") || candidate.name in shared_taxonomy_files
+    }
+
+    db_files.each { source_file ->
+        def link_file = file("${isolated_dir}/${source_file.name}")
+        if (!link_file.exists()) {
+            // Symbolic link created through the JDK API.
+            // NOTE(review): the previous `link_file.createLink(source_file)` is not a
+            // java.nio.file.Path method - confirm no Nextflow extension was relied upon.
+            java.nio.file.Files.createSymbolicLink(link_file, source_file)
+        }
+    }
+
+    return isolated_dir
+}
+
+/*
+ * Function to validate and resolve BLAST nucleotide database paths
+ *
+ * Two input forms are handled:
+ *   - a direct path to a .nal file: the database name is the file name without '.nal'
+ *   - a directory: the database name must be supplied through params.ntdb_prefix
+ *     and 'PREFIX.nal' must exist directly inside the directory
+ * In both cases the selected database is exposed through a fresh directory of
+ * symbolic links (see isolateBlastnDatabase). Every invalid input aborts the run
+ * with error().
+ *
+ * @param db_path  value given to --blastn (anything accepted by file())
+ * @return         two-element list: [directory of links, database name]
+ */
+def validateBlastnDatabase(db_path) {
+    def path_file = file(db_path)
+
+    // Report missing paths explicitly. (Previously this check sat inside the
+    // isFile() branch, where it could never be true.)
+    if (!path_file.exists()) {
+        error """
+        ERROR: BLAST database path not found: ${path_file}
+        Please check that the path is correct and that it exists.
+        """
+    }
+
+    if (path_file.isFile()) {
+        // Direct file provided - it must be the .nal alias file of the database
+        if (!path_file.name.endsWith('.nal')) {
+            error """
+            ERROR: Invalid BLAST database file: ${path_file}
+            The file must have a .nal extension.
+            Please provide either:
+            - A directory containing the database, together with --ntdb_prefix
+            - The direct path to a .nal file
+            Example: --blastn /path/to/databases/nt.nal
+            """
+        }
+
+        def db_name = path_file.name.replaceAll('\\.nal$', '')
+        def isolated_dir = isolateBlastnDatabase(file(path_file.parent), db_name)
+
+        log.info "Direct BLAST database file specified: ${path_file}"
+        log.info "Database name: ${db_name}"
+        log.info "Created isolated directory: ${isolated_dir}"
+        log.info "This ensures only the specified database is available to BLAST"
+        return [isolated_dir, db_name]
+    }
+
+    if (path_file.isDirectory()) {
+        // Directory provided - require the user to specify the database prefix
+        log.info "BLAST database directory provided: ${path_file}"
+        def prefix = (this.binding.hasVariable('params') && params.containsKey('ntdb_prefix')) ? params.ntdb_prefix : null
+        if (!prefix) {
+            error """
+            ERROR: A BLAST database directory was provided (${path_file}) but no database prefix was supplied.
+            The pipeline requires you to select which database inside the directory to use.
+            Please provide the database prefix (basename without extension) using --ntdb_prefix.
+            Example: --blastn ${path_file} --ntdb_prefix nt (will select ${path_file}/nt.nal)
+            """
+        }
+
+        // Look for the requested .nal file directly inside the directory
+        def expected_name = "${prefix}.nal".toString()
+        def expected_file = path_file.listFiles().find { it.name == expected_name }
+        if (!expected_file) {
+            error """
+            ERROR: Requested BLAST database prefix '${prefix}' not found in ${path_file}
+            Expected file: ${path_file}/${expected_name}
+            Please ensure the prefix passed with --ntdb_prefix matches a .nal file in the directory.
+            """
+        }
+
+        def db_name = expected_file.name.replaceAll('\\.nal$', '')
+        def isolated_dir = isolateBlastnDatabase(file(expected_file.parent), db_name)
+
+        log.info "Using BLAST database '${db_name}' from directory: ${path_file}"
+        log.info "Created isolated directory: ${isolated_dir}"
+        return [isolated_dir, db_name]
+    }
+
+    // The path exists but is neither a regular file nor a directory
+    error """
+    ERROR: Invalid database path: ${path_file}
+    The path must point to either:
+    - A directory containing the database (used together with --ntdb_prefix)
+    - A direct path to a .nal file
+    Example: --blastn /path/to/databases/nt.nal
+    """
+}