sanger-tol · yumisims · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 23, 2025
diff --git a/docs/usage.md b/docs/usage.md
@@ -101,6 +101,43 @@ For instance:
 --busco path-to-databases/busco/ --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10
 ```
 
+### BUSCO database path format
+
+**Important**: The `--busco` parameter must point to the directory that contains the `lineages/` subdirectory, **NOT** to the `lineages/` directory itself. BUSCO databases are always directories, never individual files.
+
+```bash
+# ✅ Correct - points to the parent directory
+--busco /path/to/busco_downloads/
+
+# ❌ Common mistake - includes /lineages at the end
+--busco /path/to/busco_downloads/lineages/
+
+# ❌ Another common mistake - points to a specific lineage
+--busco /path/to/busco_downloads/lineages/eukaryota_odb10/
+```
+
+The pipeline will automatically detect and correct paths ending with `/lineages` or pointing to specific lineage directories (e.g., `eukaryota_odb10`) to prevent common errors where BUSCO tries to access incorrect paths.
+
+### BLAST database path formats
+
+The `--blastn` parameter accepts two formats:
+
+1. **Directory path** (for backwards compatibility):
+
+   ```bash
+   --blastn /path/to/databases/nt_2024_10/
+   ```
+
+   This works only if the directory contains a single BLAST database.
+
+2. **Direct file path** (recommended for clarity):
+   ```bash
+   --blastn /path/to/databases/nt_2024_10/nt.nal
+   ```
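+   This is required if your database directory contains multiple BLAST databases. Note: When you specify a direct `.nal` file path, the pipeline creates a hidden `.btk_isolated_<name>/` directory next to that file, containing symbolic links to the files of that database (and the shared taxonomy files), so that only the selected database is visible to BLAST. The database directory must therefore be writable.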
+
+If multiple databases are found in a directory, the pipeline will fail with a clear error message listing all available databases and suggesting the exact file paths to use.
+
 ### Getting databases ready for the pipeline
 
 The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases:
@@ -152,17 +189,37 @@ wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ &&
 for file in $NT/*.tar.gz; do
     tar xf $file -C $NT && rm $file;
 done
+
 
 wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" &&
 tar xf taxdb.tar.gz -C $NT &&
 rm taxdb.tar.gz
 
 # Compress and cleanup
+
 cd ..
 tar -cvzf $NT_TAR $NT
 rm -r $NT
+
+```
+
+##### Important: Handling directories with multiple BLAST databases
+
+If your database directory contains multiple BLAST databases (e.g., both `nt` and `nr` databases), you must specify the exact path to the `.nal` file to avoid ambiguity:
+
+```bash
+# ❌ This will fail if multiple databases are present
+--blastn /path/to/databases/
+
+# ✅ Specify the exact database file
+--blastn /path/to/databases/nt.nal
 ```
 
+The pipeline supports two formats for the `--blastn` parameter:
+
+- **Directory path**: `/path/to/databases/nt_2024_10/` (only works if directory contains a single BLAST database)
+- **Direct file path**: `/path/to/databases/nt_2024_10/nt.nal` (recommended for directories with multiple databases). Note: When you specify a direct `.nal` file path, the pipeline creates a hidden `.btk_isolated_<name>/` directory next to that file, containing symbolic links to the files of that database (and the shared taxonomy files), so that only the selected database is visible to BLAST.
+
 #### 3. UniProt reference proteomes database
 
 You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step.
@@ -177,7 +234,7 @@ UNIPROT=/path/to/databases/uniprot_${DATE}
 UNIPROT_TAR=/path/to/databases/uniprot_${DATE}.tar.gz
 mkdir -p $UNIPROT
 cd $UNIPROT
-```
+````
 
 The UniProt `Refseq_Proteomes_YYYY_MM.tar.gz` file is very large (close to 200 GB) and will take a long time to download.
 The command below looks complex because it needs to get around the problem of using wildcards with wget and curl.

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -124,7 +124,8 @@
                 "busco": {
                     "type": "string",
                     "format": "path",
-                    "description": "Local directory where clade-specific BUSCO lineage datasets are stored",
+                    "description": "Local directory where clade-specific BUSCO lineage datasets are stored. Must be a directory containing the 'lineages/' subdirectory.",
+                    "help_text": "BUSCO databases are always directories, never individual files. Do NOT include '/lineages' at the end of the path or point to specific lineage directories. The pipeline will automatically correct paths ending with '/lineages' or pointing to specific lineages. Example: '/path/to/busco_downloads/' not '/path/to/busco_downloads/lineages/' or '/path/to/busco_downloads/lineages/eukaryota_odb10/'",
                     "fa_icon": "fas fa-folder-open"
                 },
                 "lineage_tax_ids": {
@@ -155,7 +156,8 @@
                     "type": "string",
                     "format": "path",
                     "exists": true,
-                    "description": "Path to the nucleotide BLAST database",
+                    "description": "Path to the nucleotide BLAST database. Can be either a directory containing a single database or a direct path to a .nal file.",
+                    "help_text": "For directories with multiple databases, specify the exact .nal file path to avoid ambiguity. When a .nal file is specified, the pipeline creates a '.btk_isolated_<name>' directory of symbolic links next to it so that only that database is visible to BLAST. Example: '/path/to/databases/nt.nal' instead of '/path/to/databases/'",
                     "fa_icon": "fas fa-file-archive"
                 },
                 "taxdump": {

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -3,8 +3,6 @@
 //
 
 include { samplesheetToList         } from 'plugin/nf-schema'
-
-
 include { UNTAR                     } from '../../modules/nf-core/untar/main'
 include { CAT_CAT                   } from '../../modules/nf-core/cat/cat/main'
 include { SAMTOOLS_FLAGSTAT         } from '../../modules/nf-core/samtools/flagstat/main'
@@ -45,6 +43,14 @@ workflow INPUT_CHECK {
         .map { db_meta, db_path ->
             if (db_meta.type in ["blastp", "blastx"] && db_path.isDirectory()) {
                 [db_meta, file(db_path.toString() + "/${db_path.name}", checkIfExists: true)]
+            } else if (db_meta.type == "blastn") {
+                // Special handling for BLAST nucleotide databases
+                def (resolved_path, db_name) = validateBlastnDatabase(db_path)
+                [db_meta, resolved_path]
+            } else if (db_meta.type == "busco") {
+                // Special handling for BUSCO databases
+                def resolved_path = validateBuscoDatabase(db_path)
+                [db_meta, resolved_path]
             } else {
                 [db_meta, db_path]
             }
@@ -272,3 +278,171 @@ def get_read_counts ( stats ) {
 
     return read_count_meta
 }
+
+/*
+ * Validate the `--busco` path and return the BUSCO root directory (the one holding `lineages/`).
+ * - Path ends in `/lineages`: its parent is returned (logged as a correction).
+ * - Path is `<root>/lineages/<name>_odb<N>` (any release: odb10, odb12, ...): `<root>` is returned.
+ * - A `*_odb<N>` directory outside a `<root>/lineages/` layout, or a non-directory path: aborts via `error`.
+ * - Any other directory is returned unchanged.
+ * @param db_path  path to check (resolved with `file()`); @return the directory BUSCO should be given
+ */
+def validateBuscoDatabase(db_path) {
+    def path_file = file(db_path)
+    if (path_file.isDirectory()) {
+        if (path_file.name == 'lineages' && path_file.parent != null) {  // user appended '/lineages'
+            def parent_dir = file(path_file.parent)
+            log.info "BUSCO path correction: Detected '/lineages' suffix in path"
+            log.info "  Original path: ${path_file}"
+            log.info "  Corrected path: ${parent_dir}"
+            log.info "This prevents the common error where BUSCO tries to use '${path_file}/lineages/lineage_name' instead of '${parent_dir}/lineages/lineage_name'"
+            return parent_dir
+        }
+        else if (path_file.name ==~ /.+_odb\d+/ && path_file.parent != null) {  // a specific lineage, any odb release
+            def parent_dir = file(path_file.parent)
+            if (parent_dir.name == 'lineages' && parent_dir.parent != null) {  // root is two levels up
+                def busco_root = file(parent_dir.parent)
+                log.info "BUSCO path correction: Detected specific lineage directory in path"
+                log.info "  Original path: ${path_file} (specific lineage: ${path_file.name})"
+                log.info "  Corrected path: ${busco_root}"
+                log.info "This prevents the error where BUSCO tries to use a specific lineage directory instead of the root BUSCO database directory"
+                log.warn "Use `--busco_lineages ${path_file.name}` to control the lineage"
+                return busco_root
+            } else {
+                error """
+                ERROR: Invalid BUSCO lineage directory structure: ${path_file}
+                It appears you're pointing to a specific BUSCO lineage directory (${path_file.name}),
+                but the expected directory structure is:
+                /path/to/busco_downloads/lineages/${path_file.name}/
+                Please provide the path to the root BUSCO database directory.
+                Example: --busco /path/to/busco_downloads/
+                """
+            }
+        } else {
+            log.info "Using BUSCO database path: ${path_file}"
+            return path_file
+        }
+    } else {
+        error """
+        ERROR: Invalid BUSCO database path: ${path_file}
+        BUSCO databases must be directories containing the 'lineages/' subdirectory.
+        Please ensure the path points to a valid BUSCO database directory.
+        Common issues:
+        - Path should point to the directory containing 'lineages/' subdirectory
+        - Do NOT include '/lineages' at the end of the path
+        - Do NOT point to a specific lineage directory (e.g., eukaryota_odb10)
+        - BUSCO databases cannot be individual files
+        Example: --busco /path/to/busco_downloads/
+        NOT: --busco /path/to/busco_downloads/lineages/
+        NOT: --busco /path/to/busco_downloads/lineages/eukaryota_odb10/
+        """
+    }
+}
+
+/*
+ * Validate the `--blastn` path and return `[directory, db_name]` for the database to use.
+ * - File ending in `.nal` or `.nin`: builds `<parent>/.btk_isolated_<db_name>/` holding symlinks to
+ *   that database's files and the shared taxonomy files, and returns `[that directory, db_name]`.
+ * - Directory with exactly one `.nal` (or, failing that, one `.nin`): returns `[directory, null]`.
+ * - Anything else (other file types, several or no databases, missing path): aborts via `error`.
+ * @param db_path  path to check (resolved with `file()`)
+ */
+def validateBlastnDatabase(db_path) {
+    def path_file = file(db_path)
+    if (path_file.isFile()) {
+        // isFile() already implies the file exists, so only the extension needs checking
+        if (path_file.name ==~ /.+\.(nal|nin)/) {
+            def parent_dir = file(path_file.parent)
+            def db_name = path_file.name.replaceAll('\\.(nal|nin)$', '')
+
+            // Directory (created inside the database directory) that exposes only the selected database
+            def temp_dir = file("${parent_dir}/.btk_isolated_${db_name}")
+            if (!temp_dir.exists() && !temp_dir.mkdirs()) {
+                error "ERROR: Could not create isolated BLAST database directory: ${temp_dir} (is ${parent_dir} writable?)"
+            }
+
+            // Find all files belonging to this specific database, plus the shared taxonomy files
+            def db_files = parent_dir.listFiles().findAll {
+                it.name.startsWith("${db_name}.") ||
+                it.name in ['taxdb.btd', 'taxdb.bti', 'taxonomy4blast.sqlite3']
+            }
+
+            // Create symbolic links in the isolated directory
+            db_files.each { source_file ->
+                def link_file = file("${temp_dir}/${source_file.name}")
+                if (!link_file.exists()) {
+                    // exists() follows links, so a dangling link from an earlier run must be removed first
+                    java.nio.file.Files.deleteIfExists(link_file)
+                    java.nio.file.Files.createSymbolicLink(link_file, source_file)
+                }
+            }
+
+            log.info "Direct BLAST database file specified: ${path_file}"
+            log.info "Database name: ${db_name}"
+            log.info "Created isolated directory: ${temp_dir}"
+            log.info "This ensures only the specified database is available to BLAST"
+            return [temp_dir, db_name]
+        } else {
+            error """
+            ERROR: Invalid BLAST database file: ${path_file}
+            The file must have a .nal (or .nin) extension.
+            Please provide either:
+                - A directory containing a single BLAST database
+                - The direct path to a .nal file
+            Example: --blastn /path/to/databases/nt.nal
+            """
+        }
+    } else if (path_file.isDirectory()) {
+        // Directory provided - search for database files
+        log.info "Searching for BLAST database files in directory: ${path_file}"
+        // Look for .nal files
+        def nal_files = path_file.listFiles().findAll { it.name.endsWith('.nal') }
+        if (nal_files.size() == 1) {
+            log.info "Found single BLAST database: ${nal_files[0].name}"
+            return [path_file, null]  // Return directory with no specific db name
+        } else if (nal_files.size() > 1) {
+            def db_names = nal_files.collect { it.name }.join('\n  - ')
+            error """
+            ERROR: Multiple BLAST databases found in ${path_file}:
+                - ${db_names}
+            Please specify the exact path to the .nal file you want to use.
+            Examples:
+                --blastn ${path_file}/${nal_files[0].name}
+                --blastn ${path_file}/${nal_files[1].name}
+            """
+        } else {
+            // Look for .nin files as fallback
+            def nin_files = path_file.listFiles().findAll { it.name.endsWith('.nin') }
+            if (nin_files.size() == 1) {
+                log.info "Found single BLAST database: ${nin_files[0].name}"
+                return [path_file, null]  // Return directory with no specific db name
+            } else if (nin_files.size() > 1) {
+                def db_names = nin_files.collect { it.name }.join('\n  - ')
+                error """
+                ERROR: Multiple BLAST databases found in ${path_file}:
+                    - ${db_names}
+                Please specify the exact path to the .nin file you want to use.
+                Examples:
+                    --blastn ${path_file}/${nin_files[0].name}
+                    --blastn ${path_file}/${nin_files[1].name}
+                """
+            } else {
+                error """
+                ERROR: No BLAST database files (.nal or .nin) found in ${path_file}
+                Please ensure the directory contains a valid BLAST database or
+                specify the direct path to a .nal file.
+                Example: --blastn /path/to/databases/nt.nal
+                """
+            }
+        }
+    } else {
+        error """
+        ERROR: Invalid database path: ${path_file}
+        The path must point to either:
+            - A directory containing a single BLAST database
+            - A direct path to a .nal file
+        Example: --blastn /path/to/databases/nt.nal
+        """
+    }
+}