diff --git a/nextflow/amr-on-wb/.gitignore b/nextflow/amr-on-wb/.gitignore new file mode 100644 index 0000000..7cc7383 --- /dev/null +++ b/nextflow/amr-on-wb/.gitignore @@ -0,0 +1,11 @@ +.nextflow/ +work/ +.nextflow.log* + +bin/AMRplusplus_SNP/ +test_results/ + +.ipynb_checkpoints + +scripts/config/*.env +!scripts/config/*.env.template diff --git a/nextflow/amr-on-wb/README.md b/nextflow/amr-on-wb/README.md index ac263f0..da1e4fd 100755 --- a/nextflow/amr-on-wb/README.md +++ b/nextflow/amr-on-wb/README.md @@ -1,157 +1,208 @@ -Overview --------- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Nextflow](https://img.shields.io/badge/Nextflow-%E2%89%A50.25.1-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/Nextflow-v24-brightgreen.svg)](https://www.nextflow.io/) +# AMR++ Bioinformatic Pipeline -# AMR++ bioinformatic pipeline -(https://megares.meglab.org/) +AMR++ is a bioinformatic pipeline for analyzing raw sequencing reads to characterize the profile of antimicrobial resistance genes, or resistome. Developed to work with the [MEGARes database](https://megares.meglab.org/), it contains sequence data for approximately 9,000 hand-curated antimicrobial resistance genes with an annotation structure optimized for high-throughput sequencing and metagenomic analysis. -AMR++ is a bioinformatic pipeline meant to aid in the analysis of raw sequencing reads to characterize the profile of antimicrobial resistance genes, or resistome. AMR++ was developed to work in conjuction with the the MEGARes database which contains sequence data for approximately 9,000 hand-curated antimicrobial resistance genes accompanied by an annotation structure that is optimized for use with high throughput sequencing and metagenomic analysis. The acyclical annotation graph of MEGARes allows for accurate, count-based, hierarchical statistical analysis of resistance at the population level, much like microbiome analysis, and is also designed to be used as a training database for the creation of statistical classifiers. +**This repository demonstrates running AMR++ on Verily Workbench with orchestration through Workbench and compute on Google Batch.** It is adapted from the [original AMR++ pipeline](https://github.com/Microbial-Ecology-Group/AMRplusplus) with simplified scripts for cloud execution. -The goal of many metagenomics studies is to characterize the content and relative abundance of sequences of interest from the DNA of a given sample or set of samples. You may want to know what is contained within your sample or how abundant a given sequence is relative to another. +Additional environments are provided for testing and debugging: +- **Local** - For quick testing and development +- **GCP** - For debugging Google Batch jobs with visible logs (Workbench currently has permissions issues showing Batch logs) -Often, metagenomics is performed when the answer to these questions must be obtained for a large number of targets where techniques like multiplex PCR and other targeted methods would be too cumbersome to perform. AMR++ can process the raw data from the sequencer, identify the fragments of DNA, and count them. It also provides a count of the polymorphisms that occur in each DNA fragment with respect to the reference database. +## Dependencies -Additionally, you may want to know if the depth of your sequencing (how many reads you obtain that are on target) is high enough to identify rare organisms (organisms with low abundance relative to others) in your population. This is referred to as rarefaction and is calculated by randomly subsampling your sequence data at intervals between 0% and 100% in order to determine how many targets are found at each depth. +### Required for Workbench Deployment +- **Verily Workbench CLI** (`wb`) - Workbench command-line tool +- **Google Cloud SDK** (`gcloud`) - GCP command-line tool +- **Docker** - For building and pushing container images (must be running) +- **Nextflow v24** - Workflow orchestration (installed in Workbench app) + - **Note**: v25 has breaking changes and is not compatible with this pipeline -With AMR++, you will obtain alignment count files for each sample that are combined into a count matrix that can be analyzed using any statistical and mathematical techniques that can operate on a matrix of observations. +**Important Notes**: +- Ensure Docker is running on your local machine before executing `./scripts/build.sh --env wb --push` +- You must be an **ADMIN** of the Workbench workspace where this pipeline will run -More Information ----------------- +### Optional Dependencies (for testing/debugging environments) +- **Local**: Docker, Conda +- **GCP**: `gcloud`, Docker -- [Installation](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/installation.md) -- [Usage](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/usage.md) -- [Configuration](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/configuration.md) -- [Output](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/output.md) -- [Dependencies](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/dependencies.md) -- [Software Requirements](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/requirements.md) -- [FAQs](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/FAQs.md) -- [Details on AMR++ updates](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/update_details.md) -- [Contact](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/contact.md) +## AMR++ on Verily Workbench + +**Prerequisites**: +- You must create a Workbench workspace where you have **ADMIN** permissions +- All setup and execution must be done within this workspace + +### Quick Start: Workbench Orchestration with Google Batch + +This guide walks through setting up and running AMR++ with Workbench orchestration and Google Batch compute. The setup is split between local commands (for infrastructure) and Workbench app commands (for execution). + +#### Step 1: Create Workspace and App +Create a new workspace and app in the Workbench UI (or use the CLI if preferred). -## AMR++ demonstration +#### Step 2: Local Setup -If anaconda is already installed, we'll just need to download the AMR++ github repository and create the AMR++ conda environment. Please review the [installation document](docs/installation.md) for alternative methods to install AMR++ in your computing environment. +Run these commands on your **local machine**: ```bash -# Confirm conda works -conda -h +# Set your active workspace (replace with your workspace ID) +wb workspace set --id=your-workspace-id + +# Copy the Workbench environment template +cp scripts/config/wb.env.template scripts/config/wb.env ``` -Clone the AMR++ repository. +Edit `scripts/config/wb.env` and set the user-defined variables: +- `GCS_BUCKET`: Your Workbench GCS bucket resource ID (e.g., `nf-output`) +- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repo (e.g., `nextflow-containers`) +- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) + +**Notes**: +- Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations +- **Future Improvement**: Consider using separate buckets for input data and Nextflow output to better organize resources + +Then run: ```bash -git clone https://github.com/Microbial-Ecology-Group/AMRplusplus.git +# Set up infrastructure (creates buckets, service accounts, etc.) +./scripts/setup_infra.sh wb + +# Upload input data to GCS +./scripts/upload_data.sh wb + +# Build Docker image and push to Artifact Registry +# NOTE: Docker must be running before executing this command +./scripts/build.sh --env wb --push ``` -Navigate into the AMR++ repository and run the test command. -```bash -cd AMRplusplus +#### Step 3: Workbench App Setup -# Now we can use the included recipe to make the AMR++ environment -conda env create -f envs/AMR++_env.yaml -# This can take 5-10 mins (or more) depending on your internet speed, computing resources, etc. +Open your Workbench app, launch the Terminal, and run: -# Once it's completed, activate the environment -conda activate AMR++_env.yaml +```bash +# Clone the repository (adjust branch as needed) +cd repos/ && git clone -b samh/amr-dev https://github.com/verily-src/workbench-examples.git && cd workbench-examples/nextflow/amr-on-wb/ -# You now have access to all the AMR++ software dependencies (locally) -samtools --help +# Copy the environment template +cp scripts/config/wb.env.template scripts/config/wb.env +``` -# Run command to perform the demonstration pipeline using the conda profile. -nextflow run main_AMR++.nf +Now copy your local `wb.env` configuration into the Workbench app. +#### Step 4: Run the Pipeline +```bash +./scripts/run.sh --env wb ``` -Now, you can check out the results in the newly created "test_results" directory. -# Using AMR++ to analyze your data +Results will be stored in your configured GCS bucket. + +**Known Issues**: +- The `gcloud storage cp` command may not correctly resolve Workbench resource names to full `gs://` paths when running `upload_data.sh` or `run.sh`. If you encounter path resolution issues, manually specify the full GCS bucket path in your `wb.env` configuration. -AMR++ is customizable to suit your computing needs and analyze your data. Primarily, the ```-profile``` paramater allows you to choose between running AMR++ using a singularity container, docker container, anaconda packages, or a local installation of your software. -All parameters used to control how AMR++ analyzes your data can also be changed as needed in a variety of ways. For full information, review this [configuration document.](docs/configuration.md) +--- +### Alternative: Quick Demo in Workbench JupyterLab (Workbench Execution) -Below is a brief example, the default parameters were run using this command (with the conda environment, AMR++_env, already activated): +For a simple demonstration without Google Batch (both orchestration and execution running in the same Workbench app): -```nextflow run main_AMR++.nf``` +Create a new Workbench workspace and add this git repository in the **Apps** tab. + +Create a JupyterLab app instance, launch it, and open the terminal: -To change the reads that were analyzed, you should specify the ```--reads`` parameters. Here, we can use regular expressions to point to your samples in a different directory. ```bash -nextflow run main_AMR++.nf --reads "path/to/your/reads/*_R{1,2}.fastq.gz" +# Initialize conda +conda init +source ~/.bashrc + +# Navigate to the repository +cd repos/AMRplusplus + +# Create and activate the conda environment +conda env create -f envs/AMR++_env.yaml +conda activate AMR++_env + +# Verify Nextflow version 24 is installed +nextflow -v + +# Run the test pipeline (takes ~5 minutes) +nextflow run main_AMR++.nf ``` -#### [Here's an extended tutorial to run each AMR++ component individually](docs/Step_by_step_tutorial.md) +Expected output: **Succeeded: 24** with results in `~/repos/AMRplusplus/test_results` +--- +## Supporting Environments -# Optional flags +The following environments are provided for testing and debugging purposes. -## SNP verification +### Local Environment (Testing) -AMR++ now works in conjuction with a [custom SNP verification software](https://github.com/Isabella136/AmrPlusPlus_SNP) to evaluate alignments to gene accessions requiring SNP confirmation to confer resistance. To include this workflow, include the ```--snp Y``` flag in your command like this: +**Purpose**: Quick testing and development on small datasets +**Setup**: ```bash -nextflow run main_AMR++.nf -profile conda --snp Y +cp scripts/config/local.env.template scripts/config/local.env +# Edit local.env and set IMAGE_NAME +./scripts/build.sh +./scripts/run.sh ``` -This will create with the standard count table (AMR_analytic_matrix.csv) in addition to a count matrix with SNP confirmed counts (SNPconfirmed_AMR_analytic_matrix.csv). -## Deduplicated counts +**Requirements**: Docker, Conda -Another option is to include results for deduplicated counts by using the ```--deduped Y``` flag in your command. +### GCP Environment (Debugging) +**Purpose**: Debug Google Batch jobs with visible logs (workaround for Workbench permissions issues) + +**Setup**: ```bash -nextflow run main_AMR++.nf -profile conda --snp Y --deduped Y +cp scripts/config/gcp.env.template scripts/config/gcp.env +# Edit gcp.env and set GCS_BUCKET, GOOGLE_ARTIFACT_REPO +./scripts/setup_infra.sh gcp +./scripts/upload_data.sh gcp +./scripts/build.sh --env gcp --push +./scripts/run.sh --env gcp ``` -With this flag, AMR++ will extract the deduplicated alignments to MEGARes also output a count matrix with deduplicated counts. Since also we included the ```--snp Y``` flag, we will end up with 4 total output count matrices. +**Requirements**: `gcloud` CLI, Docker -# Choosing the right pipeline +**Configuration** (`gcp.env`): +- `GCS_BUCKET`: Your GCS bucket name (without `gs://` prefix) + - Example: `my-nextflow-data` +- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repository name + - Example: `nextflow-containers` +- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) -AMR++ analyzes data by combining workflows that takes a set of sequencing reads through various bioinformatic software. We recommend our standard AMR++ pipeline as a comprehensive way to start from raw sequencing reads, QC assessment, host DNA removal, and resistome analysis with MEGARes. However, users might only want to replicate portions of the pipeline and have more control over their computing needs. Using the ```--pipeline``` parameter, users can now change how AMR++ runs. +## Configuration +### Resource Scaling on Google Batch +Google Batch does NOT automatically scale machine types based on CPU/memory requests. This repository includes automatic resource scaling configuration in `config/google_batch_resources.config`. -## Pipeline workflows -* omitting the ```--pipeline``` flag or using ```--pipeline demo``` - * Simple demonstration on test data +**Key points**: +- Each process that needs more than default resources must explicitly specify a matching `machineType` +- Processes now use `task.cpus` for dynamic thread allocation +- Configure resources using process labels or names -* ```--pipeline standard_AMR``` - * Steps: QC trimming > Host DNA removal > Resistome alignment > Resistome results +See [docs/google-batch-resource-scaling.md](docs/google-batch-resource-scaling.md) for detailed guidance. -* ```--pipeline fast_AMR``` - * This workflow simply skips host removal to speed up analysis. - * Steps: QC trimming > Resistome alignment > Resistome results +## Additional Resources -* ```--pipeline standard_AMR_wKraken``` - * This workflow adds microbiome analysis with kraken. It requires having a local kraken database. The minikraken_8GB_202003 will be downloaded automatically and requires ~8GB of space. Otherwise, you can specify the location to your own database with the flag, ```--kraken_db "/Path/to/KrakenDb/"``` - * Steps: - * QC trimming > Host DNA removal > Resistome alignment > Resistome results - * Non-host reads > Microbiome analysis +**Original AMR++ Repository**: [https://github.com/Microbial-Ecology-Group/AMRplusplus](https://github.com/Microbial-Ecology-Group/AMRplusplus) -## Pipeline subworkflows -* ```--pipeline eval_qc``` - * Evaluate sample QC -* ```--pipeline trim_qc``` - * QC trimming using trimmomatic -* ```--pipeline rm_host``` - * Align reads to host DNA using bwa and remove contaminants -* ```--pipeline resistome``` - * Align reads to MEGARes using bwa, perform rarefaction and resistome analysis -* ```--pipeline kraken``` - * Classify reads taxonomically using kraken. -* ```--pipeline bam_resistome``` - * This will run the resistome pipeline starting with bam files from a previous alignment to MEGARes. - * Need to include ```--bam_files "Path/to/BAM/*.bam"``` in the command line. +For more detailed information about the AMR++ pipeline: -## Example command -In the following example, we'll choose to run the standard AMR++ workflow, which includes QC trimming, host removal, and Resistome analysis. Since we included the ```--snp Y --deduped Y``` flags, we'll also get ouput for deduped counts and SNP confirmed counts. +- [Installation](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/installation.md) +- [Usage](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/usage.md) +- [Configuration](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/configuration.md) +- [Output](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/output.md) +- [Dependencies](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/dependencies.md) +- [FAQs](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/FAQs.md) -Alternatively, you can modify all of these variables and more in the "params.config" file which will be loaded automatically. Just make sure to include the "-profile" and "--pipeline" flags. More information [in this document](docs/configuration.md) +## License -```bash -# Remember to update the --reads flag to match your read location -nextflow run main_AMR++.nf -profile conda --pipeline standard_AMR --reads "path/to/your/reads/*_R{1,2}.fastq.gz" --snp Y --deduped Y -``` +MIT License diff --git a/nextflow/amr-on-wb/config/google_batch_resources.config b/nextflow/amr-on-wb/config/google_batch_resources.config new file mode 100644 index 0000000..833bdb9 --- /dev/null +++ b/nextflow/amr-on-wb/config/google_batch_resources.config @@ -0,0 +1,38 @@ +/* + * Google Batch Resource Configuration + * + * This config file defines process-specific resource requirements + * and corresponding machine types for Google Batch execution. + * + * Google Batch does NOT automatically scale machine types based on + * CPU/memory requests. You must explicitly set machineType for each + * process that needs more than the default resources. + */ + +process { + // Default for all processes + cpus = 4 + memory = '16 GB' + machineType = 'n2-standard-4' + + // Alignment processes - typically need more resources + withLabel: alignment { + cpus = 8 + memory = '32 GB' + machineType = 'n2-standard-8' + } + + // High-memory processes (if needed) + withName: 'bwa_align' { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' + } + + // Example: Even higher resource process + // withName: 'some_process' { + // cpus = 32 + // memory = '128 GB' + // machineType = 'n2-highmem-32' + // } +} diff --git a/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md b/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md new file mode 100644 index 0000000..9823c95 --- /dev/null +++ b/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md @@ -0,0 +1,176 @@ +# Google Batch Resource Scaling Guide + +## Problem + +When running Nextflow on Google Batch, processes that request more CPUs or memory than the default `machineType` will fail because: + +1. **Google Batch does NOT automatically scale machine types** based on CPU/memory requests +2. Setting `process.cpus` or `process.memory` alone is not enough +3. Each process that needs more resources must explicitly specify a matching `machineType` + +## Example Error + +If your default is `n2-standard-8` (8 CPUs) and a process requests 32 CPUs: + +```groovy +withName: 'bwa_align' { + cpus = 32 + memory = "128 GB" +} +``` + +The process will crash trying to run 32 threads on an 8-CPU machine. + +## Solution + +This repository now includes **automatic resource scaling** via `config/google_batch_resources.config`. + +### How It Works + +The config file maps processes to appropriate machine types: + +```groovy +process { + // Default for all processes + cpus = 4 + memory = '16 GB' + machineType = 'n2-standard-4' + + // Label-based configuration + withLabel: alignment { + cpus = 8 + memory = '32 GB' + machineType = 'n2-standard-8' + } + + // Process-specific configuration + withName: 'bwa_align' { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' + } +} +``` + +### Machine Type Selection Guide + +Choose machine types based on your process requirements: + +| vCPUs | Standard (4GB/vCPU) | High-Mem (8GB/vCPU) | High-CPU (0.9GB/vCPU) | +|-------|---------------------|---------------------|------------------------| +| 2 | n2-standard-2 | n2-highmem-2 | n2-highcpu-2 | +| 4 | n2-standard-4 | n2-highmem-4 | n2-highcpu-4 | +| 8 | n2-standard-8 | n2-highmem-8 | n2-highcpu-8 | +| 16 | n2-standard-16 | n2-highmem-16 | n2-highcpu-16 | +| 32 | n2-standard-32 | n2-highmem-32 | n2-highcpu-32 | +| 64 | n2-standard-64 | n2-highmem-64 | n2-highcpu-64 | +| 96 | n2-standard-96 | n2-highmem-96 | n2-highcpu-96 | + +## Usage + +### Method 1: Label-Based (Recommended) + +Add labels to your process definitions: + +```groovy +process bwa_align { + label "alignment" // Will use settings from 'withLabel: alignment' + + input: + ... +} +``` + +Then configure in `config/google_batch_resources.config`: + +```groovy +withLabel: alignment { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' +} +``` + +### Method 2: Process-Specific + +For individual process tuning: + +```groovy +withName: 'specific_process' { + cpus = 32 + memory = '128 GB' + machineType = 'n2-highmem-32' +} +``` + +### Method 3: Dynamic Resources + +Use `task.cpus` in your process scripts to automatically use allocated resources: + +```groovy +process example { + script: + """ + bwa mem -t ${task.cpus} ... + samtools sort -@ ${task.cpus} ... + """ +} +``` + +This is better than hardcoding `${params.threads}` because it adapts to the configured resources. + +## Testing Resource Scaling + +To verify a process uses the correct machine type: + +1. Add a test process to your workflow: + +```groovy +process test_resources { + label "alignment" + + script: + """ + echo "Allocated CPUs: ${task.cpus}" + echo "Allocated Memory: ${task.memory}" + echo "Machine Type: ${task.machineType}" + nproc # Should match allocated CPUs + free -h # Should show allocated memory + """ +} +``` + +2. Check the Google Batch console to confirm the job used the expected machine type + +## Common Pitfalls + +### ❌ Don't do this: + +```groovy +// Setting CPUs without machineType +process.cpus = 32 // Will try to run on default 4-CPU machine! +``` + +### ✅ Do this instead: + +```groovy +withName: 'my_process' { + cpus = 32 + memory = '128 GB' + machineType = 'n2-standard-32' +} +``` + +## Cost Optimization + +- Start with smaller machine types and scale up only when needed +- Use labels to group similar processes +- Monitor your Cloud Billing to track costs per machine type +- Consider `n2-highcpu` for CPU-intensive, low-memory tasks +- Consider spot instances via `google.batch.maxSpotAttempts` (already enabled in this config) + +## Further Reading + +- [Nextflow Google Batch Executor](https://www.nextflow.io/docs/latest/google.html#google-batch) +- [Google Compute Engine Machine Types](https://cloud.google.com/compute/docs/machine-types) +- [Nextflow Process Directives](https://www.nextflow.io/docs/latest/process.html#directives) diff --git a/nextflow/amr-on-wb/envs/AMR++_env.yaml b/nextflow/amr-on-wb/envs/AMR++_env.yaml index 68cb521..7be5f92 100755 --- a/nextflow/amr-on-wb/envs/AMR++_env.yaml +++ b/nextflow/amr-on-wb/envs/AMR++_env.yaml @@ -16,5 +16,5 @@ dependencies: - pandas - trimmomatic - biopython - - nextflow + - nextflow = 24 - git diff --git a/nextflow/amr-on-wb/envs/containers/Dockerfile b/nextflow/amr-on-wb/envs/containers/Dockerfile index 12b2a90..d114e3a 100755 --- a/nextflow/amr-on-wb/envs/containers/Dockerfile +++ b/nextflow/amr-on-wb/envs/containers/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update -q && \ subversion \ wget \ g++ \ + make \ libarchive13 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -59,5 +60,14 @@ RUN set -x && \ find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ /opt/conda/bin/conda clean -afy && \ /opt/conda/bin/conda install -c conda-forge mamba && \ - /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow && \ + /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow=24 && \ conda clean --all + +# Copy AMR++ bin scripts into the container +COPY bin /opt/amrplusplus/bin +RUN chmod +x /opt/amrplusplus/bin/*.py && \ + chmod +x /opt/amrplusplus/bin/rarefaction && \ + chmod +x /opt/amrplusplus/bin/resistome + +# Add bin directory to PATH +ENV PATH="/opt/amrplusplus/bin:${PATH}" diff --git a/nextflow/amr-on-wb/modules/Alignment/bwa.nf b/nextflow/amr-on-wb/modules/Alignment/bwa.nf index a716ae4..c3e8ab4 100755 --- a/nextflow/amr-on-wb/modules/Alignment/bwa.nf +++ b/nextflow/amr-on-wb/modules/Alignment/bwa.nf @@ -53,8 +53,8 @@ process bwa_align { } input: - path indexfiles - tuple val(pair_id), path(reads) + path indexfiles + tuple val(pair_id), path(reads) output: tuple val(pair_id), path("${pair_id}_alignment_sorted.bam"), emit: bwa_bam @@ -63,25 +63,25 @@ process bwa_align { script: if( deduped == "N") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam - ${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam + ${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam rm ${pair_id}_alignment.sam - ${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam + ${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam rm ${pair_id}_alignment.bam """ else if( deduped == "Y") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam - ${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam + ${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam rm ${pair_id}_alignment.sam - ${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam + ${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam rm ${pair_id}_alignment.bam - ${SAMTOOLS} fixmate -@ ${threads} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam - ${SAMTOOLS} sort -@ ${threads} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam + ${SAMTOOLS} fixmate -@ ${task.cpus} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam + ${SAMTOOLS} sort -@ ${task.cpus} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam rm ${pair_id}_alignment_sorted_fix.bam ${SAMTOOLS} rmdup -S ${pair_id}_alignment_sorted_fix.sorted.bam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_sorted_fix.sorted.bam - ${SAMTOOLS} view -@ ${threads} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam + ${SAMTOOLS} view -@ ${task.cpus} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_dedup.sam """ else @@ -93,8 +93,8 @@ process bwa_rm_contaminant_fq { label "alignment" errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - maxRetries 3 - + maxRetries 3 + publishDir "${params.output}/HostRemoval", mode: "copy", saveAs: { filename -> if(filename.indexOf("fastq.gz") > 0) "NonHostFastq/$filename" @@ -103,21 +103,21 @@ process bwa_rm_contaminant_fq { input: path indexfiles - tuple val(pair_id), path(reads) + tuple val(pair_id), path(reads) output: tuple val(pair_id), path("${pair_id}.non.host.R*.fastq.gz"), emit: nonhost_reads path("${pair_id}.samtools.idxstats"), emit: host_rm_stats - + """ - ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} > ${pair_id}.host.sam - ${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${threads} -o ${pair_id}.host.sorted.bam + ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${task.cpus} > ${pair_id}.host.sam + ${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${task.cpus} -o ${pair_id}.host.sorted.bam rm ${pair_id}.host.sam ${SAMTOOLS} index ${pair_id}.host.sorted.bam && ${SAMTOOLS} idxstats ${pair_id}.host.sorted.bam > ${pair_id}.samtools.idxstats ${SAMTOOLS} view -h -f 12 -b ${pair_id}.host.sorted.bam -o ${pair_id}.host.sorted.removed.bam - ${SAMTOOLS} sort -n -@ ${threads} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam + ${SAMTOOLS} sort -n -@ ${task.cpus} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam ${SAMTOOLS} \ - fastq -@ ${threads} -c 6 \ + fastq -@ ${task.cpus} -c 6 \ ${pair_id}.host.resorted.removed.bam \ -1 ${pair_id}.non.host.R1.fastq.gz \ -2 ${pair_id}.non.host.R2.fastq.gz \ @@ -147,6 +147,6 @@ process HostRemovalStats { path("host.removal.stats"), emit: combo_host_rm_stats """ - ${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats + ${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats """ } diff --git a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf index 29c807d..3d581c2 100755 --- a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf +++ b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf @@ -5,7 +5,7 @@ threads = params.threads kraken_confidence = params.kraken_confidence process dlkraken { - tag { "downloading kraken db"} + tag { } label "python" errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } @@ -14,11 +14,11 @@ process dlkraken { publishDir "$baseDir/data/kraken_db/", mode: 'copy' output: - path("minikraken_8GB_20200312/"), emit:kraken_db + path("minikraken_8GB_20200312/") """ - wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz - tar -xvzf minikraken_8GB_202003.tgz + wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz + tar -xvzf minikraken_8GB_202003.tgz """ } @@ -49,7 +49,7 @@ process runkraken { """ - ${KRAKEN2} --db ${krakendb} --confidence ${kraken_confidence} --paired ${reads[0]} ${reads[1]} --threads ${threads} --report ${sample_id}.conf_${kraken_confidence}.kraken.report > ${sample_id}.conf_${kraken_confidence}.kraken.raw + ${KRAKEN2} --db ${krakendb} --confidence ${kraken_confidence} --paired ${reads[0]} ${reads[1]} --threads ${task.cpus} --report ${sample_id}.conf_${kraken_confidence}.kraken.report > ${sample_id}.conf_${kraken_confidence}.kraken.raw cut -f 2,3 ${sample_id}.conf_${kraken_confidence}.kraken.raw > ${sample_id}.conf_${kraken_confidence}.kraken.krona """ @@ -73,7 +73,7 @@ process krakenresults { path("unclassifieds_kraken_analytic_matrix.conf_${kraken_confidence}.csv") """ - ${PYTHON3} $baseDir/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.conf_${kraken_confidence}.csv + ${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.conf_${kraken_confidence}.csv """ } @@ -103,3 +103,37 @@ process runbracken { """ } +process kronadb { + label "microbiome" + output: + file("krona_db/taxonomy.tab") optional true into krona_db_ch // is this a value ch? + + when: + !params.skip_krona + + script: + """ + ktUpdateTaxonomy.sh krona_db + """ +} + +process kronafromkraken { + publishDir params.outdir, mode: 'copy' + label "microbiome" + input: + file(x) from kraken2krona_ch.collect() + //file(y) from kaiju2krona_ch.collect() + file("krona_db/taxonomy.tab") from krona_db_ch + + output: + file("*_taxonomy_krona.html") + + when: + !params.skip_krona + + script: + """ + mkdir -p krona + ktImportTaxonomy -o kraken2_taxonomy_krona.html -tax krona_db $x + """ +} diff --git a/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf b/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf index 17035f6..36a8b01 100755 --- a/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf +++ b/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf @@ -47,7 +47,7 @@ process Qiime2Dada2 { path("rep-seqs.qza"), emit: rep_seqs """ - ${QIIME} dada2 denoise-paired --i-demultiplexed-seqs ${demux} --o-table dada-table.qza --o-representative-sequences rep-seqs.qza --p-trim-left-f ${p_trim_left_f} --p-trim-left-r ${p_trim_left_r} --p-trunc-len-f ${p_trunc_len_f} --p-trunc-len-r ${p_trunc_len_r} --p-n-threads ${threads} --verbose --o-denoising-stats denoise_stats + ${QIIME} dada2 denoise-paired --i-demultiplexed-seqs ${demux} --o-table dada-table.qza --o-representative-sequences rep-seqs.qza --p-trim-left-f ${p_trim_left_f} --p-trim-left-r ${p_trim_left_r} --p-trunc-len-f ${p_trunc_len_f} --p-trunc-len-r ${p_trunc_len_r} --p-n-threads ${task.cpus} --verbose --o-denoising-stats denoise_stats """ } diff --git a/nextflow/amr-on-wb/modules/Resistome/resistome.nf b/nextflow/amr-on-wb/modules/Resistome/resistome.nf index 78586ad..7b2b320 100755 --- a/nextflow/amr-on-wb/modules/Resistome/resistome.nf +++ b/nextflow/amr-on-wb/modules/Resistome/resistome.nf @@ -29,28 +29,24 @@ process build_dependencies { output: path("rarefaction"), emit: rarefactionanalyzer path("resistome"), emit: resistomeanalyzer - path("AmrPlusPlus_SNP/*"), emit: amrsnp + path("AmrPlusPlus_SNP"), emit: amrsnp """ - # Uncomment these sections once the v2 rarefactionanalyzer and resistomeanalyzer repositories are updated, remove cp lines - #git clone https://github.com/cdeanj/rarefactionanalyzer.git - #cd rarefactionanalyzer - #make - #chmod 777 rarefaction - #mv rarefaction ../ - #cd ../ - #rm -rf rarefactionanalyzer - cp $baseDir/bin/rarefaction . - - - #git clone https://github.com/cdeanj/resistomeanalyzer.git - #cd resistomeanalyzer - #make - #chmod 777 resistome - #mv resistome ../ - #cd ../ - #rm -rf resistomeanalyzer - cp $baseDir/bin/resistome . + git clone https://github.com/cdeanj/rarefactionanalyzer.git + cd rarefactionanalyzer + make + chmod 777 rarefaction + mv rarefaction ../ + cd ../ + rm -rf rarefactionanalyzer + + git clone https://github.com/cdeanj/resistomeanalyzer.git + cd resistomeanalyzer + make + chmod 777 resistome + mv resistome ../ + cd ../ + rm -rf resistomeanalyzer git clone https://github.com/Isabella136/AmrPlusPlus_SNP.git chmod -R 777 AmrPlusPlus_SNP/ @@ -119,7 +115,7 @@ process resistomeresults { path("${prefix}_analytic_matrix.csv"), emit: snp_count_matrix, optional: true """ - ${PYTHON3} $baseDir/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv """ } @@ -189,7 +185,7 @@ process plotrarefaction { """ mkdir -p data/ mv *.tsv data/ - python $baseDir/bin/rfplot.py --dir ./data --nd --s --sd . + ${PYTHON3} /opt/amrplusplus/bin/rfplot.py --dir ./data --nd --s --sd . """ } @@ -198,6 +194,8 @@ process runsnp { tag {sample_id} label "python" + errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries 3 publishDir "${params.output}/ResistomeAnalysis/SNP_verification", mode: "copy", saveAs: { filename -> @@ -212,6 +210,7 @@ process runsnp { input: tuple val(sample_id), path(bam) path(snp_count_matrix) + path(amrsnp) output: path("${sample_id}.SNP_confirmed_gene.tsv"), emit: snp_counts @@ -219,21 +218,29 @@ process runsnp { path("${sample_id}_${prefix}_SNPresistant_reads.txt") """ - cp -r $baseDir/bin/AmrPlusPlus_SNP/* . + # Copy AmrPlusPlus_SNP files from staged input to current directory for Python imports + if [ -d AmrPlusPlus_SNP ]; then + cp -r AmrPlusPlus_SNP/* . + else + # Files might be staged directly without subdirectory + # In this case they're already in the work dir + : + fi # change name to stay consistent with count matrix name, but only if the names don't match if [ "${bam}" != "${sample_id}.bam" ]; then mv ${bam} ${sample_id}.bam fi - python3 SNP_Verification.py -c config.ini -t ${threads} -a true -i ${sample_id}.bam -o ${sample_id}.${prefix}_SNPs --count_matrix ${snp_count_matrix} --detailed_output = True + python3 SNP_Verification.py -c config.ini -t ${task.cpus} -a true -i ${sample_id}.bam -o ${sample_id}.${prefix}_SNPs --count_matrix ${snp_count_matrix} --detailed_output = True + + cut -d ',' -f `awk -v RS=',' "/${sample_id}/{print NR; exit}" ${sample_id}.${prefix}_SNPs${snp_count_matrix}` ${sample_id}.${prefix}_SNPs${snp_count_matrix} > ${sample_id}.${prefix}_SNP_count_col - python3 $baseDir/bin/extract_snp_column.py \ - --sample-id "${sample_id}" \ - --matrix "${sample_id}.${prefix}_SNPs${snp_count_matrix}" \ - --out-tsv "${sample_id}.SNP_confirmed_gene.tsv" + cut -d ',' -f 1 ${sample_id}.${prefix}_SNPs${snp_count_matrix} > gene_accession_labels - mv */resistant_reads.csv ${sample_id}_${prefix}_SNPresistant_reads.txt + paste gene_accession_labels ${sample_id}.${prefix}_SNP_count_col > ${sample_id}.SNP_confirmed_gene.tsv + + mv ${sample_id}.${prefix}_SNPs${sample_id}/resistant_reads.csv ${sample_id}_${prefix}_SNPresistant_reads.txt """ } @@ -257,8 +264,6 @@ process snpresults { path("*_analytic_matrix.csv"), emit: snp_matrix """ - - ${PYTHON3} $baseDir/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv - + ${PYTHON3} /opt/amrplusplus/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv """ } diff --git a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf index 96371fd..d642d56 100755 --- a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf +++ b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf @@ -31,7 +31,8 @@ process runqc { } input: - tuple val(sample_id), path(reads) + tuple val(sample_id), path(reads) + path adapters_file output: tuple val(sample_id), path("${sample_id}*P.fastq.gz"), emit: paired_fastq @@ -41,15 +42,15 @@ process runqc { """ ${TRIMMOMATIC} \ PE \ - -threads ${threads} \ + -threads ${task.cpus} \ ${reads[0]} ${reads[1]} ${sample_id}.1P.fastq.gz ${sample_id}.1U.fastq.gz ${sample_id}.2P.fastq.gz ${sample_id}.2U.fastq.gz \ - ILLUMINACLIP:${adapters}:2:30:10:3:TRUE \ + ILLUMINACLIP:${adapters_file}:2:30:10:3:TRUE \ LEADING:${leading} \ TRAILING:${trailing} \ SLIDINGWINDOW:${slidingwindow} \ MINLEN:${minlen} \ 2> ${sample_id}.trimmomatic.stats.log - + """ } @@ -73,6 +74,6 @@ process QCstats { path("trimmomatic.stats"), emit: combo_trim_stats """ - ${PYTHON3} $baseDir/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats + ${PYTHON3} /opt/amrplusplus/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats """ } \ No newline at end of file diff --git a/nextflow/amr-on-wb/nextflow.config b/nextflow/amr-on-wb/nextflow.config index 086d748..b6b36f4 100755 --- a/nextflow/amr-on-wb/nextflow.config +++ b/nextflow/amr-on-wb/nextflow.config @@ -47,7 +47,7 @@ profiles { docker { includeConfig "config/local.config" docker.enabled = true - process.container = 'enriquedoster/amrplusplus:latest' + process.container = "${IMAGE_NAME}:${IMAGE_TAG}" } singularity { includeConfig "config/singularity.config" @@ -74,4 +74,52 @@ profiles { singularity.autoMounts = true singularity.cacheDir = "$baseDir/envs/" } + 'google-batch' { + includeConfig "config/google_batch_resources.config" + + process.executor = 'google-batch' + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } + process.maxRetries = 5 + + workDir = "gs://${GCS_BUCKET}/scratch" + + google.region = "${GCS_BUCKET_LOCATION}" + google.project = "$GOOGLE_CLOUD_PROJECT" + google.batch.serviceAccountEmail = "$GOOGLE_SERVICE_ACCOUNT_EMAIL" + google.batch.bootDiskSize = '50 GB' + google.batch.usePrivateAddress = true + google.batch.network = "projects/${GOOGLE_CLOUD_PROJECT}/global/networks/default" + google.batch.subnetwork = "projects/${GOOGLE_CLOUD_PROJECT}/regions/${GCS_BUCKET_LOCATION}/subnetworks/default" + google.httpConnectTimeout = '10m' + google.httpReadTimeout = '10m' + google.batch.maxSpotAttempts = 5 + + executor.pollInterval = '30s' + executor.queueStatInterval = '2m' + } + workbench { + includeConfig "config/google_batch_resources.config" + + process.executor = 'google-batch' + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } + process.maxRetries = 5 + + workDir = "gs://${GCS_BUCKET}/scratch" + + google.region = "${GCS_BUCKET_LOCATION}" + google.project = "$GOOGLE_CLOUD_PROJECT" + google.batch.serviceAccountEmail = "$GOOGLE_SERVICE_ACCOUNT_EMAIL" + google.batch.bootDiskSize = '50 GB' + google.batch.usePrivateAddress = true + google.batch.network = "projects/${GOOGLE_CLOUD_PROJECT}/global/networks/network" + google.batch.subnetwork = "projects/${GOOGLE_CLOUD_PROJECT}/regions/${GCS_BUCKET_LOCATION}/subnetworks/subnetwork" + google.httpConnectTimeout = '10m' + google.httpReadTimeout = '10m' + google.batch.maxSpotAttempts = 5 + + executor.pollInterval = '30s' + executor.queueStatInterval = '2m' + } } diff --git a/nextflow/amr-on-wb/params.config b/nextflow/amr-on-wb/params.config index 86190e3..6e6d308 100755 --- a/nextflow/amr-on-wb/params.config +++ b/nextflow/amr-on-wb/params.config @@ -44,7 +44,7 @@ params { snp = "Y" /* Add deduplicaation analysis */ - deduped = "Y" + deduped = "N" prefix = "AMR" /* Number of threads */ @@ -57,7 +57,6 @@ params { trailing = 3 slidingwindow = "4:15" minlen = 36 - crop_len = 150 /* Resistome threshold */ threshold = 80 diff --git a/nextflow/amr-on-wb/params_google_batch.config b/nextflow/amr-on-wb/params_google_batch.config new file mode 100644 index 0000000..019a50e --- /dev/null +++ b/nextflow/amr-on-wb/params_google_batch.config @@ -0,0 +1,127 @@ +/* * Google Batch Configuration File + * This file sets all parameters for running on Google Cloud Batch. + * It uses the 'GCS_BUCKET' environment variable to define paths. + */ + +// Grabs the GCS_BUCKET environment variable. Defaults to 'nf-files' if not set. +def gcs_bucket = System.getenv("GCS_BUCKET") ?: "nf-files" + +params { + /* Display help message */ + help = false + + // ----------------------------------------------------------------- + // Input Data + // ----------------------------------------------------------------- + /* Location of forward and reverse read pairs */ + reads = "gs://${gcs_bucket}/data/raw/*_R{1,2}.fastq.gz" + + /* Optional input for bam files for use with "--pipeline bam_resistome" */ + bam_files = null + + // ----------------------------------------------------------------- + // Reference Databases + // ----------------------------------------------------------------- + /* Location of reference/host genome */ + host = "gs://${gcs_bucket}/data/host/chr21.fasta.gz" + + /* Host index files created with bwa */ + host_index = "gs://${gcs_bucket}/data/host/chr21.fasta.gz*" + + /* Location of amr index files with wildcard - set to null to build automatically */ + amr_index = null + + /* Location of antimicrobial resistance (MEGARes) database */ + amr = "gs://${gcs_bucket}/data/amr/megares_database_v3.00.fasta" + + /* Location of amr annotation file */ + annotation = "gs://${gcs_bucket}/data/amr/megares_annotations_v3.00.csv" + + /* Kraken database location */ + kraken_db = null + + /* Kraken confidence score */ + kraken_confidence = 0.0 + + // ----------------------------------------------------------------- + // Output + // ----------------------------------------------------------------- + /* Output directory - Saved to Cloud Storage */ + output = "gs://${gcs_bucket}/results/test_results" + + // ----------------------------------------------------------------- + // Pipeline Logic & Analysis Toggles + // ----------------------------------------------------------------- + /* Add SNP analysis */ + snp = "Y" + + /* Add deduplicaation analysis */ + deduped = "N" + prefix = "AMR" + + /* Number of threads */ + threads = 4 + + // ----------------------------------------------------------------- + // Trimming Parameters (Trimmomatic) + // ----------------------------------------------------------------- + /* Adapters file location */ + adapters = "gs://${gcs_bucket}/data/adapters/nextera.fa" + + leading = 3 + trailing = 3 + slidingwindow = "4:15" + minlen = 36 + + // ----------------------------------------------------------------- + // Resistome Analysis Parameters + // ----------------------------------------------------------------- + /* Resistome threshold */ + threshold = 80 + + /* Starting rarefaction level */ + min = 5 + + /* Ending rarefaction level */ + max = 100 + + /* Number of levels to skip */ + skip = 5 + + /* Number of iterations to sample at */ + samples = 1 + + // ----------------------------------------------------------------- + // Other Tools + // ----------------------------------------------------------------- + /* multiQC config folder */ + multiqc = "gs://${gcs_bucket}/data/multiqc" + + /* Qiime2 Parameters */ + p_trim_left_f = 25 + p_trim_left_r = 26 + p_trunc_len_f = 225 + p_trunc_len_r = 220 + + /* qiime2 bayes classifier */ + dada2_db = "gs://${gcs_bucket}/data/qiime/gg-13-8-99-515-806-nb-classifier.qza" +} + +// ----------------------------------------------------------------- +// Environment Variables / Binary Paths +// ----------------------------------------------------------------- +env { + /* These following tools are required to run AmrPlusPlus*/ + JAVA = "java" + TRIMMOMATIC = "trimmomatic" + PYTHON3 = "python3" + BWA = "bwa" + SAMTOOLS = "samtools" + BEDTOOLS = "bedtools" + RESISTOME = "resistome" + RAREFACTION = "rarefaction" + SNPFINDER = "snpfinder" + /* These next tools are optional depending on which analyses you want to run */ + KRAKEN2 = "kraken2" + QIIME = "qiime" +} diff --git a/nextflow/amr-on-wb/scripts/build.sh b/nextflow/amr-on-wb/scripts/build.sh new file mode 100755 index 0000000..19dc25a --- /dev/null +++ b/nextflow/amr-on-wb/scripts/build.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -o errexit +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default values +ENV="local" +PUSH=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --env) + ENV="$2" + shift 2 + ;; + --push) + PUSH=true + shift + ;; + -h|--help) + echo "Usage: $0 [--env local|gcp|wb] [--push]" + echo "" + echo "Options:" + echo " --env Environment to build for (local, gcp, or wb). Default: local" + echo " --push Push the image to the registry after building (only for gcp/wb)" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Build for local environment" + echo " $0 --env gcp --push # Build for GCP and push to registry" + echo " $0 --env wb --push # Build for Workbench and push to registry" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Validate environment +if [[ ! "$ENV" =~ ^(local|gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'local', 'gcp', or 'wb'" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Handle local environment (no push) +if [[ "$ENV" == "local" ]]; then + if [[ "$PUSH" == "true" ]]; then + echo "Warning: --push flag is not applicable for local environment, ignoring" + fi + + echo "Building Docker image for local use: ${IMAGE_NAME}:${IMAGE_TAG}" + docker build --platform linux/amd64 -f envs/containers/Dockerfile -t "${IMAGE_NAME}:${IMAGE_TAG}" . + + echo "" + echo "Image built successfully!" + echo "To run locally: scripts/run.sh --env local" + exit 0 +fi + +# Handle GCP/Workbench environments +echo "Building Docker image: ${REGISTRY_PATH}" +docker build --platform linux/amd64 -f envs/containers/Dockerfile -t "${REGISTRY_PATH}" . + +if [[ "${PUSH}" == "true" ]]; then + echo "" + echo "Configuring Docker authentication for Google Artifact Registry..." + gcloud auth configure-docker us-central1-docker.pkg.dev --quiet + + echo "Pushing image to registry..." + docker push "${REGISTRY_PATH}" + + echo "" + echo "Image successfully pushed to: ${REGISTRY_PATH}" +else + echo "" + echo "Image built successfully!" + echo "To push to registry, run: $0 --env ${ENV} --push" + echo "Or manually push with: docker push ${REGISTRY_PATH}" +fi diff --git a/nextflow/amr-on-wb/scripts/config/gcp.env.template b/nextflow/amr-on-wb/scripts/config/gcp.env.template new file mode 100644 index 0000000..4f0e8b7 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/gcp.env.template @@ -0,0 +1,41 @@ +# GCP environment configuration +# This is used when running Nextflow on Google Batch with local orchestration +# Nextflow runs on your local machine, jobs execute on Google Batch + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# GCS bucket for storing pipeline data and results +# Replace with your GCS bucket name (without gs:// prefix) +# Example: "my-nextflow-data" +export GCS_BUCKET= + +# GCS bucket location/region +# Common values: us-central1, us-east1, europe-west1 +export GCS_BUCKET_LOCATION=us-central1 + +# Google Artifact Registry repository name +# Replace with your artifact registry repository name +# Example: "nextflow-containers" +export GOOGLE_ARTIFACT_REPO= + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +# Google Cloud project (auto-detected from gcloud CLI) +export GOOGLE_CLOUD_PROJECT=$(gcloud config get project) + +# Service account configuration (auto-generated) +export GOOGLE_SERVICE_ACCOUNT_NAME=nextflow-runner +export GOOGLE_SERVICE_ACCOUNT_EMAIL="${GOOGLE_SERVICE_ACCOUNT_NAME}@${GOOGLE_CLOUD_PROJECT}.iam.gserviceaccount.com" + +# Docker image configuration (auto-generated paths) +export IMAGE_NAME="amrplusplus-workbench" +export IMAGE_TAG="latest" +export REGISTRY_PATH="us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + +# Nextflow profile and config +export NEXTFLOW_PROFILE="google-batch" +export NEXTFLOW_CONFIG="params_google_batch.config" diff --git a/nextflow/amr-on-wb/scripts/config/local.env.template b/nextflow/amr-on-wb/scripts/config/local.env.template new file mode 100644 index 0000000..4b80462 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/local.env.template @@ -0,0 +1,18 @@ +# Local environment configuration +# This is used when running Nextflow locally with Docker + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# Docker image configuration +# Replace with your Docker Hub username +# Example: "johndoe/amrplusplus-workbench" +export IMAGE_NAME="/amrplusplus-workbench" + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +export IMAGE_TAG="latest" +export NEXTFLOW_PROFILE="docker" diff --git a/nextflow/amr-on-wb/scripts/config/wb.env.template b/nextflow/amr-on-wb/scripts/config/wb.env.template new file mode 100644 index 0000000..963965c --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/wb.env.template @@ -0,0 +1,43 @@ +# Workbench environment configuration +# This is used when running Nextflow on Google Batch with Workbench orchestration +# Both Nextflow orchestration and job execution happen in Workbench/Google Cloud + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# GCS bucket resource ID (created via Workbench) +# Replace with your Workbench GCS bucket resource ID +# Example: "nf-output" or "my-pipeline-data" +# Note: Use the resource ID, not the full GCS bucket name +export GCS_BUCKET= + +# GCS bucket location/region +# Common values: us-central1, us-east1, europe-west1 +export GCS_BUCKET_LOCATION=us-central1 + +# Google Artifact Registry repository name +# Replace with your artifact registry repository name +# Example: "nextflow-containers" +export GOOGLE_ARTIFACT_REPO= + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +# Google Cloud project (auto-detected from Workbench workspace) +export WORKBENCH_GOOGLE_CLOUD_PROJECT=$(wb status 2>/dev/null | grep "Google project" | awk -F': ' '{print $2}' | xargs) +export GOOGLE_CLOUD_PROJECT="${WORKBENCH_GOOGLE_CLOUD_PROJECT}" + +# Service account configuration (Workbench Pet Service Account - auto-detected) +export GOOGLE_SERVICE_ACCOUNT_EMAIL=$(wb auth status 2>&1 | grep "Service account email" | awk -F': ' '{print $2}' | xargs) +export GOOGLE_SERVICE_ACCOUNT_NAME=$(echo "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" | cut -d'@' -f1) + +# Docker image configuration (auto-generated paths) +export IMAGE_NAME="amrplusplus-workbench" +export IMAGE_TAG="latest" +export REGISTRY_PATH="us-central1-docker.pkg.dev/${WORKBENCH_GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + +# Nextflow profile and config +export NEXTFLOW_PROFILE="workbench" +export NEXTFLOW_CONFIG="params_google_batch.config" diff --git a/nextflow/amr-on-wb/scripts/run.sh b/nextflow/amr-on-wb/scripts/run.sh new file mode 100755 index 0000000..17dbd4b --- /dev/null +++ b/nextflow/amr-on-wb/scripts/run.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +set -o errexit +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default values +ENV="local" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --env) + ENV="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [--env local|gcp|wb]" + echo "" + echo "Options:" + echo " --env Environment to run in (local, gcp, or wb). Default: local" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Run locally with Docker" + echo " $0 --env gcp # Run on Google Batch with local orchestration" + echo " $0 --env wb # Run on Google Batch with Workbench orchestration" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Validate environment +if [[ ! "$ENV" =~ ^(local|gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'local', 'gcp', or 'wb'" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Navigate to repository root +cd "${SCRIPT_DIR}/.." + +# Handle local environment +if [[ "$ENV" == "local" ]]; then + echo "Running Nextflow locally with Docker profile..." + + # Activate conda environment if available + if command -v conda &> /dev/null; then + eval "$(conda shell.bash hook)" + if conda env list | grep -q "AMR++_env"; then + conda activate AMR++_env + else + echo "Warning: AMR++_env conda environment not found" + echo "Please create it with: conda env create -f envs/AMR++_env.yaml" + exit 1 + fi + else + echo "Error: conda not found. Please install conda and create the AMR++_env environment" + exit 1 + fi + + nextflow run main_AMR++.nf -profile "${NEXTFLOW_PROFILE}" + exit 0 +fi + +# Handle GCP/Workbench environments +echo "Running Nextflow on Google Batch..." +echo "Profile: ${NEXTFLOW_PROFILE}" +echo "Config: ${NEXTFLOW_CONFIG}" +echo "" + +# Handle conda environment for Workbench +if [[ "$ENV" == "wb" ]]; then + if command -v conda &> /dev/null; then + # Initialize conda if not already initialized + if ! grep -q "conda initialize" ~/.bashrc 2>/dev/null; then + echo "Initializing conda..." + conda init bash + source ~/.bashrc + fi + + # Ensure conda is properly initialized for this shell + eval "$(conda shell.bash hook)" + + # Check if AMR++_env exists, create if it doesn't + if ! conda env list | grep -q "AMR++_env"; then + echo "Creating AMR++_env conda environment with Nextflow v24..." + echo "This may take a few minutes..." + conda env create -f envs/AMR++_env.yaml + echo "AMR++_env environment created successfully" + else + echo "AMR++_env conda environment found" + fi + + # Activate the environment + echo "Activating AMR++_env conda environment..." + conda activate AMR++_env + + # Verify Nextflow version + echo "Using Nextflow version: $(nextflow -version 2>&1 | grep -o 'version [0-9.]*' | head -1)" + else + echo "Error: conda not found. Please install conda first." + exit 1 + fi +else + # For GCP environment, just try to activate if available + if command -v conda &> /dev/null; then + eval "$(conda shell.bash hook)" + if conda env list | grep -q "AMR++_env"; then + conda activate AMR++_env + else + echo "Warning: AMR++_env conda environment not found, continuing without it" + fi + fi +fi + +# Run nextflow with Google Batch profile +nextflow run main_AMR++.nf -profile "${NEXTFLOW_PROFILE}" -c "${NEXTFLOW_CONFIG}" diff --git a/nextflow/amr-on-wb/scripts/setup_infra.sh b/nextflow/amr-on-wb/scripts/setup_infra.sh new file mode 100755 index 0000000..e606da7 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/setup_infra.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default to GCP environment for infrastructure setup +ENV="${1:-gcp}" + +# Validate environment +if [[ ! "$ENV" =~ ^(gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'gcp' or 'wb'" + echo "Usage: $0 [gcp|wb]" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +if [[ -f ".env" ]]; then + echo "Loading configuration from .env file..." + export $(grep -v '^#' .env | xargs) +fi + +if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | grep -q .; then + 1>&2 echo "ERROR: Not authenticated with gcloud" + 1>&2 echo "Please run: gcloud auth login" + exit 1 +fi + +if [[ -z "${GOOGLE_CLOUD_PROJECT}" ]]; then + 1>&2 echo "ERROR: No default project set in gcloud config" + 1>&2 echo "Please run: gcloud config set project GOOGLE_CLOUD_PROJECT" + exit 1 +fi + +echo "Using project: ${GOOGLE_CLOUD_PROJECT}" + +echo "Configuration:" +echo " Bucket: gs://${GCS_BUCKET}" +echo " Service Account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" +echo " Artifact Repository: ${GOOGLE_ARTIFACT_REPO}" +echo "" + +if [[ "$ENV" == "wb" ]]; then + echo "Skipping API enablement (managed by Workbench)" +else + echo "Enabling required APIs..." + gcloud services enable iamcredentials.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable artifactregistry.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable batch.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable compute.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet +fi + +if [[ "$ENV" == "wb" ]]; then + echo "Skipping VPC and NAT setup (managed by Workbench)" +else + echo "Creating VPC network if it doesn't exist..." + if gcloud compute networks describe default --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Default VPC network already exists" + else + echo "Creating default VPC network..." + gcloud compute networks create default \ + --subnet-mode=auto \ + --project="${GOOGLE_CLOUD_PROJECT}" + + echo "Creating firewall rules for default network..." + gcloud compute firewall-rules create default-allow-internal \ + --network=default \ + --allow=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=10.128.0.0/9 \ + --project="${GOOGLE_CLOUD_PROJECT}" + + gcloud compute firewall-rules create default-allow-ssh \ + --network=default \ + --allow=tcp:22 \ + --source-ranges=0.0.0.0/0 \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + echo "Creating Cloud Router and NAT for private IP access..." + if gcloud compute routers describe nat-router --region="${GCS_BUCKET_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Cloud Router already exists" + else + echo "Creating Cloud Router..." + gcloud compute routers create nat-router \ + --network=default \ + --region="${GCS_BUCKET_LOCATION}" \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + if gcloud compute routers nats describe nat-config --router=nat-router --region="${GCS_BUCKET_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Cloud NAT already exists" + else + echo "Creating Cloud NAT..." + gcloud compute routers nats create nat-config \ + --router=nat-router \ + --region="${GCS_BUCKET_LOCATION}" \ + --nat-all-subnet-ip-ranges \ + --auto-allocate-nat-external-ips \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi +fi + +if [[ "$ENV" == "wb" ]]; then + # For Workbench, use wb resource create + if wb resource describe --id="${GCS_BUCKET}" &>/dev/null; then + echo "Workbench GCS bucket resource already exists: ${GCS_BUCKET}" + else + echo "Creating GCS bucket via Workbench: ${GCS_BUCKET}" + wb resource create gcs-bucket --id="${GCS_BUCKET}" + fi +else + # For GCP, use gcloud storage + if gcloud storage buckets describe "gs://${GCS_BUCKET}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Bucket already exists: gs://${GCS_BUCKET}" + else + echo "Creating GCS bucket: gs://${GCS_BUCKET}" + gcloud storage buckets create "gs://${GCS_BUCKET}" \ + --project="${GOOGLE_CLOUD_PROJECT}" \ + --location="${GCS_BUCKET_LOCATION}" \ + --uniform-bucket-level-access \ + --lifecycle-file="lifecycle.json" + fi +fi + +# For Workbench environment, skip service account creation and IAM bindings +# The Pet SA is managed by Workbench and already has necessary permissions +if [[ "$ENV" == "wb" ]]; then + echo "Using Workbench Pet Service Account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + echo "Skipping IAM policy bindings (managed by Workbench)" +else + # For GCP environment, create service account and grant permissions + if gcloud iam service-accounts describe "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Service account already exists: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + else + echo "Creating service account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + gcloud iam service-accounts create "$GOOGLE_SERVICE_ACCOUNT_NAME" \ + --display-name="GCS Uploader and Signer" \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + echo "Granting storage.objectAdmin role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/storage.objectAdmin" \ + --condition=None \ + --quiet + + echo "Granting iam.serviceAccountTokenCreator role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/iam.serviceAccountTokenCreator" \ + --condition=None \ + --quiet + + echo "Granting batch.agentReporter role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/batch.agentReporter" \ + --condition=None \ + --quiet + + echo "Granting logging.logWriter role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/logging.logWriter" \ + --condition=None \ + --quiet + + echo "Granting artifactregistry.reader role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/artifactregistry.reader" \ + --condition=None \ + --quiet + + USER_EMAIL="$(gcloud config get-value account 2>/dev/null)" + readonly USER_EMAIL + + if [[ -z "${USER_EMAIL}" ]]; then + 1>&2 echo "ERROR: Could not determine current user email" + exit 1 + fi + + echo "Granting ${USER_EMAIL} permission to impersonate service account..." + gcloud iam service-accounts add-iam-policy-binding "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --member="user:${USER_EMAIL}" \ + --role="roles/iam.serviceAccountTokenCreator" \ + --project="${GOOGLE_CLOUD_PROJECT}" \ + --quiet +fi + +ARTIFACT_LOCATION="${GCS_BUCKET_LOCATION:-us-central1}" + +if gcloud artifacts repositories describe "${GOOGLE_ARTIFACT_REPO}" --location="${ARTIFACT_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Artifact repository already exists: ${GOOGLE_ARTIFACT_REPO}" +else + echo "Creating artifact repository: ${GOOGLE_ARTIFACT_REPO}" + echo "Project: ${GOOGLE_CLOUD_PROJECT}" + echo "Location: ${ARTIFACT_LOCATION}" + gcloud artifacts repositories create "${GOOGLE_ARTIFACT_REPO}" \ + --repository-format=docker \ + --location="${ARTIFACT_LOCATION}" \ + --project="${GOOGLE_CLOUD_PROJECT}" +fi + +echo "" +echo "Setup complete!" +echo "" diff --git a/nextflow/amr-on-wb/scripts/upload_data.sh b/nextflow/amr-on-wb/scripts/upload_data.sh new file mode 100755 index 0000000..08a44c9 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/upload_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Source the variables +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default to GCP environment +ENV="${1:-gcp}" + +# Validate environment +if [[ ! "$ENV" =~ ^(gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'gcp' or 'wb'" + echo "Usage: $0 [gcp|wb]" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Set data directory path relative to script location +DATA_DIR="${SCRIPT_DIR}/../data" + +# Upload data to GCS bucket +gcloud storage cp -r "${DATA_DIR}" gs://${GCS_BUCKET}/ diff --git a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf index 8e44779..74f277c 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf @@ -1,5 +1,5 @@ // Deduped functions with prefix for name -include {runresistome as runresistome_dedup ; runsnp as runsnp_dedup; resistomeresults as resistomeresults_dedup ; snpresults as snpresults_dedup ; build_dependencies} from '../modules/Resistome/resistome' addParams(prefix: 'dedup_AMR') +include {runresistome as runresistome_dedup ; runsnp as runsnp_dedup; resistomeresults as resistomeresults_dedup ; snpresults as snpresults_dedup} from '../modules/Resistome/resistome' addParams(prefix: 'dedup_AMR') workflow BAM_DEDUP_RESISTOME_WF { @@ -9,22 +9,14 @@ workflow BAM_DEDUP_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") runresistome_dedup(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults_dedup(runresistome_dedup.out.resistome_counts.collect()) if (params.snp == "Y") { - runsnp_dedup(bam_ch, resistomeresults_dedup.out.snp_count_matrix) + runsnp_dedup(bam_ch, resistomeresults_dedup.out.snp_count_matrix, amrsnp) snpresults_dedup(runsnp_dedup.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf index c4af77e..3dbfdc5 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf @@ -1,5 +1,5 @@ // resistome -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; snpresults} from '../modules/Resistome/resistome' workflow BAM_RESISTOME_WF { @@ -9,18 +9,10 @@ workflow BAM_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") // Split sections below for standard and dedup_ed results runresistome(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults(runresistome.out.resistome_counts.collect()) @@ -28,7 +20,7 @@ workflow BAM_RESISTOME_WF { plotrarefaction(runrarefaction.out.rarefaction.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bam_ch, resistomeresults.out.snp_count_matrix) + runsnp(bam_ch, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf index 680e5cb..b0a4683 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf @@ -1,5 +1,5 @@ // resistome counts and snp verification -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; snpresults} from '../modules/Resistome/resistome' workflow BAM_RESISTOME_COUNTS_WF { @@ -9,22 +9,15 @@ workflow BAM_RESISTOME_COUNTS_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") - resistomeanalyzer = file("${baseDir}/bin/resistome") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") // Split sections below for standard and dedup_ed results runresistome(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults(runresistome.out.resistome_counts.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bam_ch, resistomeresults.out.snp_count_matrix) + runsnp(bam_ch, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf b/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf index 0fa3d31..33d7191 100755 --- a/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf +++ b/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf @@ -3,18 +3,19 @@ include { runqc ; QCstats } from '../modules/Trimming/trimmomatic' // WC trimming workflow FASTQ_TRIM_WF { - take: + take: read_pairs_ch main: //index( hostindex ) //bwa_align( index.out, read_pairs_ch ) - runqc(read_pairs_ch) + adapters_file = file(params.adapters) + runqc(read_pairs_ch, adapters_file) QCstats(runqc.out.trimmomatic_stats.collect()) - + emit: //bwa_align = bwa_align.out - trimmed_reads = runqc.out.paired_fastq - + trimmed_reads = runqc.out.paired_fastq + } diff --git a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf index 21ca3e5..287f66f 100755 --- a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf @@ -2,7 +2,7 @@ include { index ; bwa_align } from '../modules/Alignment/bwa' // resistome -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; snpresults} from '../modules/Resistome/resistome' // Deduped resistome include { BAM_DEDUP_RESISTOME_WF } from '../subworkflows/bam_deduped_resistome.nf' @@ -16,18 +16,10 @@ workflow FASTQ_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") // Define amr_index_files variable if (params.amr_index == null) { index(amr) @@ -55,7 +47,7 @@ workflow FASTQ_RESISTOME_WF { plotrarefaction(runrarefaction.out.rarefaction.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bwa_align.out.bwa_bam, resistomeresults.out.snp_count_matrix) + runsnp(bwa_align.out.bwa_bam, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect() ) } // Add analysis of deduped counts