From ce626a65a9ea5dc85c2a587ec755941c4e7b147c Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Fri, 9 Jan 2026 10:15:21 -0800 Subject: [PATCH 1/6] Initial commit to run AMR++ on workbench --- nextflow/amr-on-wb/.gitignore | 11 + nextflow/amr-on-wb/README.md | 301 ++++++++++++------ nextflow/amr-on-wb/envs/AMR++_env.yaml | 2 +- nextflow/amr-on-wb/envs/containers/Dockerfile | 12 +- nextflow/amr-on-wb/modules/Alignment/bwa.nf | 14 +- .../amr-on-wb/modules/Microbiome/kraken2.nf | 44 ++- .../amr-on-wb/modules/Resistome/resistome.nf | 67 ++-- .../amr-on-wb/modules/Trimming/trimmomatic.nf | 9 +- nextflow/amr-on-wb/nextflow.config | 48 ++- nextflow/amr-on-wb/params.config | 3 +- nextflow/amr-on-wb/params_google_batch.config | 127 ++++++++ nextflow/amr-on-wb/scripts/build.sh | 94 ++++++ .../amr-on-wb/scripts/config/gcp.env.template | 41 +++ .../scripts/config/local.env.template | 18 ++ .../amr-on-wb/scripts/config/wb.env.template | 43 +++ nextflow/amr-on-wb/scripts/run.sh | 97 ++++++ nextflow/amr-on-wb/scripts/setup_infra.sh | 218 +++++++++++++ nextflow/amr-on-wb/scripts/upload_data.sh | 29 ++ .../subworkflows/bam_deduped_resistome.nf | 4 +- .../amr-on-wb/subworkflows/bam_resistome.nf | 4 +- .../subworkflows/bam_resistome_counts.nf | 4 +- .../subworkflows/fastq_QC_trimming.nf | 11 +- .../amr-on-wb/subworkflows/fastq_resistome.nf | 4 +- 23 files changed, 1048 insertions(+), 157 deletions(-) create mode 100644 nextflow/amr-on-wb/.gitignore create mode 100644 nextflow/amr-on-wb/params_google_batch.config create mode 100755 nextflow/amr-on-wb/scripts/build.sh create mode 100644 nextflow/amr-on-wb/scripts/config/gcp.env.template create mode 100644 nextflow/amr-on-wb/scripts/config/local.env.template create mode 100644 nextflow/amr-on-wb/scripts/config/wb.env.template create mode 100755 nextflow/amr-on-wb/scripts/run.sh create mode 100755 nextflow/amr-on-wb/scripts/setup_infra.sh create mode 100755 nextflow/amr-on-wb/scripts/upload_data.sh diff --git a/nextflow/amr-on-wb/.gitignore b/nextflow/amr-on-wb/.gitignore new file mode 100644 index 0000000..7cc7383 --- /dev/null +++ b/nextflow/amr-on-wb/.gitignore @@ -0,0 +1,11 @@ +.nextflow/ +work/ +.nextflow.log* + +bin/AMRplusplus_SNP/ +test_results/ + +.ipynb_checkpoints + +scripts/config/*.env +!scripts/config/*.env.template diff --git a/nextflow/amr-on-wb/README.md b/nextflow/amr-on-wb/README.md index ac263f0..d69852c 100755 --- a/nextflow/amr-on-wb/README.md +++ b/nextflow/amr-on-wb/README.md @@ -1,157 +1,274 @@ -Overview --------- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Nextflow](https://img.shields.io/badge/Nextflow-%E2%89%A50.25.1-brightgreen.svg)](https://www.nextflow.io/) +# AMR++ Bioinformatic Pipeline -# AMR++ bioinformatic pipeline -(https://megares.meglab.org/) +AMR++ is a bioinformatic pipeline for analyzing raw sequencing reads to characterize the profile of antimicrobial resistance genes, or resistome. Developed to work with the [MEGARes database](https://megares.meglab.org/), it contains sequence data for approximately 9,000 hand-curated antimicrobial resistance genes with an annotation structure optimized for high-throughput sequencing and metagenomic analysis. -AMR++ is a bioinformatic pipeline meant to aid in the analysis of raw sequencing reads to characterize the profile of antimicrobial resistance genes, or resistome. 
AMR++ was developed to work in conjuction with the the MEGARes database which contains sequence data for approximately 9,000 hand-curated antimicrobial resistance genes accompanied by an annotation structure that is optimized for use with high throughput sequencing and metagenomic analysis. The acyclical annotation graph of MEGARes allows for accurate, count-based, hierarchical statistical analysis of resistance at the population level, much like microbiome analysis, and is also designed to be used as a training database for the creation of statistical classifiers. +This repository is adapted from the [original AMR++ pipeline](https://github.com/Microbial-Ecology-Group/AMRplusplus) with simplified scripts for running in local, GCP, and Verily Workbench environments. -The goal of many metagenomics studies is to characterize the content and relative abundance of sequences of interest from the DNA of a given sample or set of samples. You may want to know what is contained within your sample or how abundant a given sequence is relative to another. +## Quick Start -Often, metagenomics is performed when the answer to these questions must be obtained for a large number of targets where techniques like multiplex PCR and other targeted methods would be too cumbersome to perform. AMR++ can process the raw data from the sequencer, identify the fragments of DNA, and count them. It also provides a count of the polymorphisms that occur in each DNA fragment with respect to the reference database. +This repository provides simplified scripts for running AMR++ in three different environments: -Additionally, you may want to know if the depth of your sequencing (how many reads you obtain that are on target) is high enough to identify rare organisms (organisms with low abundance relative to others) in your population. This is referred to as rarefaction and is calculated by randomly subsampling your sequence data at intervals between 0% and 100% in order to determine how many targets are found at each depth. +1. **Local** - Run on your machine using Docker +2. **GCP** - Run on Google Batch with local Nextflow orchestration +3. **Workbench** - Run on Google Batch with Verily Workbench -With AMR++, you will obtain alignment count files for each sample that are combined into a count matrix that can be analyzed using any statistical and mathematical techniques that can operate on a matrix of observations. 
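As a rough illustration of the rarefaction idea described above (a sketch only, not the pipeline's implementation — AMR++ ships its own `rarefaction` binary and exposes the sampling levels through the `min`, `max`, `skip`, and `samples` parameters in `params.config`; `seqtk` and the input file name here are assumptions for the example):

```bash
# Subsample a FASTQ file at 5%, 10%, ..., 100% and report how many reads survive
# at each level. Assumes seqtk is installed; sample_R1.fastq.gz is hypothetical.
for pct in $(seq 5 5 100); do
    frac=$(echo "scale=2; ${pct}/100" | bc)
    n_lines=$(seqtk sample -s100 sample_R1.fastq.gz "${frac}" | wc -l)
    echo -e "${pct}%\t$((n_lines / 4)) reads"
done
```

AMR++'s bundled tool applies the same subsampling idea to the MEGARes alignments, reporting how many resistance gene targets are found at each depth.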
+### Prerequisites -More Information ----------------- +- **For local execution**: Docker, Conda (with AMR++_env), Nextflow +- **For GCP execution**: Google Cloud SDK (`gcloud`), Docker +- **For Workbench execution**: Verily Workbench CLI (`wb`), Google Cloud SDK -- [Installation](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/installation.md) -- [Usage](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/usage.md) -- [Configuration](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/configuration.md) -- [Output](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/output.md) -- [Dependencies](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/dependencies.md) -- [Software Requirements](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/requirements.md) -- [FAQs](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/FAQs.md) -- [Details on AMR++ updates](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/update_details.md) -- [Contact](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/contact.md) +## Environment Setup +### 1. Create Configuration Files +Copy the template files and customize them with your values: -## AMR++ demonstration +```bash +# For local environment +cp scripts/config/local.env.template scripts/config/local.env -If anaconda is already installed, we'll just need to download the AMR++ github repository and create the AMR++ conda environment. Please review the [installation document](docs/installation.md) for alternative methods to install AMR++ in your computing environment. +# For GCP environment +cp scripts/config/gcp.env.template scripts/config/gcp.env -```bash -# Confirm conda works -conda -h +# For Workbench environment +cp scripts/config/wb.env.template scripts/config/wb.env ``` -Clone the AMR++ repository. +### 2. Edit Configuration Files + +Each `.env` file has two sections: +- **USER CONFIGURATION**: Values you must update (marked with ``) +- **AUTOMATIC CONFIGURATION**: Auto-generated values (do not modify) + +#### local.env +Update the following: +- `IMAGE_NAME`: Replace `` with your Docker Hub username + - Example: `"johndoe/amrplusplus-workbench"` + +#### gcp.env +Update the following: +- `GCS_BUCKET`: Replace `` with your GCS bucket name (without `gs://` prefix) + - Example: `my-nextflow-data` +- `GOOGLE_ARTIFACT_REPO`: Replace `` with your artifact registry repository name + - Example: `nextflow-containers` +- `GCS_BUCKET_LOCATION`: Optionally change the region (default: `us-central1`) + +#### wb.env +Update the following: +- `GCS_BUCKET`: Replace `` with your Workbench GCS bucket resource ID + - Example: `nf-output` +- `GOOGLE_ARTIFACT_REPO`: Replace `` with your artifact registry repository name + - Example: `nextflow-containers` +- `GCS_BUCKET_LOCATION`: Optionally change the region (default: `us-central1`) + +**Note:** Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations. + +## Usage + +### Building Docker Images ```bash -git clone https://github.com/Microbial-Ecology-Group/AMRplusplus.git +# Build for local use +./scripts/build.sh + +# Build for GCP and push to registry +./scripts/build.sh --env gcp --push + +# Build for Workbench and push to registry +./scripts/build.sh --env wb --push ``` -Navigate into the AMR++ repository and run the test command. 
+### Running the Pipeline + ```bash -cd AMRplusplus +# Run locally with Docker +./scripts/run.sh -# Now we can use the included recipe to make the AMR++ environment -conda env create -f envs/AMR++_env.yaml -# This can take 5-10 mins (or more) depending on your internet speed, computing resources, etc. +# Run on Google Batch with local orchestration +./scripts/run.sh --env gcp -# Once it's completed, activate the environment -conda activate AMR++_env.yaml +# Run on Google Batch with Workbench orchestration +./scripts/run.sh --env wb +``` -# You now have access to all the AMR++ software dependencies (locally) -samtools --help +### Infrastructure Setup (Cloud Environments) -# Run command to perform the demonstration pipeline using the conda profile. -nextflow run main_AMR++.nf +For GCP or Workbench environments, set up the required infrastructure first: +```bash +# Set up infrastructure for GCP environment +./scripts/setup_infra.sh gcp +# Set up infrastructure for Workbench environment +./scripts/setup_infra.sh wb ``` -Now, you can check out the results in the newly created "test_results" directory. -# Using AMR++ to analyze your data +### Data Upload (Cloud Environments) + +Upload your data to GCS before running cloud pipelines: + +```bash +# Upload data for GCP environment +./scripts/upload_data.sh gcp + +# Upload data for Workbench environment +./scripts/upload_data.sh wb +``` + +## Environment Details + +### Local Environment +- **Description**: Run Nextflow locally using Docker containers +- **Requirements**: Docker daemon, Conda environment (AMR++_env) +- **Profile**: `docker` +- **Use case**: Testing, development, small datasets + +### GCP Environment +- **Description**: Run on Google Batch with local Nextflow orchestration +- **Requirements**: `gcloud` CLI configured, GCS bucket, Artifact Registry +- **Profile**: `google-batch` +- **Use case**: Large-scale processing with cloud resources, local monitoring -AMR++ is customizable to suit your computing needs and analyze your data. Primarily, the ```-profile``` paramater allows you to choose between running AMR++ using a singularity container, docker container, anaconda packages, or a local installation of your software. -All parameters used to control how AMR++ analyzes your data can also be changed as needed in a variety of ways. 
For full information, review this [configuration document.](docs/configuration.md) +### Workbench Environment +- **Description**: Run on Google Batch with Verily Workbench orchestration +- **Requirements**: Workbench workspace, `wb` CLI, Workbench-managed resources +- **Profile**: `workbench` +- **Use case**: Collaborative analysis in Verily Workbench environment +## Pipeline Options -Below is a brief example, the default parameters were run using this command (with the conda environment, AMR++_env, already activated): +### Available Pipeline Workflows -```nextflow run main_AMR++.nf``` +AMR++ can be customized using the `--pipeline` parameter: + +- **`demo`** (default): Simple demonstration on test data +- **`standard_AMR`**: QC trimming → Host DNA removal → Resistome alignment → Results +- **`fast_AMR`**: Same as standard but skips host removal for faster analysis +- **`standard_AMR_wKraken`**: Standard AMR + microbiome analysis with Kraken + +### Pipeline Subworkflows + +Run specific components independently: + +- **`eval_qc`**: Evaluate sample QC +- **`trim_qc`**: QC trimming using Trimmomatic +- **`rm_host`**: Remove host DNA contamination +- **`resistome`**: Align to MEGARes, perform rarefaction and resistome analysis +- **`kraken`**: Taxonomic classification +- **`bam_resistome`**: Run resistome analysis starting from BAM files + +### Optional Analysis Flags + +#### SNP Verification +Include SNP-confirmed resistance gene counts: -To change the reads that were analyzed, you should specify the ```--reads`` parameters. Here, we can use regular expressions to point to your samples in a different directory. ```bash -nextflow run main_AMR++.nf --reads "path/to/your/reads/*_R{1,2}.fastq.gz" +nextflow run main_AMR++.nf --pipeline standard_AMR --snp Y ``` -#### [Here's an extended tutorial to run each AMR++ component individually](docs/Step_by_step_tutorial.md) +Output: `SNPconfirmed_AMR_analytic_matrix.csv` (in addition to standard count matrix) +#### Deduplicated Counts +Include deduplicated count analysis: +```bash +nextflow run main_AMR++.nf --pipeline standard_AMR --snp Y --deduped Y +``` -# Optional flags +**Note**: This significantly increases runtime and storage requirements. -## SNP verification +### Example Commands -AMR++ now works in conjuction with a [custom SNP verification software](https://github.com/Isabella136/AmrPlusPlus_SNP) to evaluate alignments to gene accessions requiring SNP confirmation to confer resistance. To include this workflow, include the ```--snp Y``` flag in your command like this: +Run standard AMR++ workflow with all options: ```bash -nextflow run main_AMR++.nf -profile conda --snp Y +# Local execution +nextflow run main_AMR++.nf -profile docker --pipeline standard_AMR --reads "data/raw/*_R{1,2}.fastq.gz" --snp Y --deduped Y + +# GCP execution (after running ./scripts/setup_infra.sh gcp and ./scripts/upload_data.sh gcp) +./scripts/run.sh --env gcp + +# Workbench execution (after running ./scripts/setup_infra.sh wb and ./scripts/upload_data.sh wb) +./scripts/run.sh --env wb ``` -This will create with the standard count table (AMR_analytic_matrix.csv) in addition to a count matrix with SNP confirmed counts (SNPconfirmed_AMR_analytic_matrix.csv). -## Deduplicated counts +## AMR++ on Verily Workbench + +### Quick Demo in Workbench JupyterLab + +Create a new Workbench workspace and add this git repository in the **Apps** tab. -Another option is to include results for deduplicated counts by using the ```--deduped Y``` flag in your command. 
+Create a JupyterLab app instance, launch it, and open the terminal: ```bash -nextflow run main_AMR++.nf -profile conda --snp Y --deduped Y +# Initialize conda +conda init +source ~/.bashrc + +# Navigate to the repository +cd repos/AMRplusplus + +# Create and activate the conda environment +conda env create -f envs/AMR++_env.yaml +conda activate AMR++_env + +# Verify Nextflow version 24 is installed +nextflow -v + +# Run the test pipeline (takes ~5 minutes) +nextflow run main_AMR++.nf ``` -With this flag, AMR++ will extract the deduplicated alignments to MEGARes also output a count matrix with deduplicated counts. Since also we included the ```--snp Y``` flag, we will end up with 4 total output count matrices. +Expected output: **Succeeded: 24** with results in `~/repos/AMRplusplus/test_results` -# Choosing the right pipeline +### Production Workbench Deployment -AMR++ analyzes data by combining workflows that takes a set of sequencing reads through various bioinformatic software. We recommend our standard AMR++ pipeline as a comprehensive way to start from raw sequencing reads, QC assessment, host DNA removal, and resistome analysis with MEGARes. However, users might only want to replicate portions of the pipeline and have more control over their computing needs. Using the ```--pipeline``` parameter, users can now change how AMR++ runs. +For production use with Google Batch: +1. **Setup infrastructure**: + ```bash + ./scripts/setup_infra.sh wb + ``` +2. **Upload your data**: + ```bash + ./scripts/upload_data.sh wb + ``` -## Pipeline workflows -* omitting the ```--pipeline``` flag or using ```--pipeline demo``` - * Simple demonstration on test data +3. **Run the pipeline**: + ```bash + ./scripts/run.sh --env wb + ``` -* ```--pipeline standard_AMR``` - * Steps: QC trimming > Host DNA removal > Resistome alignment > Resistome results +Results will be stored in your Workbench GCS bucket. -* ```--pipeline fast_AMR``` - * This workflow simply skips host removal to speed up analysis. - * Steps: QC trimming > Resistome alignment > Resistome results +## Configuration Files -* ```--pipeline standard_AMR_wKraken``` - * This workflow adds microbiome analysis with kraken. It requires having a local kraken database. The minikraken_8GB_202003 will be downloaded automatically and requires ~8GB of space. Otherwise, you can specify the location to your own database with the flag, ```--kraken_db "/Path/to/KrakenDb/"``` - * Steps: - * QC trimming > Host DNA removal > Resistome alignment > Resistome results - * Non-host reads > Microbiome analysis +Environment-specific variables are stored in `scripts/config/*.env` files (created from `.env.template` files). These files contain: +- GCS bucket names and locations +- Docker image names and tags +- Service account configurations +- Nextflow profiles and configs -## Pipeline subworkflows -* ```--pipeline eval_qc``` - * Evaluate sample QC -* ```--pipeline trim_qc``` - * QC trimming using trimmomatic -* ```--pipeline rm_host``` - * Align reads to host DNA using bwa and remove contaminants -* ```--pipeline resistome``` - * Align reads to MEGARes using bwa, perform rarefaction and resistome analysis -* ```--pipeline kraken``` - * Classify reads taxonomically using kraken. -* ```--pipeline bam_resistome``` - * This will run the resistome pipeline starting with bam files from a previous alignment to MEGARes. - * Need to include ```--bam_files "Path/to/BAM/*.bam"``` in the command line. 
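As a concrete illustration of the `bam_resistome` subworkflow above — a sketch only; the BAM glob is hypothetical and should point at alignments produced by a previous run against MEGARes:

```bash
# Re-run only the resistome analysis, starting from existing MEGARes BAM files
nextflow run main_AMR++.nf -profile docker --pipeline bam_resistome \
    --bam_files "path/to/previous/megares_alignments/*.bam" \
    --snp Y
```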
+**Security Note**: The `.env` files are gitignored to prevent committing sensitive information. Only template files are tracked in version control. -## Example command -In the following example, we'll choose to run the standard AMR++ workflow, which includes QC trimming, host removal, and Resistome analysis. Since we included the ```--snp Y --deduped Y``` flags, we'll also get ouput for deduped counts and SNP confirmed counts. +## Additional Resources -Alternatively, you can modify all of these variables and more in the "params.config" file which will be loaded automatically. Just make sure to include the "-profile" and "--pipeline" flags. More information [in this document](docs/configuration.md) +**Original AMR++ Repository**: [https://github.com/Microbial-Ecology-Group/AMRplusplus](https://github.com/Microbial-Ecology-Group/AMRplusplus) -```bash -# Remember to update the --reads flag to match your read location -nextflow run main_AMR++.nf -profile conda --pipeline standard_AMR --reads "path/to/your/reads/*_R{1,2}.fastq.gz" --snp Y --deduped Y -``` +For more detailed information about the AMR++ pipeline: + +- [Installation](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/installation.md) +- [Usage](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/usage.md) +- [Configuration](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/configuration.md) +- [Output](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/output.md) +- [Dependencies](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/dependencies.md) +- [FAQs](https://github.com/Microbial-Ecology-Group/AMRplusplus/blob/master/docs/FAQs.md) + +## License + +MIT License diff --git a/nextflow/amr-on-wb/envs/AMR++_env.yaml b/nextflow/amr-on-wb/envs/AMR++_env.yaml index 68cb521..7be5f92 100755 --- a/nextflow/amr-on-wb/envs/AMR++_env.yaml +++ b/nextflow/amr-on-wb/envs/AMR++_env.yaml @@ -16,5 +16,5 @@ dependencies: - pandas - trimmomatic - biopython - - nextflow + - nextflow = 24 - git diff --git a/nextflow/amr-on-wb/envs/containers/Dockerfile b/nextflow/amr-on-wb/envs/containers/Dockerfile index 12b2a90..d114e3a 100755 --- a/nextflow/amr-on-wb/envs/containers/Dockerfile +++ b/nextflow/amr-on-wb/envs/containers/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update -q && \ subversion \ wget \ g++ \ + make \ libarchive13 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -59,5 +60,14 @@ RUN set -x && \ find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ /opt/conda/bin/conda clean -afy && \ /opt/conda/bin/conda install -c conda-forge mamba && \ - /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow && \ + /opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow=24 && \ conda clean --all + +# Copy AMR++ bin scripts into the container +COPY bin /opt/amrplusplus/bin +RUN chmod +x /opt/amrplusplus/bin/*.py && \ + chmod +x /opt/amrplusplus/bin/rarefaction && \ + chmod +x /opt/amrplusplus/bin/resistome + +# Add bin directory to PATH +ENV PATH="/opt/amrplusplus/bin:${PATH}" diff --git a/nextflow/amr-on-wb/modules/Alignment/bwa.nf b/nextflow/amr-on-wb/modules/Alignment/bwa.nf index a716ae4..04827be 100755 --- a/nextflow/amr-on-wb/modules/Alignment/bwa.nf 
+++ b/nextflow/amr-on-wb/modules/Alignment/bwa.nf @@ -53,8 +53,8 @@ process bwa_align { } input: - path indexfiles - tuple val(pair_id), path(reads) + path indexfiles + tuple val(pair_id), path(reads) output: tuple val(pair_id), path("${pair_id}_alignment_sorted.bam"), emit: bwa_bam @@ -93,8 +93,8 @@ process bwa_rm_contaminant_fq { label "alignment" errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - maxRetries 3 - + maxRetries 3 + publishDir "${params.output}/HostRemoval", mode: "copy", saveAs: { filename -> if(filename.indexOf("fastq.gz") > 0) "NonHostFastq/$filename" @@ -103,12 +103,12 @@ process bwa_rm_contaminant_fq { input: path indexfiles - tuple val(pair_id), path(reads) + tuple val(pair_id), path(reads) output: tuple val(pair_id), path("${pair_id}.non.host.R*.fastq.gz"), emit: nonhost_reads path("${pair_id}.samtools.idxstats"), emit: host_rm_stats - + """ ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} > ${pair_id}.host.sam ${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${threads} -o ${pair_id}.host.sorted.bam @@ -147,6 +147,6 @@ process HostRemovalStats { path("host.removal.stats"), emit: combo_host_rm_stats """ - ${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats + ${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats """ } diff --git a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf index 29c807d..9da3a05 100755 --- a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf +++ b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf @@ -5,7 +5,7 @@ threads = params.threads kraken_confidence = params.kraken_confidence process dlkraken { - tag { "downloading kraken db"} + tag { } label "python" errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } @@ -14,11 +14,11 @@ process dlkraken { publishDir "$baseDir/data/kraken_db/", mode: 'copy' output: - path("minikraken_8GB_20200312/"), emit:kraken_db + path("minikraken_8GB_20200312/") """ - wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz - tar -xvzf minikraken_8GB_202003.tgz + wget ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz + tar -xvzf minikraken_8GB_202003.tgz """ } @@ -73,7 +73,7 @@ process krakenresults { path("unclassifieds_kraken_analytic_matrix.conf_${kraken_confidence}.csv") """ - ${PYTHON3} $baseDir/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.conf_${kraken_confidence}.csv + ${PYTHON3} /opt/amrplusplus/bin/kraken2_long_to_wide.py -i ${kraken_reports} -o kraken_analytic_matrix.conf_${kraken_confidence}.csv """ } @@ -103,3 +103,37 @@ process runbracken { """ } +process kronadb { + label "microbiome" + output: + file("krona_db/taxonomy.tab") optional true into krona_db_ch // is this a value ch? 
+ + when: + !params.skip_krona + + script: + """ + ktUpdateTaxonomy.sh krona_db + """ +} + +process kronafromkraken { + publishDir params.outdir, mode: 'copy' + label "microbiome" + input: + file(x) from kraken2krona_ch.collect() + //file(y) from kaiju2krona_ch.collect() + file("krona_db/taxonomy.tab") from krona_db_ch + + output: + file("*_taxonomy_krona.html") + + when: + !params.skip_krona + + script: + """ + mkdir -p krona + ktImportTaxonomy -o kraken2_taxonomy_krona.html -tax krona_db $x + """ +} diff --git a/nextflow/amr-on-wb/modules/Resistome/resistome.nf b/nextflow/amr-on-wb/modules/Resistome/resistome.nf index 78586ad..87f1f6b 100755 --- a/nextflow/amr-on-wb/modules/Resistome/resistome.nf +++ b/nextflow/amr-on-wb/modules/Resistome/resistome.nf @@ -29,28 +29,24 @@ process build_dependencies { output: path("rarefaction"), emit: rarefactionanalyzer path("resistome"), emit: resistomeanalyzer - path("AmrPlusPlus_SNP/*"), emit: amrsnp + path("AmrPlusPlus_SNP"), emit: amrsnp """ - # Uncomment these sections once the v2 rarefactionanalyzer and resistomeanalyzer repositories are updated, remove cp lines - #git clone https://github.com/cdeanj/rarefactionanalyzer.git - #cd rarefactionanalyzer - #make - #chmod 777 rarefaction - #mv rarefaction ../ - #cd ../ - #rm -rf rarefactionanalyzer - cp $baseDir/bin/rarefaction . - - - #git clone https://github.com/cdeanj/resistomeanalyzer.git - #cd resistomeanalyzer - #make - #chmod 777 resistome - #mv resistome ../ - #cd ../ - #rm -rf resistomeanalyzer - cp $baseDir/bin/resistome . + git clone https://github.com/cdeanj/rarefactionanalyzer.git + cd rarefactionanalyzer + make + chmod 777 rarefaction + mv rarefaction ../ + cd ../ + rm -rf rarefactionanalyzer + + git clone https://github.com/cdeanj/resistomeanalyzer.git + cd resistomeanalyzer + make + chmod 777 resistome + mv resistome ../ + cd ../ + rm -rf resistomeanalyzer git clone https://github.com/Isabella136/AmrPlusPlus_SNP.git chmod -R 777 AmrPlusPlus_SNP/ @@ -119,7 +115,7 @@ process resistomeresults { path("${prefix}_analytic_matrix.csv"), emit: snp_count_matrix, optional: true """ - ${PYTHON3} $baseDir/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv + ${PYTHON3} /opt/amrplusplus/bin/amr_long_to_wide.py -i ${resistomes} -o ${prefix}_analytic_matrix.csv """ } @@ -189,7 +185,7 @@ process plotrarefaction { """ mkdir -p data/ mv *.tsv data/ - python $baseDir/bin/rfplot.py --dir ./data --nd --s --sd . + ${PYTHON3} /opt/amrplusplus/bin/rfplot.py --dir ./data --nd --s --sd . """ } @@ -198,6 +194,8 @@ process runsnp { tag {sample_id} label "python" + errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries 3 publishDir "${params.output}/ResistomeAnalysis/SNP_verification", mode: "copy", saveAs: { filename -> @@ -212,6 +210,7 @@ process runsnp { input: tuple val(sample_id), path(bam) path(snp_count_matrix) + path(amrsnp) output: path("${sample_id}.SNP_confirmed_gene.tsv"), emit: snp_counts @@ -219,7 +218,14 @@ process runsnp { path("${sample_id}_${prefix}_SNPresistant_reads.txt") """ - cp -r $baseDir/bin/AmrPlusPlus_SNP/* . + # Copy AmrPlusPlus_SNP files from staged input to current directory for Python imports + if [ -d AmrPlusPlus_SNP ]; then + cp -r AmrPlusPlus_SNP/* . 
+ else + # Files might be staged directly without subdirectory + # In this case they're already in the work dir + : + fi # change name to stay consistent with count matrix name, but only if the names don't match if [ "${bam}" != "${sample_id}.bam" ]; then @@ -228,12 +234,13 @@ process runsnp { python3 SNP_Verification.py -c config.ini -t ${threads} -a true -i ${sample_id}.bam -o ${sample_id}.${prefix}_SNPs --count_matrix ${snp_count_matrix} --detailed_output = True - python3 $baseDir/bin/extract_snp_column.py \ - --sample-id "${sample_id}" \ - --matrix "${sample_id}.${prefix}_SNPs${snp_count_matrix}" \ - --out-tsv "${sample_id}.SNP_confirmed_gene.tsv" + cut -d ',' -f `awk -v RS=',' "/${sample_id}/{print NR; exit}" ${sample_id}.${prefix}_SNPs${snp_count_matrix}` ${sample_id}.${prefix}_SNPs${snp_count_matrix} > ${sample_id}.${prefix}_SNP_count_col + + cut -d ',' -f 1 ${sample_id}.${prefix}_SNPs${snp_count_matrix} > gene_accession_labels - mv */resistant_reads.csv ${sample_id}_${prefix}_SNPresistant_reads.txt + paste gene_accession_labels ${sample_id}.${prefix}_SNP_count_col > ${sample_id}.SNP_confirmed_gene.tsv + + mv ${sample_id}.${prefix}_SNPs${sample_id}/resistant_reads.csv ${sample_id}_${prefix}_SNPresistant_reads.txt """ } @@ -257,8 +264,6 @@ process snpresults { path("*_analytic_matrix.csv"), emit: snp_matrix """ - - ${PYTHON3} $baseDir/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv - + ${PYTHON3} /opt/amrplusplus/bin/snp_long_to_wide.py -i ${snp_counts} -o SNPconfirmed_${prefix}_analytic_matrix.csv """ } diff --git a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf index 96371fd..1c07487 100755 --- a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf +++ b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf @@ -31,7 +31,8 @@ process runqc { } input: - tuple val(sample_id), path(reads) + tuple val(sample_id), path(reads) + path adapters_file output: tuple val(sample_id), path("${sample_id}*P.fastq.gz"), emit: paired_fastq @@ -43,13 +44,13 @@ process runqc { PE \ -threads ${threads} \ ${reads[0]} ${reads[1]} ${sample_id}.1P.fastq.gz ${sample_id}.1U.fastq.gz ${sample_id}.2P.fastq.gz ${sample_id}.2U.fastq.gz \ - ILLUMINACLIP:${adapters}:2:30:10:3:TRUE \ + ILLUMINACLIP:${adapters_file}:2:30:10:3:TRUE \ LEADING:${leading} \ TRAILING:${trailing} \ SLIDINGWINDOW:${slidingwindow} \ MINLEN:${minlen} \ 2> ${sample_id}.trimmomatic.stats.log - + """ } @@ -73,6 +74,6 @@ process QCstats { path("trimmomatic.stats"), emit: combo_trim_stats """ - ${PYTHON3} $baseDir/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats + ${PYTHON3} /opt/amrplusplus/bin/trimmomatic_stats.py -i ${stats} -o trimmomatic.stats """ } \ No newline at end of file diff --git a/nextflow/amr-on-wb/nextflow.config b/nextflow/amr-on-wb/nextflow.config index 086d748..ef38db2 100755 --- a/nextflow/amr-on-wb/nextflow.config +++ b/nextflow/amr-on-wb/nextflow.config @@ -47,7 +47,7 @@ profiles { docker { includeConfig "config/local.config" docker.enabled = true - process.container = 'enriquedoster/amrplusplus:latest' + process.container = "${IMAGE_NAME}:${IMAGE_TAG}" } singularity { includeConfig "config/singularity.config" @@ -74,4 +74,50 @@ profiles { singularity.autoMounts = true singularity.cacheDir = "$baseDir/envs/" } + 'google-batch' { + process.executor = 'google-batch' + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.errorStrategy = { 
task.exitStatus==50001 ? 'retry' : 'terminate' } + process.maxRetries = 5 + process.machineType = 'n2-standard-4' // 16 Gbps network for faster image pulls + + workDir = "gs://${GCS_BUCKET}/scratch" + + google.region = "${GCS_BUCKET_LOCATION}" + google.project = "$GOOGLE_CLOUD_PROJECT" + google.batch.serviceAccountEmail = "$GOOGLE_SERVICE_ACCOUNT_EMAIL" + google.batch.bootDiskSize = '50 GB' + google.batch.usePrivateAddress = true + google.batch.network = "projects/${GOOGLE_CLOUD_PROJECT}/global/networks/default" + google.batch.subnetwork = "projects/${GOOGLE_CLOUD_PROJECT}/regions/${GCS_BUCKET_LOCATION}/subnetworks/default" + google.httpConnectTimeout = '10m' + google.httpReadTimeout = '10m' + google.batch.maxSpotAttempts = 5 + + executor.pollInterval = '30s' + executor.queueStatInterval = '2m' + } + workbench { + process.executor = 'google-batch' + process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" + process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } + process.maxRetries = 5 + process.machineType = 'n2-standard-4' // 16 Gbps network for faster image pulls + + workDir = "gs://${GCS_BUCKET}/scratch" + + google.region = "${GCS_BUCKET_LOCATION}" + google.project = "$GOOGLE_CLOUD_PROJECT" + google.batch.serviceAccountEmail = "$GOOGLE_SERVICE_ACCOUNT_EMAIL" + google.batch.bootDiskSize = '50 GB' + google.batch.usePrivateAddress = true + google.batch.network = "projects/${GOOGLE_CLOUD_PROJECT}/global/networks/network" + google.batch.subnetwork = "projects/${GOOGLE_CLOUD_PROJECT}/regions/${GCS_BUCKET_LOCATION}/subnetworks/subnetwork" + google.httpConnectTimeout = '10m' + google.httpReadTimeout = '10m' + google.batch.maxSpotAttempts = 5 + + executor.pollInterval = '30s' + executor.queueStatInterval = '2m' + } } diff --git a/nextflow/amr-on-wb/params.config b/nextflow/amr-on-wb/params.config index 86190e3..6e6d308 100755 --- a/nextflow/amr-on-wb/params.config +++ b/nextflow/amr-on-wb/params.config @@ -44,7 +44,7 @@ params { snp = "Y" /* Add deduplicaation analysis */ - deduped = "Y" + deduped = "N" prefix = "AMR" /* Number of threads */ @@ -57,7 +57,6 @@ params { trailing = 3 slidingwindow = "4:15" minlen = 36 - crop_len = 150 /* Resistome threshold */ threshold = 80 diff --git a/nextflow/amr-on-wb/params_google_batch.config b/nextflow/amr-on-wb/params_google_batch.config new file mode 100644 index 0000000..019a50e --- /dev/null +++ b/nextflow/amr-on-wb/params_google_batch.config @@ -0,0 +1,127 @@ +/* * Google Batch Configuration File + * This file sets all parameters for running on Google Cloud Batch. + * It uses the 'GCS_BUCKET' environment variable to define paths. + */ + +// Grabs the GCS_BUCKET environment variable. Defaults to 'nf-files' if not set. 
+def gcs_bucket = System.getenv("GCS_BUCKET") ?: "nf-files" + +params { + /* Display help message */ + help = false + + // ----------------------------------------------------------------- + // Input Data + // ----------------------------------------------------------------- + /* Location of forward and reverse read pairs */ + reads = "gs://${gcs_bucket}/data/raw/*_R{1,2}.fastq.gz" + + /* Optional input for bam files for use with "--pipeline bam_resistome" */ + bam_files = null + + // ----------------------------------------------------------------- + // Reference Databases + // ----------------------------------------------------------------- + /* Location of reference/host genome */ + host = "gs://${gcs_bucket}/data/host/chr21.fasta.gz" + + /* Host index files created with bwa */ + host_index = "gs://${gcs_bucket}/data/host/chr21.fasta.gz*" + + /* Location of amr index files with wildcard - set to null to build automatically */ + amr_index = null + + /* Location of antimicrobial resistance (MEGARes) database */ + amr = "gs://${gcs_bucket}/data/amr/megares_database_v3.00.fasta" + + /* Location of amr annotation file */ + annotation = "gs://${gcs_bucket}/data/amr/megares_annotations_v3.00.csv" + + /* Kraken database location */ + kraken_db = null + + /* Kraken confidence score */ + kraken_confidence = 0.0 + + // ----------------------------------------------------------------- + // Output + // ----------------------------------------------------------------- + /* Output directory - Saved to Cloud Storage */ + output = "gs://${gcs_bucket}/results/test_results" + + // ----------------------------------------------------------------- + // Pipeline Logic & Analysis Toggles + // ----------------------------------------------------------------- + /* Add SNP analysis */ + snp = "Y" + + /* Add deduplicaation analysis */ + deduped = "N" + prefix = "AMR" + + /* Number of threads */ + threads = 4 + + // ----------------------------------------------------------------- + // Trimming Parameters (Trimmomatic) + // ----------------------------------------------------------------- + /* Adapters file location */ + adapters = "gs://${gcs_bucket}/data/adapters/nextera.fa" + + leading = 3 + trailing = 3 + slidingwindow = "4:15" + minlen = 36 + + // ----------------------------------------------------------------- + // Resistome Analysis Parameters + // ----------------------------------------------------------------- + /* Resistome threshold */ + threshold = 80 + + /* Starting rarefaction level */ + min = 5 + + /* Ending rarefaction level */ + max = 100 + + /* Number of levels to skip */ + skip = 5 + + /* Number of iterations to sample at */ + samples = 1 + + // ----------------------------------------------------------------- + // Other Tools + // ----------------------------------------------------------------- + /* multiQC config folder */ + multiqc = "gs://${gcs_bucket}/data/multiqc" + + /* Qiime2 Parameters */ + p_trim_left_f = 25 + p_trim_left_r = 26 + p_trunc_len_f = 225 + p_trunc_len_r = 220 + + /* qiime2 bayes classifier */ + dada2_db = "gs://${gcs_bucket}/data/qiime/gg-13-8-99-515-806-nb-classifier.qza" +} + +// ----------------------------------------------------------------- +// Environment Variables / Binary Paths +// ----------------------------------------------------------------- +env { + /* These following tools are required to run AmrPlusPlus*/ + JAVA = "java" + TRIMMOMATIC = "trimmomatic" + PYTHON3 = "python3" + BWA = "bwa" + SAMTOOLS = "samtools" + BEDTOOLS = "bedtools" + RESISTOME = 
"resistome" + RAREFACTION = "rarefaction" + SNPFINDER = "snpfinder" + /* These next tools are optional depending on which analyses you want to run */ + KRAKEN2 = "kraken2" + QIIME = "qiime" +} diff --git a/nextflow/amr-on-wb/scripts/build.sh b/nextflow/amr-on-wb/scripts/build.sh new file mode 100755 index 0000000..19dc25a --- /dev/null +++ b/nextflow/amr-on-wb/scripts/build.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -o errexit +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default values +ENV="local" +PUSH=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --env) + ENV="$2" + shift 2 + ;; + --push) + PUSH=true + shift + ;; + -h|--help) + echo "Usage: $0 [--env local|gcp|wb] [--push]" + echo "" + echo "Options:" + echo " --env Environment to build for (local, gcp, or wb). Default: local" + echo " --push Push the image to the registry after building (only for gcp/wb)" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Build for local environment" + echo " $0 --env gcp --push # Build for GCP and push to registry" + echo " $0 --env wb --push # Build for Workbench and push to registry" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Validate environment +if [[ ! "$ENV" =~ ^(local|gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'local', 'gcp', or 'wb'" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Handle local environment (no push) +if [[ "$ENV" == "local" ]]; then + if [[ "$PUSH" == "true" ]]; then + echo "Warning: --push flag is not applicable for local environment, ignoring" + fi + + echo "Building Docker image for local use: ${IMAGE_NAME}:${IMAGE_TAG}" + docker build --platform linux/amd64 -f envs/containers/Dockerfile -t "${IMAGE_NAME}:${IMAGE_TAG}" . + + echo "" + echo "Image built successfully!" + echo "To run locally: scripts/run.sh --env local" + exit 0 +fi + +# Handle GCP/Workbench environments +echo "Building Docker image: ${REGISTRY_PATH}" +docker build --platform linux/amd64 -f envs/containers/Dockerfile -t "${REGISTRY_PATH}" . + +if [[ "${PUSH}" == "true" ]]; then + echo "" + echo "Configuring Docker authentication for Google Artifact Registry..." + gcloud auth configure-docker us-central1-docker.pkg.dev --quiet + + echo "Pushing image to registry..." + docker push "${REGISTRY_PATH}" + + echo "" + echo "Image successfully pushed to: ${REGISTRY_PATH}" +else + echo "" + echo "Image built successfully!" 
+ echo "To push to registry, run: $0 --env ${ENV} --push" + echo "Or manually push with: docker push ${REGISTRY_PATH}" +fi diff --git a/nextflow/amr-on-wb/scripts/config/gcp.env.template b/nextflow/amr-on-wb/scripts/config/gcp.env.template new file mode 100644 index 0000000..4f0e8b7 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/gcp.env.template @@ -0,0 +1,41 @@ +# GCP environment configuration +# This is used when running Nextflow on Google Batch with local orchestration +# Nextflow runs on your local machine, jobs execute on Google Batch + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# GCS bucket for storing pipeline data and results +# Replace with your GCS bucket name (without gs:// prefix) +# Example: "my-nextflow-data" +export GCS_BUCKET= + +# GCS bucket location/region +# Common values: us-central1, us-east1, europe-west1 +export GCS_BUCKET_LOCATION=us-central1 + +# Google Artifact Registry repository name +# Replace with your artifact registry repository name +# Example: "nextflow-containers" +export GOOGLE_ARTIFACT_REPO= + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +# Google Cloud project (auto-detected from gcloud CLI) +export GOOGLE_CLOUD_PROJECT=$(gcloud config get project) + +# Service account configuration (auto-generated) +export GOOGLE_SERVICE_ACCOUNT_NAME=nextflow-runner +export GOOGLE_SERVICE_ACCOUNT_EMAIL="${GOOGLE_SERVICE_ACCOUNT_NAME}@${GOOGLE_CLOUD_PROJECT}.iam.gserviceaccount.com" + +# Docker image configuration (auto-generated paths) +export IMAGE_NAME="amrplusplus-workbench" +export IMAGE_TAG="latest" +export REGISTRY_PATH="us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + +# Nextflow profile and config +export NEXTFLOW_PROFILE="google-batch" +export NEXTFLOW_CONFIG="params_google_batch.config" diff --git a/nextflow/amr-on-wb/scripts/config/local.env.template b/nextflow/amr-on-wb/scripts/config/local.env.template new file mode 100644 index 0000000..4b80462 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/local.env.template @@ -0,0 +1,18 @@ +# Local environment configuration +# This is used when running Nextflow locally with Docker + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# Docker image configuration +# Replace with your Docker Hub username +# Example: "johndoe/amrplusplus-workbench" +export IMAGE_NAME="/amrplusplus-workbench" + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +export IMAGE_TAG="latest" +export NEXTFLOW_PROFILE="docker" diff --git a/nextflow/amr-on-wb/scripts/config/wb.env.template b/nextflow/amr-on-wb/scripts/config/wb.env.template new file mode 100644 index 0000000..963965c --- /dev/null +++ b/nextflow/amr-on-wb/scripts/config/wb.env.template @@ -0,0 +1,43 @@ +# Workbench environment configuration +# This is used when running Nextflow on Google Batch with Workbench orchestration +# Both Nextflow orchestration and job execution happen in 
Workbench/Google Cloud + +############################################################################### +# USER CONFIGURATION - UPDATE THESE VALUES +############################################################################### + +# GCS bucket resource ID (created via Workbench) +# Replace with your Workbench GCS bucket resource ID +# Example: "nf-output" or "my-pipeline-data" +# Note: Use the resource ID, not the full GCS bucket name +export GCS_BUCKET= + +# GCS bucket location/region +# Common values: us-central1, us-east1, europe-west1 +export GCS_BUCKET_LOCATION=us-central1 + +# Google Artifact Registry repository name +# Replace with your artifact registry repository name +# Example: "nextflow-containers" +export GOOGLE_ARTIFACT_REPO= + +############################################################################### +# AUTOMATIC CONFIGURATION - DO NOT MODIFY +############################################################################### + +# Google Cloud project (auto-detected from Workbench workspace) +export WORKBENCH_GOOGLE_CLOUD_PROJECT=$(wb status 2>/dev/null | grep "Google project" | awk -F': ' '{print $2}' | xargs) +export GOOGLE_CLOUD_PROJECT="${WORKBENCH_GOOGLE_CLOUD_PROJECT}" + +# Service account configuration (Workbench Pet Service Account - auto-detected) +export GOOGLE_SERVICE_ACCOUNT_EMAIL=$(wb auth status 2>&1 | grep "Service account email" | awk -F': ' '{print $2}' | xargs) +export GOOGLE_SERVICE_ACCOUNT_NAME=$(echo "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" | cut -d'@' -f1) + +# Docker image configuration (auto-generated paths) +export IMAGE_NAME="amrplusplus-workbench" +export IMAGE_TAG="latest" +export REGISTRY_PATH="us-central1-docker.pkg.dev/${WORKBENCH_GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/${IMAGE_NAME}:${IMAGE_TAG}" + +# Nextflow profile and config +export NEXTFLOW_PROFILE="workbench" +export NEXTFLOW_CONFIG="params_google_batch.config" diff --git a/nextflow/amr-on-wb/scripts/run.sh b/nextflow/amr-on-wb/scripts/run.sh new file mode 100755 index 0000000..5d27430 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/run.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +set -o errexit +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default values +ENV="local" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --env) + ENV="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [--env local|gcp|wb]" + echo "" + echo "Options:" + echo " --env Environment to run in (local, gcp, or wb). Default: local" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Run locally with Docker" + echo " $0 --env gcp # Run on Google Batch with local orchestration" + echo " $0 --env wb # Run on Google Batch with Workbench orchestration" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Validate environment +if [[ ! "$ENV" =~ ^(local|gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'local', 'gcp', or 'wb'" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Navigate to repository root +cd "${SCRIPT_DIR}/.." + +# Handle local environment +if [[ "$ENV" == "local" ]]; then + echo "Running Nextflow locally with Docker profile..." 
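+    # Note: the 'docker' profile in nextflow.config builds process.container from the
+    # IMAGE_NAME and IMAGE_TAG values sourced above from scripts/config/local.env, so
+    # the image should already have been built with scripts/build.sh.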
+ + # Activate conda environment if available + if command -v conda &> /dev/null; then + eval "$(conda shell.bash hook)" + if conda env list | grep -q "AMR++_env"; then + conda activate AMR++_env + else + echo "Warning: AMR++_env conda environment not found" + echo "Please create it with: conda env create -f envs/AMR++_env.yaml" + exit 1 + fi + else + echo "Error: conda not found. Please install conda and create the AMR++_env environment" + exit 1 + fi + + nextflow run main_AMR++.nf -profile "${NEXTFLOW_PROFILE}" + exit 0 +fi + +# Handle GCP/Workbench environments +echo "Running Nextflow on Google Batch..." +echo "Profile: ${NEXTFLOW_PROFILE}" +echo "Config: ${NEXTFLOW_CONFIG}" +echo "" + +# Activate conda environment if available +if command -v conda &> /dev/null; then + eval "$(conda shell.bash hook)" + if conda env list | grep -q "AMR++_env"; then + conda activate AMR++_env + else + echo "Warning: AMR++_env conda environment not found, continuing without it" + fi +fi + +# Run nextflow with Google Batch profile +nextflow run main_AMR++.nf -profile "${NEXTFLOW_PROFILE}" -c "${NEXTFLOW_CONFIG}" diff --git a/nextflow/amr-on-wb/scripts/setup_infra.sh b/nextflow/amr-on-wb/scripts/setup_infra.sh new file mode 100755 index 0000000..e606da7 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/setup_infra.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default to GCP environment for infrastructure setup +ENV="${1:-gcp}" + +# Validate environment +if [[ ! "$ENV" =~ ^(gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'gcp' or 'wb'" + echo "Usage: $0 [gcp|wb]" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! -f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +if [[ -f ".env" ]]; then + echo "Loading configuration from .env file..." + export $(grep -v '^#' .env | xargs) +fi + +if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" 2>/dev/null | grep -q .; then + 1>&2 echo "ERROR: Not authenticated with gcloud" + 1>&2 echo "Please run: gcloud auth login" + exit 1 +fi + +if [[ -z "${GOOGLE_CLOUD_PROJECT}" ]]; then + 1>&2 echo "ERROR: No default project set in gcloud config" + 1>&2 echo "Please run: gcloud config set project GOOGLE_CLOUD_PROJECT" + exit 1 +fi + +echo "Using project: ${GOOGLE_CLOUD_PROJECT}" + +echo "Configuration:" +echo " Bucket: gs://${GCS_BUCKET}" +echo " Service Account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" +echo " Artifact Repository: ${GOOGLE_ARTIFACT_REPO}" +echo "" + +if [[ "$ENV" == "wb" ]]; then + echo "Skipping API enablement (managed by Workbench)" +else + echo "Enabling required APIs..." + gcloud services enable iamcredentials.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable artifactregistry.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable batch.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet + gcloud services enable compute.googleapis.com --project="${GOOGLE_CLOUD_PROJECT}" --quiet +fi + +if [[ "$ENV" == "wb" ]]; then + echo "Skipping VPC and NAT setup (managed by Workbench)" +else + echo "Creating VPC network if it doesn't exist..." + if gcloud compute networks describe default --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Default VPC network already exists" + else + echo "Creating default VPC network..." 
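+        # The 'google-batch' profile in nextflow.config points Batch jobs at this
+        # default network/subnetwork with private addresses only
+        # (google.batch.usePrivateAddress = true), so an auto-mode default VPC plus
+        # the Cloud NAT configured below needs to exist.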
+ gcloud compute networks create default \ + --subnet-mode=auto \ + --project="${GOOGLE_CLOUD_PROJECT}" + + echo "Creating firewall rules for default network..." + gcloud compute firewall-rules create default-allow-internal \ + --network=default \ + --allow=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=10.128.0.0/9 \ + --project="${GOOGLE_CLOUD_PROJECT}" + + gcloud compute firewall-rules create default-allow-ssh \ + --network=default \ + --allow=tcp:22 \ + --source-ranges=0.0.0.0/0 \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + echo "Creating Cloud Router and NAT for private IP access..." + if gcloud compute routers describe nat-router --region="${GCS_BUCKET_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Cloud Router already exists" + else + echo "Creating Cloud Router..." + gcloud compute routers create nat-router \ + --network=default \ + --region="${GCS_BUCKET_LOCATION}" \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + if gcloud compute routers nats describe nat-config --router=nat-router --region="${GCS_BUCKET_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Cloud NAT already exists" + else + echo "Creating Cloud NAT..." + gcloud compute routers nats create nat-config \ + --router=nat-router \ + --region="${GCS_BUCKET_LOCATION}" \ + --nat-all-subnet-ip-ranges \ + --auto-allocate-nat-external-ips \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi +fi + +if [[ "$ENV" == "wb" ]]; then + # For Workbench, use wb resource create + if wb resource describe --id="${GCS_BUCKET}" &>/dev/null; then + echo "Workbench GCS bucket resource already exists: ${GCS_BUCKET}" + else + echo "Creating GCS bucket via Workbench: ${GCS_BUCKET}" + wb resource create gcs-bucket --id="${GCS_BUCKET}" + fi +else + # For GCP, use gcloud storage + if gcloud storage buckets describe "gs://${GCS_BUCKET}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Bucket already exists: gs://${GCS_BUCKET}" + else + echo "Creating GCS bucket: gs://${GCS_BUCKET}" + gcloud storage buckets create "gs://${GCS_BUCKET}" \ + --project="${GOOGLE_CLOUD_PROJECT}" \ + --location="${GCS_BUCKET_LOCATION}" \ + --uniform-bucket-level-access \ + --lifecycle-file="lifecycle.json" + fi +fi + +# For Workbench environment, skip service account creation and IAM bindings +# The Pet SA is managed by Workbench and already has necessary permissions +if [[ "$ENV" == "wb" ]]; then + echo "Using Workbench Pet Service Account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + echo "Skipping IAM policy bindings (managed by Workbench)" +else + # For GCP environment, create service account and grant permissions + if gcloud iam service-accounts describe "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Service account already exists: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + else + echo "Creating service account: ${GOOGLE_SERVICE_ACCOUNT_EMAIL}" + gcloud iam service-accounts create "$GOOGLE_SERVICE_ACCOUNT_NAME" \ + --display-name="GCS Uploader and Signer" \ + --project="${GOOGLE_CLOUD_PROJECT}" + fi + + echo "Granting storage.objectAdmin role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/storage.objectAdmin" \ + --condition=None \ + --quiet + + echo "Granting iam.serviceAccountTokenCreator role to service account..." 
+ gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/iam.serviceAccountTokenCreator" \ + --condition=None \ + --quiet + + echo "Granting batch.agentReporter role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/batch.agentReporter" \ + --condition=None \ + --quiet + + echo "Granting logging.logWriter role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/logging.logWriter" \ + --condition=None \ + --quiet + + echo "Granting artifactregistry.reader role to service account..." + gcloud projects add-iam-policy-binding "${GOOGLE_CLOUD_PROJECT}" \ + --member="serviceAccount:${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --role="roles/artifactregistry.reader" \ + --condition=None \ + --quiet + + USER_EMAIL="$(gcloud config get-value account 2>/dev/null)" + readonly USER_EMAIL + + if [[ -z "${USER_EMAIL}" ]]; then + 1>&2 echo "ERROR: Could not determine current user email" + exit 1 + fi + + echo "Granting ${USER_EMAIL} permission to impersonate service account..." + gcloud iam service-accounts add-iam-policy-binding "${GOOGLE_SERVICE_ACCOUNT_EMAIL}" \ + --member="user:${USER_EMAIL}" \ + --role="roles/iam.serviceAccountTokenCreator" \ + --project="${GOOGLE_CLOUD_PROJECT}" \ + --quiet +fi + +ARTIFACT_LOCATION="${GCS_BUCKET_LOCATION:-us-central1}" + +if gcloud artifacts repositories describe "${GOOGLE_ARTIFACT_REPO}" --location="${ARTIFACT_LOCATION}" --project="${GOOGLE_CLOUD_PROJECT}" &>/dev/null; then + echo "Artifact repository already exists: ${GOOGLE_ARTIFACT_REPO}" +else + echo "Creating artifact repository: ${GOOGLE_ARTIFACT_REPO}" + echo "Project: ${GOOGLE_CLOUD_PROJECT}" + echo "Location: ${ARTIFACT_LOCATION}" + gcloud artifacts repositories create "${GOOGLE_ARTIFACT_REPO}" \ + --repository-format=docker \ + --location="${ARTIFACT_LOCATION}" \ + --project="${GOOGLE_CLOUD_PROJECT}" +fi + +echo "" +echo "Setup complete!" +echo "" diff --git a/nextflow/amr-on-wb/scripts/upload_data.sh b/nextflow/amr-on-wb/scripts/upload_data.sh new file mode 100755 index 0000000..08a44c9 --- /dev/null +++ b/nextflow/amr-on-wb/scripts/upload_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Source the variables +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Default to GCP environment +ENV="${1:-gcp}" + +# Validate environment +if [[ ! "$ENV" =~ ^(gcp|wb)$ ]]; then + echo "Error: Invalid environment '$ENV'. Must be 'gcp' or 'wb'" + echo "Usage: $0 [gcp|wb]" + exit 1 +fi + +# Source environment configuration +CONFIG_FILE="${SCRIPT_DIR}/config/${ENV}.env" +if [[ ! 
-f "$CONFIG_FILE" ]]; then + echo "Error: Configuration file not found: $CONFIG_FILE" + exit 1 +fi + +source "$CONFIG_FILE" + +# Set data directory path relative to script location +DATA_DIR="${SCRIPT_DIR}/../data" + +# Upload data to GCS bucket +gcloud storage cp -r "${DATA_DIR}" gs://${GCS_BUCKET}/ diff --git a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf index 8e44779..e0ffd43 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf @@ -17,14 +17,14 @@ workflow BAM_DEDUP_RESISTOME_WF { amrsnp = build_dependencies.out.amrsnp } else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") resistomeanalyzer = file("${baseDir}/bin/resistome") rarefactionanalyzer = file("${baseDir}/bin/rarefaction") } runresistome_dedup(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults_dedup(runresistome_dedup.out.resistome_counts.collect()) if (params.snp == "Y") { - runsnp_dedup(bam_ch, resistomeresults_dedup.out.snp_count_matrix) + runsnp_dedup(bam_ch, resistomeresults_dedup.out.snp_count_matrix, amrsnp) snpresults_dedup(runsnp_dedup.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf index c4af77e..eba0621 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf @@ -17,7 +17,7 @@ workflow BAM_RESISTOME_WF { amrsnp = build_dependencies.out.amrsnp } else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") resistomeanalyzer = file("${baseDir}/bin/resistome") rarefactionanalyzer = file("${baseDir}/bin/rarefaction") } @@ -28,7 +28,7 @@ workflow BAM_RESISTOME_WF { plotrarefaction(runrarefaction.out.rarefaction.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bam_ch, resistomeresults.out.snp_count_matrix) + runsnp(bam_ch, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf index 680e5cb..3c852c2 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf @@ -16,7 +16,7 @@ workflow BAM_RESISTOME_COUNTS_WF { amrsnp = build_dependencies.out.amrsnp } else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") resistomeanalyzer = file("${baseDir}/bin/resistome") } // Split sections below for standard and dedup_ed results @@ -24,7 +24,7 @@ workflow BAM_RESISTOME_COUNTS_WF { resistomeresults(runresistome.out.resistome_counts.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bam_ch, resistomeresults.out.snp_count_matrix) + runsnp(bam_ch, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect()) } } diff --git a/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf b/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf index 0fa3d31..33d7191 100755 --- a/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf +++ b/nextflow/amr-on-wb/subworkflows/fastq_QC_trimming.nf @@ -3,18 +3,19 @@ include { runqc ; QCstats } from '../modules/Trimming/trimmomatic' // WC trimming workflow FASTQ_TRIM_WF { - take: + take: read_pairs_ch main: //index( hostindex ) //bwa_align( index.out, 
read_pairs_ch ) - runqc(read_pairs_ch) + adapters_file = file(params.adapters) + runqc(read_pairs_ch, adapters_file) QCstats(runqc.out.trimmomatic_stats.collect()) - + emit: //bwa_align = bwa_align.out - trimmed_reads = runqc.out.paired_fastq - + trimmed_reads = runqc.out.paired_fastq + } diff --git a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf index 21ca3e5..b0023e8 100755 --- a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf @@ -24,7 +24,7 @@ workflow FASTQ_RESISTOME_WF { amrsnp = build_dependencies.out.amrsnp } else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP/*") + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") resistomeanalyzer = file("${baseDir}/bin/resistome") rarefactionanalyzer = file("${baseDir}/bin/rarefaction") } @@ -55,7 +55,7 @@ workflow FASTQ_RESISTOME_WF { plotrarefaction(runrarefaction.out.rarefaction.collect()) // Add SNP confirmation if (params.snp == "Y") { - runsnp(bwa_align.out.bwa_bam, resistomeresults.out.snp_count_matrix) + runsnp(bwa_align.out.bwa_bam, resistomeresults.out.snp_count_matrix, amrsnp) snpresults(runsnp.out.snp_counts.collect() ) } // Add analysis of deduped counts From e9686af9bb5f3bb315219bc5f3cc15e7338c71a2 Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Fri, 9 Jan 2026 11:56:01 -0800 Subject: [PATCH 2/6] Add conda env management when running in Workbench --- nextflow/amr-on-wb/scripts/run.sh | 45 +++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/nextflow/amr-on-wb/scripts/run.sh b/nextflow/amr-on-wb/scripts/run.sh index 5d27430..17dbd4b 100755 --- a/nextflow/amr-on-wb/scripts/run.sh +++ b/nextflow/amr-on-wb/scripts/run.sh @@ -83,13 +83,48 @@ echo "Profile: ${NEXTFLOW_PROFILE}" echo "Config: ${NEXTFLOW_CONFIG}" echo "" -# Activate conda environment if available -if command -v conda &> /dev/null; then - eval "$(conda shell.bash hook)" - if conda env list | grep -q "AMR++_env"; then +# Handle conda environment for Workbench +if [[ "$ENV" == "wb" ]]; then + if command -v conda &> /dev/null; then + # Initialize conda if not already initialized + if ! grep -q "conda initialize" ~/.bashrc 2>/dev/null; then + echo "Initializing conda..." + conda init bash + source ~/.bashrc + fi + + # Ensure conda is properly initialized for this shell + eval "$(conda shell.bash hook)" + + # Check if AMR++_env exists, create if it doesn't + if ! conda env list | grep -q "AMR++_env"; then + echo "Creating AMR++_env conda environment with Nextflow v24..." + echo "This may take a few minutes..." + conda env create -f envs/AMR++_env.yaml + echo "AMR++_env environment created successfully" + else + echo "AMR++_env conda environment found" + fi + + # Activate the environment + echo "Activating AMR++_env conda environment..." conda activate AMR++_env + + # Verify Nextflow version + echo "Using Nextflow version: $(nextflow -version 2>&1 | grep -o 'version [0-9.]*' | head -1)" else - echo "Warning: AMR++_env conda environment not found, continuing without it" + echo "Error: conda not found. Please install conda first." 
+ exit 1 + fi +else + # For GCP environment, just try to activate if available + if command -v conda &> /dev/null; then + eval "$(conda shell.bash hook)" + if conda env list | grep -q "AMR++_env"; then + conda activate AMR++_env + else + echo "Warning: AMR++_env conda environment not found, continuing without it" + fi fi fi From 50f5e312bd1ffcea9f84c4d083b07f064ed85479 Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Fri, 30 Jan 2026 10:57:11 -0800 Subject: [PATCH 3/6] Skip build_dependencies completely, use pre-built binaries from container --- nextflow/amr-on-wb/README.md | 14 +++----------- .../amr-on-wb/modules/Resistome/resistome.nf | 2 -- .../subworkflows/bam_deduped_resistome.nf | 18 +++++------------- .../amr-on-wb/subworkflows/bam_resistome.nf | 18 +++++------------- .../subworkflows/bam_resistome_counts.nf | 15 ++++----------- .../amr-on-wb/subworkflows/fastq_resistome.nf | 18 +++++------------- 6 files changed, 22 insertions(+), 63 deletions(-) diff --git a/nextflow/amr-on-wb/README.md b/nextflow/amr-on-wb/README.md index d69852c..87e17b8 100755 --- a/nextflow/amr-on-wb/README.md +++ b/nextflow/amr-on-wb/README.md @@ -13,7 +13,7 @@ This repository provides simplified scripts for running AMR++ in three different 1. **Local** - Run on your machine using Docker 2. **GCP** - Run on Google Batch with local Nextflow orchestration -3. **Workbench** - Run on Google Batch with Verily Workbench +3. **Workbench** - Run on Google Batch with Verily Workbench orchestration ### Prerequisites @@ -107,6 +107,8 @@ For GCP or Workbench environments, set up the required infrastructure first: ./scripts/setup_infra.sh wb ``` +**Note:** Due to current permissions issues within Workbench, `setup.infra.sh` must be run locally. + ### Data Upload (Cloud Environments) Upload your data to GCS before running cloud pipelines: @@ -246,16 +248,6 @@ For production use with Google Batch: Results will be stored in your Workbench GCS bucket. -## Configuration Files - -Environment-specific variables are stored in `scripts/config/*.env` files (created from `.env.template` files). These files contain: -- GCS bucket names and locations -- Docker image names and tags -- Service account configurations -- Nextflow profiles and configs - -**Security Note**: The `.env` files are gitignored to prevent committing sensitive information. Only template files are tracked in version control. 
- ## Additional Resources **Original AMR++ Repository**: [https://github.com/Microbial-Ecology-Group/AMRplusplus](https://github.com/Microbial-Ecology-Group/AMRplusplus) diff --git a/nextflow/amr-on-wb/modules/Resistome/resistome.nf b/nextflow/amr-on-wb/modules/Resistome/resistome.nf index 87f1f6b..7e98305 100755 --- a/nextflow/amr-on-wb/modules/Resistome/resistome.nf +++ b/nextflow/amr-on-wb/modules/Resistome/resistome.nf @@ -91,7 +91,6 @@ process runresistome { -group_fp ${sample_id}.${prefix}.group.tsv \ -mech_fp ${sample_id}.${prefix}.mechanism.tsv \ -class_fp ${sample_id}.${prefix}.class.tsv \ - -type_fp ${sample_id}.${prefix}.type.tsv \ -t ${threshold} rm ${sample_id}.sam @@ -152,7 +151,6 @@ process runrarefaction { -group_fp ${sample_id}.group.tsv \ -mech_fp ${sample_id}.mech.tsv \ -class_fp ${sample_id}.class.tsv \ - -type_fp ${sample_id}.type.tsv \ -min ${min} \ -max ${max} \ -skip ${skip} \ diff --git a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf index e0ffd43..74f277c 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_deduped_resistome.nf @@ -1,5 +1,5 @@ // Deduped functions with prefix for name -include {runresistome as runresistome_dedup ; runsnp as runsnp_dedup; resistomeresults as resistomeresults_dedup ; snpresults as snpresults_dedup ; build_dependencies} from '../modules/Resistome/resistome' addParams(prefix: 'dedup_AMR') +include {runresistome as runresistome_dedup ; runsnp as runsnp_dedup; resistomeresults as resistomeresults_dedup ; snpresults as snpresults_dedup} from '../modules/Resistome/resistome' addParams(prefix: 'dedup_AMR') workflow BAM_DEDUP_RESISTOME_WF { @@ -9,18 +9,10 @@ workflow BAM_DEDUP_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") runresistome_dedup(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults_dedup(runresistome_dedup.out.resistome_counts.collect()) if (params.snp == "Y") { diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf index eba0621..3dbfdc5 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome.nf @@ -1,5 +1,5 @@ // resistome -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; snpresults} from '../modules/Resistome/resistome' workflow BAM_RESISTOME_WF { @@ -9,18 +9,10 @@ workflow BAM_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = 
build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") // Split sections below for standard and dedup_ed results runresistome(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults(runresistome.out.resistome_counts.collect()) diff --git a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf index 3c852c2..b0a4683 100644 --- a/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf +++ b/nextflow/amr-on-wb/subworkflows/bam_resistome_counts.nf @@ -1,5 +1,5 @@ // resistome counts and snp verification -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; snpresults} from '../modules/Resistome/resistome' workflow BAM_RESISTOME_COUNTS_WF { @@ -9,16 +9,9 @@ workflow BAM_RESISTOME_COUNTS_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") - resistomeanalyzer = file("${baseDir}/bin/resistome") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = file("${baseDir}/bin/resistome") // Split sections below for standard and dedup_ed results runresistome(bam_ch,amr, annotation, resistomeanalyzer ) resistomeresults(runresistome.out.resistome_counts.collect()) diff --git a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf index b0023e8..287f66f 100755 --- a/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf +++ b/nextflow/amr-on-wb/subworkflows/fastq_resistome.nf @@ -2,7 +2,7 @@ include { index ; bwa_align } from '../modules/Alignment/bwa' // resistome -include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; build_dependencies ; snpresults} from '../modules/Resistome/resistome' +include {plotrarefaction ; runresistome ; runsnp ; resistomeresults ; runrarefaction ; snpresults} from '../modules/Resistome/resistome' // Deduped resistome include { BAM_DEDUP_RESISTOME_WF } from '../subworkflows/bam_deduped_resistome.nf' @@ -16,18 +16,10 @@ workflow FASTQ_RESISTOME_WF { annotation main: - // download resistome and rarefactionanalyzer - if (file("${baseDir}/bin/AmrPlusPlus_SNP/SNP_Verification.py").isEmpty()){ - build_dependencies() - resistomeanalyzer = build_dependencies.out.resistomeanalyzer - rarefactionanalyzer = build_dependencies.out.rarefactionanalyzer - amrsnp = build_dependencies.out.amrsnp - } - else { - amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") - resistomeanalyzer = file("${baseDir}/bin/resistome") - rarefactionanalyzer = file("${baseDir}/bin/rarefaction") - } + // Use pre-built binaries from container + amrsnp = file("${baseDir}/bin/AmrPlusPlus_SNP") + resistomeanalyzer = 
file("${baseDir}/bin/resistome") + rarefactionanalyzer = file("${baseDir}/bin/rarefaction") // Define amr_index_files variable if (params.amr_index == null) { index(amr) From 7d5908f67ec44631eee90d1962af9ce2d867f558 Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Thu, 5 Feb 2026 14:58:33 -0800 Subject: [PATCH 4/6] Add missing resistome flag back --- nextflow/amr-on-wb/modules/Resistome/resistome.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow/amr-on-wb/modules/Resistome/resistome.nf b/nextflow/amr-on-wb/modules/Resistome/resistome.nf index 7e98305..87f1f6b 100755 --- a/nextflow/amr-on-wb/modules/Resistome/resistome.nf +++ b/nextflow/amr-on-wb/modules/Resistome/resistome.nf @@ -91,6 +91,7 @@ process runresistome { -group_fp ${sample_id}.${prefix}.group.tsv \ -mech_fp ${sample_id}.${prefix}.mechanism.tsv \ -class_fp ${sample_id}.${prefix}.class.tsv \ + -type_fp ${sample_id}.${prefix}.type.tsv \ -t ${threshold} rm ${sample_id}.sam @@ -151,6 +152,7 @@ process runrarefaction { -group_fp ${sample_id}.group.tsv \ -mech_fp ${sample_id}.mech.tsv \ -class_fp ${sample_id}.class.tsv \ + -type_fp ${sample_id}.type.tsv \ -min ${min} \ -max ${max} \ -skip ${skip} \ From 4449c4aad67387028bf819860aa0108ba3868c40 Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Thu, 5 Feb 2026 14:59:13 -0800 Subject: [PATCH 5/6] Update docs with working instructions for running verily orchestration, google batch compute --- nextflow/amr-on-wb/README.md | 269 +++++++++++++---------------------- 1 file changed, 99 insertions(+), 170 deletions(-) diff --git a/nextflow/amr-on-wb/README.md b/nextflow/amr-on-wb/README.md index 87e17b8..0336ec8 100755 --- a/nextflow/amr-on-wb/README.md +++ b/nextflow/amr-on-wb/README.md @@ -1,206 +1,113 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Nextflow](https://img.shields.io/badge/Nextflow-%E2%89%A50.25.1-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/Nextflow-v24-brightgreen.svg)](https://www.nextflow.io/) # AMR++ Bioinformatic Pipeline AMR++ is a bioinformatic pipeline for analyzing raw sequencing reads to characterize the profile of antimicrobial resistance genes, or resistome. Developed to work with the [MEGARes database](https://megares.meglab.org/), it contains sequence data for approximately 9,000 hand-curated antimicrobial resistance genes with an annotation structure optimized for high-throughput sequencing and metagenomic analysis. -This repository is adapted from the [original AMR++ pipeline](https://github.com/Microbial-Ecology-Group/AMRplusplus) with simplified scripts for running in local, GCP, and Verily Workbench environments. +**This repository demonstrates running AMR++ on Verily Workbench with orchestration through Workbench and compute on Google Batch.** It is adapted from the [original AMR++ pipeline](https://github.com/Microbial-Ecology-Group/AMRplusplus) with simplified scripts for cloud execution. -## Quick Start +Additional environments are provided for testing and debugging: +- **Local** - For quick testing and development +- **GCP** - For debugging Google Batch jobs with visible logs (Workbench currently has permissions issues showing Batch logs) -This repository provides simplified scripts for running AMR++ in three different environments: +## Dependencies -1. **Local** - Run on your machine using Docker -2. **GCP** - Run on Google Batch with local Nextflow orchestration -3. 
**Workbench** - Run on Google Batch with Verily Workbench orchestration +### Required for Workbench Deployment +- **Verily Workbench CLI** (`wb`) - Workbench command-line tool +- **Google Cloud SDK** (`gcloud`) - GCP command-line tool +- **Docker** - For building and pushing container images (must be running) +- **Nextflow v24** - Workflow orchestration (installed in Workbench app) + - **Note**: v25 has breaking changes and is not compatible with this pipeline -### Prerequisites +**Important Notes**: +- Ensure Docker is running on your local machine before executing `./scripts/build.sh --env wb --push` +- You must be an **ADMIN** of the Workbench workspace where this pipeline will run -- **For local execution**: Docker, Conda (with AMR++_env), Nextflow -- **For GCP execution**: Google Cloud SDK (`gcloud`), Docker -- **For Workbench execution**: Verily Workbench CLI (`wb`), Google Cloud SDK +### Optional Dependencies (for testing/debugging environments) +- **Local**: Docker, Conda +- **GCP**: `gcloud`, Docker -## Environment Setup -### 1. Create Configuration Files - -Copy the template files and customize them with your values: - -```bash -# For local environment -cp scripts/config/local.env.template scripts/config/local.env - -# For GCP environment -cp scripts/config/gcp.env.template scripts/config/gcp.env - -# For Workbench environment -cp scripts/config/wb.env.template scripts/config/wb.env -``` - -### 2. Edit Configuration Files +## AMR++ on Verily Workbench -Each `.env` file has two sections: -- **USER CONFIGURATION**: Values you must update (marked with ``) -- **AUTOMATIC CONFIGURATION**: Auto-generated values (do not modify) +**Prerequisites**: +- You must create a Workbench workspace where you have **ADMIN** permissions +- All setup and execution must be done within this workspace -#### local.env -Update the following: -- `IMAGE_NAME`: Replace `` with your Docker Hub username - - Example: `"johndoe/amrplusplus-workbench"` +### Quick Start: Workbench Orchestration with Google Batch -#### gcp.env -Update the following: -- `GCS_BUCKET`: Replace `` with your GCS bucket name (without `gs://` prefix) - - Example: `my-nextflow-data` -- `GOOGLE_ARTIFACT_REPO`: Replace `` with your artifact registry repository name - - Example: `nextflow-containers` -- `GCS_BUCKET_LOCATION`: Optionally change the region (default: `us-central1`) +This guide walks through setting up and running AMR++ with Workbench orchestration and Google Batch compute. The setup is split between local commands (for infrastructure) and Workbench app commands (for execution). -#### wb.env -Update the following: -- `GCS_BUCKET`: Replace `` with your Workbench GCS bucket resource ID - - Example: `nf-output` -- `GOOGLE_ARTIFACT_REPO`: Replace `` with your artifact registry repository name - - Example: `nextflow-containers` -- `GCS_BUCKET_LOCATION`: Optionally change the region (default: `us-central1`) +#### Step 1: Create Workspace and App -**Note:** Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations. +Create a new workspace and app in the Workbench UI (or use the CLI if preferred). 
-## Usage +#### Step 2: Local Setup -### Building Docker Images +Run these commands on your **local machine**: ```bash -# Build for local use -./scripts/build.sh +# Set your active workspace (replace with your workspace ID) +wb workspace set --id=your-workspace-id -# Build for GCP and push to registry -./scripts/build.sh --env gcp --push - -# Build for Workbench and push to registry -./scripts/build.sh --env wb --push +# Copy the Workbench environment template +cp scripts/config/wb.env.template scripts/config/wb.env ``` -### Running the Pipeline - -```bash -# Run locally with Docker -./scripts/run.sh - -# Run on Google Batch with local orchestration -./scripts/run.sh --env gcp - -# Run on Google Batch with Workbench orchestration -./scripts/run.sh --env wb -``` +Edit `scripts/config/wb.env` and set the user-defined variables: +- `GCS_BUCKET`: Your Workbench GCS bucket resource ID (e.g., `nf-output`) +- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repo (e.g., `nextflow-containers`) +- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) -### Infrastructure Setup (Cloud Environments) +**Notes**: +- Project IDs, service accounts, and registry paths are automatically determined from your `gcloud` and `wb` CLI configurations +- **Future Improvement**: Consider using separate buckets for input data and Nextflow output to better organize resources -For GCP or Workbench environments, set up the required infrastructure first: +Then run: ```bash -# Set up infrastructure for GCP environment -./scripts/setup_infra.sh gcp - -# Set up infrastructure for Workbench environment +# Set up infrastructure (creates buckets, service accounts, etc.) ./scripts/setup_infra.sh wb -``` -**Note:** Due to current permissions issues within Workbench, `setup.infra.sh` must be run locally. 
- -### Data Upload (Cloud Environments) - -Upload your data to GCS before running cloud pipelines: - -```bash -# Upload data for GCP environment -./scripts/upload_data.sh gcp - -# Upload data for Workbench environment +# Upload input data to GCS ./scripts/upload_data.sh wb -``` - -## Environment Details - -### Local Environment -- **Description**: Run Nextflow locally using Docker containers -- **Requirements**: Docker daemon, Conda environment (AMR++_env) -- **Profile**: `docker` -- **Use case**: Testing, development, small datasets - -### GCP Environment -- **Description**: Run on Google Batch with local Nextflow orchestration -- **Requirements**: `gcloud` CLI configured, GCS bucket, Artifact Registry -- **Profile**: `google-batch` -- **Use case**: Large-scale processing with cloud resources, local monitoring -### Workbench Environment -- **Description**: Run on Google Batch with Verily Workbench orchestration -- **Requirements**: Workbench workspace, `wb` CLI, Workbench-managed resources -- **Profile**: `workbench` -- **Use case**: Collaborative analysis in Verily Workbench environment - -## Pipeline Options - -### Available Pipeline Workflows - -AMR++ can be customized using the `--pipeline` parameter: - -- **`demo`** (default): Simple demonstration on test data -- **`standard_AMR`**: QC trimming → Host DNA removal → Resistome alignment → Results -- **`fast_AMR`**: Same as standard but skips host removal for faster analysis -- **`standard_AMR_wKraken`**: Standard AMR + microbiome analysis with Kraken - -### Pipeline Subworkflows - -Run specific components independently: - -- **`eval_qc`**: Evaluate sample QC -- **`trim_qc`**: QC trimming using Trimmomatic -- **`rm_host`**: Remove host DNA contamination -- **`resistome`**: Align to MEGARes, perform rarefaction and resistome analysis -- **`kraken`**: Taxonomic classification -- **`bam_resistome`**: Run resistome analysis starting from BAM files +# Build Docker image and push to Artifact Registry +# NOTE: Docker must be running before executing this command +./scripts/build.sh --env wb --push +``` -### Optional Analysis Flags +#### Step 3: Workbench App Setup -#### SNP Verification -Include SNP-confirmed resistance gene counts: +Open your Workbench app, launch the Terminal, and run: ```bash -nextflow run main_AMR++.nf --pipeline standard_AMR --snp Y +# Clone the repository (adjust branch as needed) +cd repos/ && git clone -b samh/amr-dev https://github.com/verily-src/workbench-examples.git && cd workbench-examples/nextflow/amr-on-wb/ + +# Copy the environment template +cp scripts/config/wb.env.template scripts/config/wb.env ``` -Output: `SNPconfirmed_AMR_analytic_matrix.csv` (in addition to standard count matrix) +Now copy your local `wb.env` configuration into the Workbench app. -#### Deduplicated Counts -Include deduplicated count analysis: +#### Step 4: Run the Pipeline ```bash -nextflow run main_AMR++.nf --pipeline standard_AMR --snp Y --deduped Y +./scripts/run.sh --env wb ``` -**Note**: This significantly increases runtime and storage requirements. - -### Example Commands - -Run standard AMR++ workflow with all options: - -```bash -# Local execution -nextflow run main_AMR++.nf -profile docker --pipeline standard_AMR --reads "data/raw/*_R{1,2}.fastq.gz" --snp Y --deduped Y +Results will be stored in your configured GCS bucket. 
-# GCP execution (after running ./scripts/setup_infra.sh gcp and ./scripts/upload_data.sh gcp) -./scripts/run.sh --env gcp +**Known Issues**: +- The `gcloud storage cp` command may not correctly resolve Workbench resource names to full `gs://` paths when running `upload_data.sh` or `run.sh`. If you encounter path resolution issues, manually specify the full GCS bucket path in your `wb.env` configuration. -# Workbench execution (after running ./scripts/setup_infra.sh wb and ./scripts/upload_data.sh wb) -./scripts/run.sh --env wb -``` +--- -## AMR++ on Verily Workbench +### Alternative: Quick Demo in Workbench JupyterLab (Workbench Execution) -### Quick Demo in Workbench JupyterLab +For a simple demonstration without Google Batch (both orchestration and execution running in the same Workbench app): Create a new Workbench workspace and add this git repository in the **Apps** tab. @@ -227,26 +134,48 @@ nextflow run main_AMR++.nf Expected output: **Succeeded: 24** with results in `~/repos/AMRplusplus/test_results` -### Production Workbench Deployment +--- -For production use with Google Batch: +## Supporting Environments -1. **Setup infrastructure**: - ```bash - ./scripts/setup_infra.sh wb - ``` +The following environments are provided for testing and debugging purposes. -2. **Upload your data**: - ```bash - ./scripts/upload_data.sh wb - ``` +### Local Environment (Testing) -3. **Run the pipeline**: - ```bash - ./scripts/run.sh --env wb - ``` +**Purpose**: Quick testing and development on small datasets -Results will be stored in your Workbench GCS bucket. +**Setup**: +```bash +cp scripts/config/local.env.template scripts/config/local.env +# Edit local.env and set IMAGE_NAME +./scripts/build.sh +./scripts/run.sh +``` + +**Requirements**: Docker, Conda + +### GCP Environment (Debugging) + +**Purpose**: Debug Google Batch jobs with visible logs (workaround for Workbench permissions issues) + +**Setup**: +```bash +cp scripts/config/gcp.env.template scripts/config/gcp.env +# Edit gcp.env and set GCS_BUCKET, GOOGLE_ARTIFACT_REPO +./scripts/setup_infra.sh gcp +./scripts/upload_data.sh gcp +./scripts/build.sh --env gcp --push +./scripts/run.sh --env gcp +``` + +**Requirements**: `gcloud` CLI, Docker + +**Configuration** (`gcp.env`): +- `GCS_BUCKET`: Your GCS bucket name (without `gs://` prefix) + - Example: `my-nextflow-data` +- `GOOGLE_ARTIFACT_REPO`: Your artifact registry repository name + - Example: `nextflow-containers` +- `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) ## Additional Resources From 1f4157b153b170a7a7c332ab9c61a036f61d0045 Mon Sep 17 00:00:00 2001 From: Samuel Hornstein Date: Mon, 2 Mar 2026 20:32:08 -0800 Subject: [PATCH 6/6] fix(google-batch): add explicit machine type configuration for resource scaling --- nextflow/amr-on-wb/README.md | 13 ++ .../config/google_batch_resources.config | 38 ++++ .../docs/google-batch-resource-scaling.md | 176 ++++++++++++++++++ nextflow/amr-on-wb/modules/Alignment/bwa.nf | 26 +-- .../amr-on-wb/modules/Microbiome/kraken2.nf | 2 +- .../amr-on-wb/modules/Microbiome/qiime2.nf | 2 +- .../amr-on-wb/modules/Resistome/resistome.nf | 2 +- .../amr-on-wb/modules/Trimming/trimmomatic.nf | 2 +- nextflow/amr-on-wb/nextflow.config | 6 +- 9 files changed, 248 insertions(+), 19 deletions(-) create mode 100644 nextflow/amr-on-wb/config/google_batch_resources.config create mode 100644 nextflow/amr-on-wb/docs/google-batch-resource-scaling.md diff --git a/nextflow/amr-on-wb/README.md b/nextflow/amr-on-wb/README.md index 0336ec8..da1e4fd 100755 --- 
a/nextflow/amr-on-wb/README.md +++ b/nextflow/amr-on-wb/README.md @@ -177,6 +177,19 @@ cp scripts/config/gcp.env.template scripts/config/gcp.env - Example: `nextflow-containers` - `GCS_BUCKET_LOCATION`: Region (default: `us-central1`) +## Configuration + +### Resource Scaling on Google Batch + +Google Batch does NOT automatically scale machine types based on CPU/memory requests. This repository includes automatic resource scaling configuration in `config/google_batch_resources.config`. + +**Key points**: +- Each process that needs more than default resources must explicitly specify a matching `machineType` +- Processes now use `task.cpus` for dynamic thread allocation +- Configure resources using process labels or names + +See [docs/google-batch-resource-scaling.md](docs/google-batch-resource-scaling.md) for detailed guidance. + ## Additional Resources **Original AMR++ Repository**: [https://github.com/Microbial-Ecology-Group/AMRplusplus](https://github.com/Microbial-Ecology-Group/AMRplusplus) diff --git a/nextflow/amr-on-wb/config/google_batch_resources.config b/nextflow/amr-on-wb/config/google_batch_resources.config new file mode 100644 index 0000000..833bdb9 --- /dev/null +++ b/nextflow/amr-on-wb/config/google_batch_resources.config @@ -0,0 +1,38 @@ +/* + * Google Batch Resource Configuration + * + * This config file defines process-specific resource requirements + * and corresponding machine types for Google Batch execution. + * + * Google Batch does NOT automatically scale machine types based on + * CPU/memory requests. You must explicitly set machineType for each + * process that needs more than the default resources. + */ + +process { + // Default for all processes + cpus = 4 + memory = '16 GB' + machineType = 'n2-standard-4' + + // Alignment processes - typically need more resources + withLabel: alignment { + cpus = 8 + memory = '32 GB' + machineType = 'n2-standard-8' + } + + // High-memory processes (if needed) + withName: 'bwa_align' { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' + } + + // Example: Even higher resource process + // withName: 'some_process' { + // cpus = 32 + // memory = '128 GB' + // machineType = 'n2-highmem-32' + // } +} diff --git a/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md b/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md new file mode 100644 index 0000000..9823c95 --- /dev/null +++ b/nextflow/amr-on-wb/docs/google-batch-resource-scaling.md @@ -0,0 +1,176 @@ +# Google Batch Resource Scaling Guide + +## Problem + +When running Nextflow on Google Batch, processes that request more CPUs or memory than the default `machineType` will fail because: + +1. **Google Batch does NOT automatically scale machine types** based on CPU/memory requests +2. Setting `process.cpus` or `process.memory` alone is not enough +3. Each process that needs more resources must explicitly specify a matching `machineType` + +## Example Error + +If your default is `n2-standard-8` (8 CPUs) and a process requests 32 CPUs: + +```groovy +withName: 'bwa_align' { + cpus = 32 + memory = "128 GB" +} +``` + +The process will crash trying to run 32 threads on an 8-CPU machine. + +## Solution + +This repository now includes **automatic resource scaling** via `config/google_batch_resources.config`. 
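+
+The Google Batch profiles in `nextflow.config` pull this file in with `includeConfig`, so
+selecting the `google-batch` or `workbench` profile is enough to activate it. A minimal
+sketch of the relevant lines (see `nextflow.config` for the authoritative version):
+
+```groovy
+profiles {
+    'google-batch' {
+        includeConfig "config/google_batch_resources.config"
+        process.executor = 'google-batch'
+    }
+    workbench {
+        includeConfig "config/google_batch_resources.config"
+        process.executor = 'google-batch'
+    }
+}
+```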
+ +### How It Works + +The config file maps processes to appropriate machine types: + +```groovy +process { + // Default for all processes + cpus = 4 + memory = '16 GB' + machineType = 'n2-standard-4' + + // Label-based configuration + withLabel: alignment { + cpus = 8 + memory = '32 GB' + machineType = 'n2-standard-8' + } + + // Process-specific configuration + withName: 'bwa_align' { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' + } +} +``` + +### Machine Type Selection Guide + +Choose machine types based on your process requirements: + +| vCPUs | Standard (4GB/vCPU) | High-Mem (8GB/vCPU) | High-CPU (0.9GB/vCPU) | +|-------|---------------------|---------------------|------------------------| +| 2 | n2-standard-2 | n2-highmem-2 | n2-highcpu-2 | +| 4 | n2-standard-4 | n2-highmem-4 | n2-highcpu-4 | +| 8 | n2-standard-8 | n2-highmem-8 | n2-highcpu-8 | +| 16 | n2-standard-16 | n2-highmem-16 | n2-highcpu-16 | +| 32 | n2-standard-32 | n2-highmem-32 | n2-highcpu-32 | +| 64 | n2-standard-64 | n2-highmem-64 | n2-highcpu-64 | +| 96 | n2-standard-96 | n2-highmem-96 | n2-highcpu-96 | + +## Usage + +### Method 1: Label-Based (Recommended) + +Add labels to your process definitions: + +```groovy +process bwa_align { + label "alignment" // Will use settings from 'withLabel: alignment' + + input: + ... +} +``` + +Then configure in `config/google_batch_resources.config`: + +```groovy +withLabel: alignment { + cpus = 16 + memory = '64 GB' + machineType = 'n2-standard-16' +} +``` + +### Method 2: Process-Specific + +For individual process tuning: + +```groovy +withName: 'specific_process' { + cpus = 32 + memory = '128 GB' + machineType = 'n2-highmem-32' +} +``` + +### Method 3: Dynamic Resources + +Use `task.cpus` in your process scripts to automatically use allocated resources: + +```groovy +process example { + script: + """ + bwa mem -t ${task.cpus} ... + samtools sort -@ ${task.cpus} ... + """ +} +``` + +This is better than hardcoding `${params.threads}` because it adapts to the configured resources. + +## Testing Resource Scaling + +To verify a process uses the correct machine type: + +1. Add a test process to your workflow: + +```groovy +process test_resources { + label "alignment" + + script: + """ + echo "Allocated CPUs: ${task.cpus}" + echo "Allocated Memory: ${task.memory}" + echo "Machine Type: ${task.machineType}" + nproc # Should match allocated CPUs + free -h # Should show allocated memory + """ +} +``` + +2. Check the Google Batch console to confirm the job used the expected machine type + +## Common Pitfalls + +### ❌ Don't do this: + +```groovy +// Setting CPUs without machineType +process.cpus = 32 // Will try to run on default 4-CPU machine! 
+``` + +### ✅ Do this instead: + +```groovy +withName: 'my_process' { + cpus = 32 + memory = '128 GB' + machineType = 'n2-standard-32' +} +``` + +## Cost Optimization + +- Start with smaller machine types and scale up only when needed +- Use labels to group similar processes +- Monitor your Cloud Billing to track costs per machine type +- Consider `n2-highcpu` for CPU-intensive, low-memory tasks +- Consider spot instances via `google.batch.maxSpotAttempts` (already enabled in this config) + +## Further Reading + +- [Nextflow Google Batch Executor](https://www.nextflow.io/docs/latest/google.html#google-batch) +- [Google Compute Engine Machine Types](https://cloud.google.com/compute/docs/machine-types) +- [Nextflow Process Directives](https://www.nextflow.io/docs/latest/process.html#directives) diff --git a/nextflow/amr-on-wb/modules/Alignment/bwa.nf b/nextflow/amr-on-wb/modules/Alignment/bwa.nf index 04827be..c3e8ab4 100755 --- a/nextflow/amr-on-wb/modules/Alignment/bwa.nf +++ b/nextflow/amr-on-wb/modules/Alignment/bwa.nf @@ -63,25 +63,25 @@ process bwa_align { script: if( deduped == "N") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam - ${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam + ${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam rm ${pair_id}_alignment.sam - ${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam + ${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam rm ${pair_id}_alignment.bam """ else if( deduped == "Y") """ - ${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam - ${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam + ${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam + ${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam rm ${pair_id}_alignment.sam - ${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam + ${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam rm ${pair_id}_alignment.bam - ${SAMTOOLS} fixmate -@ ${threads} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam - ${SAMTOOLS} sort -@ ${threads} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam + ${SAMTOOLS} fixmate -@ ${task.cpus} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam + ${SAMTOOLS} sort -@ ${task.cpus} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam rm ${pair_id}_alignment_sorted_fix.bam ${SAMTOOLS} rmdup -S ${pair_id}_alignment_sorted_fix.sorted.bam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_sorted_fix.sorted.bam - ${SAMTOOLS} view -@ ${threads} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam + ${SAMTOOLS} view -@ ${task.cpus} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam rm ${pair_id}_alignment_dedup.sam """ else @@ -110,14 +110,14 @@ process bwa_rm_contaminant_fq { path("${pair_id}.samtools.idxstats"), emit: host_rm_stats """ - ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} > 
${pair_id}.host.sam - ${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${threads} -o ${pair_id}.host.sorted.bam + ${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${task.cpus} > ${pair_id}.host.sam + ${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${task.cpus} -o ${pair_id}.host.sorted.bam rm ${pair_id}.host.sam ${SAMTOOLS} index ${pair_id}.host.sorted.bam && ${SAMTOOLS} idxstats ${pair_id}.host.sorted.bam > ${pair_id}.samtools.idxstats ${SAMTOOLS} view -h -f 12 -b ${pair_id}.host.sorted.bam -o ${pair_id}.host.sorted.removed.bam - ${SAMTOOLS} sort -n -@ ${threads} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam + ${SAMTOOLS} sort -n -@ ${task.cpus} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam ${SAMTOOLS} \ - fastq -@ ${threads} -c 6 \ + fastq -@ ${task.cpus} -c 6 \ ${pair_id}.host.resorted.removed.bam \ -1 ${pair_id}.non.host.R1.fastq.gz \ -2 ${pair_id}.non.host.R2.fastq.gz \ diff --git a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf index 9da3a05..3d581c2 100755 --- a/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf +++ b/nextflow/amr-on-wb/modules/Microbiome/kraken2.nf @@ -49,7 +49,7 @@ process runkraken { """ - ${KRAKEN2} --db ${krakendb} --confidence ${kraken_confidence} --paired ${reads[0]} ${reads[1]} --threads ${threads} --report ${sample_id}.conf_${kraken_confidence}.kraken.report > ${sample_id}.conf_${kraken_confidence}.kraken.raw + ${KRAKEN2} --db ${krakendb} --confidence ${kraken_confidence} --paired ${reads[0]} ${reads[1]} --threads ${task.cpus} --report ${sample_id}.conf_${kraken_confidence}.kraken.report > ${sample_id}.conf_${kraken_confidence}.kraken.raw cut -f 2,3 ${sample_id}.conf_${kraken_confidence}.kraken.raw > ${sample_id}.conf_${kraken_confidence}.kraken.krona """ diff --git a/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf b/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf index 17035f6..36a8b01 100755 --- a/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf +++ b/nextflow/amr-on-wb/modules/Microbiome/qiime2.nf @@ -47,7 +47,7 @@ process Qiime2Dada2 { path("rep-seqs.qza"), emit: rep_seqs """ - ${QIIME} dada2 denoise-paired --i-demultiplexed-seqs ${demux} --o-table dada-table.qza --o-representative-sequences rep-seqs.qza --p-trim-left-f ${p_trim_left_f} --p-trim-left-r ${p_trim_left_r} --p-trunc-len-f ${p_trunc_len_f} --p-trunc-len-r ${p_trunc_len_r} --p-n-threads ${threads} --verbose --o-denoising-stats denoise_stats + ${QIIME} dada2 denoise-paired --i-demultiplexed-seqs ${demux} --o-table dada-table.qza --o-representative-sequences rep-seqs.qza --p-trim-left-f ${p_trim_left_f} --p-trim-left-r ${p_trim_left_r} --p-trunc-len-f ${p_trunc_len_f} --p-trunc-len-r ${p_trunc_len_r} --p-n-threads ${task.cpus} --verbose --o-denoising-stats denoise_stats """ } diff --git a/nextflow/amr-on-wb/modules/Resistome/resistome.nf b/nextflow/amr-on-wb/modules/Resistome/resistome.nf index 87f1f6b..7b2b320 100755 --- a/nextflow/amr-on-wb/modules/Resistome/resistome.nf +++ b/nextflow/amr-on-wb/modules/Resistome/resistome.nf @@ -232,7 +232,7 @@ process runsnp { mv ${bam} ${sample_id}.bam fi - python3 SNP_Verification.py -c config.ini -t ${threads} -a true -i ${sample_id}.bam -o ${sample_id}.${prefix}_SNPs --count_matrix ${snp_count_matrix} --detailed_output = True + python3 SNP_Verification.py -c config.ini -t ${task.cpus} -a true -i ${sample_id}.bam -o ${sample_id}.${prefix}_SNPs --count_matrix ${snp_count_matrix} --detailed_output = True cut -d ',' 
-f `awk -v RS=',' "/${sample_id}/{print NR; exit}" ${sample_id}.${prefix}_SNPs${snp_count_matrix}` ${sample_id}.${prefix}_SNPs${snp_count_matrix} > ${sample_id}.${prefix}_SNP_count_col diff --git a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf index 1c07487..d642d56 100755 --- a/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf +++ b/nextflow/amr-on-wb/modules/Trimming/trimmomatic.nf @@ -42,7 +42,7 @@ process runqc { """ ${TRIMMOMATIC} \ PE \ - -threads ${threads} \ + -threads ${task.cpus} \ ${reads[0]} ${reads[1]} ${sample_id}.1P.fastq.gz ${sample_id}.1U.fastq.gz ${sample_id}.2P.fastq.gz ${sample_id}.2U.fastq.gz \ ILLUMINACLIP:${adapters_file}:2:30:10:3:TRUE \ LEADING:${leading} \ diff --git a/nextflow/amr-on-wb/nextflow.config b/nextflow/amr-on-wb/nextflow.config index ef38db2..b6b36f4 100755 --- a/nextflow/amr-on-wb/nextflow.config +++ b/nextflow/amr-on-wb/nextflow.config @@ -75,11 +75,12 @@ profiles { singularity.cacheDir = "$baseDir/envs/" } 'google-batch' { + includeConfig "config/google_batch_resources.config" + process.executor = 'google-batch' process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } process.maxRetries = 5 - process.machineType = 'n2-standard-4' // 16 Gbps network for faster image pulls workDir = "gs://${GCS_BUCKET}/scratch" @@ -98,11 +99,12 @@ profiles { executor.queueStatInterval = '2m' } workbench { + includeConfig "config/google_batch_resources.config" + process.executor = 'google-batch' process.container = "us-central1-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${GOOGLE_ARTIFACT_REPO}/amrplusplus-workbench:latest" process.errorStrategy = { task.exitStatus==50001 ? 'retry' : 'terminate' } process.maxRetries = 5 - process.machineType = 'n2-standard-4' // 16 Gbps network for faster image pulls workDir = "gs://${GCS_BUCKET}/scratch"