11 changes: 11 additions & 0 deletions nextflow/amr-on-wb/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
.nextflow/
work/
.nextflow.log*

bin/AMRplusplus_SNP/
test_results/

.ipynb_checkpoints

scripts/config/*.env
!scripts/config/*.env.template
237 changes: 144 additions & 93 deletions nextflow/amr-on-wb/README.md

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions nextflow/amr-on-wb/config/google_batch_resources.config
@@ -0,0 +1,38 @@
/*
* Google Batch Resource Configuration
*
* This config file defines process-specific resource requirements
* and corresponding machine types for Google Batch execution.
*
* Google Batch does NOT automatically scale machine types based on
* CPU/memory requests. You must explicitly set machineType for each
* process that needs more than the default resources.
*/

process {
// Default for all processes
cpus = 4
memory = '16 GB'
machineType = 'n2-standard-4'

// Alignment processes - typically need more resources
withLabel: alignment {
cpus = 8
memory = '32 GB'
machineType = 'n2-standard-8'
}

// High-memory processes (if needed)
withName: 'bwa_align' {
cpus = 16
memory = '64 GB'
machineType = 'n2-standard-16'
}

// Example: Even higher resource process
// withName: 'some_process' {
// cpus = 32
// memory = '128 GB'
// machineType = 'n2-highmem-32'
// }
}
176 changes: 176 additions & 0 deletions nextflow/amr-on-wb/docs/google-batch-resource-scaling.md
@@ -0,0 +1,176 @@
# Google Batch Resource Scaling Guide

## Problem

When running Nextflow on Google Batch, processes that request more CPUs or memory than the default `machineType` will fail because:

1. **Google Batch does NOT automatically scale machine types** based on CPU/memory requests
2. Setting `process.cpus` or `process.memory` alone is not enough
3. Each process that needs more resources must explicitly specify a matching `machineType`

## Example Error

If your default is `n2-standard-8` (8 CPUs) and a process requests 32 CPUs:

```groovy
withName: 'bwa_align' {
cpus = 32
memory = "128 GB"
}
```

The task is still scheduled on the default 8-vCPU machine, so it crashes (or is killed for oversubscription) trying to run 32 threads there instead of being placed on a 32-CPU machine.

## Solution

This repository now addresses this with an explicit per-process resource mapping in `config/google_batch_resources.config`, which pairs every CPU/memory request with a matching `machineType`.

### How It Works

The config file maps processes to appropriate machine types:

```groovy
process {
// Default for all processes
cpus = 4
memory = '16 GB'
machineType = 'n2-standard-4'

// Label-based configuration
withLabel: alignment {
cpus = 8
memory = '32 GB'
machineType = 'n2-standard-8'
}

// Process-specific configuration
withName: 'bwa_align' {
cpus = 16
memory = '64 GB'
machineType = 'n2-standard-16'
}
}
```
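For the mapping to take effect, the file has to be pulled into the pipeline's main configuration. A minimal sketch of that wiring (the exact layout in this repo may differ; project and location values are placeholders):

```groovy
// nextflow.config -- hypothetical wiring; adjust the path and values to your setup
includeConfig 'config/google_batch_resources.config'

process.executor = 'google-batch'

google {
    project  = 'my-gcp-project'   // placeholder: your GCP project ID
    location = 'us-central1'      // placeholder: your Batch region
}
```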

### Machine Type Selection Guide

Choose machine types based on your process requirements:

| vCPUs | Standard (4GB/vCPU) | High-Mem (8GB/vCPU) | High-CPU (1GB/vCPU) |
|-------|---------------------|---------------------|------------------------|
| 2 | n2-standard-2 | n2-highmem-2 | n2-highcpu-2 |
| 4 | n2-standard-4 | n2-highmem-4 | n2-highcpu-4 |
| 8 | n2-standard-8 | n2-highmem-8 | n2-highcpu-8 |
| 16 | n2-standard-16 | n2-highmem-16 | n2-highcpu-16 |
| 32 | n2-standard-32 | n2-highmem-32 | n2-highcpu-32 |
| 64 | n2-standard-64 | n2-highmem-64 | n2-highcpu-64 |
| 96 | n2-standard-96 | n2-highmem-96 | n2-highcpu-96 |
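The table can be turned into a small selection rule: take the smallest shape that covers the CPU request, then the family with the least memory per vCPU that still covers the memory request. An illustrative sketch (not part of the pipeline; it only knows the n2 shapes listed above):

```python
# Hypothetical helper: pick the smallest n2 machine type that satisfies
# a CPU/memory request, using the per-vCPU ratios from the table above.
N2_SIZES = [2, 4, 8, 16, 32, 64, 96]
GB_PER_VCPU = {"n2-highcpu": 1, "n2-standard": 4, "n2-highmem": 8}

def pick_machine_type(cpus: int, memory_gb: float) -> str:
    """Return the smallest n2 shape covering both cpus and memory_gb."""
    for size in N2_SIZES:
        if size < cpus:
            continue  # not enough vCPUs; try the next size up
        # cheapest family first (least GB per vCPU)
        for family, gb in sorted(GB_PER_VCPU.items(), key=lambda kv: kv[1]):
            if size * gb >= memory_gb:
                return f"{family}-{size}"
    raise ValueError("request exceeds the largest n2 shape in this table")
```

For example, 16 vCPUs with 64 GB resolves to `n2-standard-16`, while 4 vCPUs with 30 GB needs `n2-highmem-4`.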

## Usage

### Method 1: Label-Based (Recommended)

Add labels to your process definitions:

```groovy
process bwa_align {
label "alignment" // Will use settings from 'withLabel: alignment'

input:
...
}
```

Then configure in `config/google_batch_resources.config`:

```groovy
withLabel: alignment {
cpus = 16
memory = '64 GB'
machineType = 'n2-standard-16'
}
```

### Method 2: Process-Specific

For individual process tuning:

```groovy
withName: 'specific_process' {
cpus = 32
memory = '128 GB'
machineType = 'n2-highmem-32'
}
```

### Method 3: Dynamic Resources

Use `task.cpus` in your process scripts to automatically use allocated resources:

```groovy
process example {
script:
"""
bwa mem -t ${task.cpus} ...
samtools sort -@ ${task.cpus} ...
"""
}
```

This is better than hardcoding `${params.threads}` because it adapts to the configured resources.
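A common extension of this idea is to scale resources on retry after an out-of-memory kill. This is a sketch, not part of this repo's config: it assumes closure-valued directives (standard Nextflow) and that `machineType` accepts a dynamic value, and the process name is illustrative.

```groovy
// Hypothetical: grow resources on each retry. Note machineType must grow
// in lockstep with cpus/memory -- Google Batch will not resize it for you.
process {
    withName: 'bwa_align' {
        errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
        maxRetries    = 2
        cpus          = { 8 * task.attempt }
        memory        = { 32.GB * task.attempt }
        machineType   = { "n2-standard-${8 * task.attempt}" }
    }
}
```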

## Testing Resource Scaling

To verify a process uses the correct machine type:

1. Add a test process to your workflow:

```groovy
process test_resources {
label "alignment"

script:
"""
echo "Allocated CPUs: ${task.cpus}"
echo "Allocated Memory: ${task.memory}"
echo "Machine Type: ${task.machineType}"
nproc # Should match allocated CPUs
free -h # Should show allocated memory
"""
}
```

2. Check the Google Batch console to confirm the job used the expected machine type

## Common Pitfalls

### ❌ Don't do this:

```groovy
// Setting CPUs without machineType
process.cpus = 32 // Will try to run on default 4-CPU machine!
```

### ✅ Do this instead:

```groovy
withName: 'my_process' {
cpus = 32
memory = '128 GB'
machineType = 'n2-standard-32'
}
```

## Cost Optimization

- Start with smaller machine types and scale up only when needed
- Use labels to group similar processes
- Monitor your Cloud Billing to track costs per machine type
- Consider `n2-highcpu` for CPU-intensive, low-memory tasks
- Consider spot VMs via `google.batch.spot`, with `google.batch.maxSpotAttempts` controlling how often a reclaimed task is retried (already enabled in this config)
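The spot-related settings above look roughly like this in Nextflow config (option names per the Nextflow Google Batch documentation; verify against your Nextflow version, and the attempt count is illustrative):

```groovy
// Hypothetical spot VM configuration for Google Batch
google {
    batch {
        spot            = true   // request cheaper, preemptible spot VMs
        maxSpotAttempts = 3      // re-run a task if its spot VM is reclaimed
    }
}
```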

## Further Reading

- [Nextflow Google Batch Executor](https://www.nextflow.io/docs/latest/google.html#google-batch)
- [Google Compute Engine Machine Types](https://cloud.google.com/compute/docs/machine-types)
- [Nextflow Process Directives](https://www.nextflow.io/docs/latest/process.html#directives)
2 changes: 1 addition & 1 deletion nextflow/amr-on-wb/envs/AMR++_env.yaml
@@ -16,5 +16,5 @@ dependencies:
- pandas
- trimmomatic
- biopython
- nextflow
- nextflow = 24
- git
12 changes: 11 additions & 1 deletion nextflow/amr-on-wb/envs/containers/Dockerfile
@@ -20,6 +20,7 @@ RUN apt-get update -q && \
subversion \
wget \
g++ \
make \
libarchive13 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
@@ -59,5 +60,14 @@ RUN set -x && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
/opt/conda/bin/conda clean -afy && \
/opt/conda/bin/conda install -c conda-forge mamba && \
/opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 multiqc fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow && \
/opt/conda/bin/mamba install -c conda-forge -c bioconda git python=3.9 trimmomatic multiqc bwa samtools bedtools kraken2 fastqc krona bracken numpy pysam pandas biopython matplotlib nextflow=24 && \
conda clean --all

# Copy AMR++ bin scripts into the container
COPY bin /opt/amrplusplus/bin
RUN chmod +x /opt/amrplusplus/bin/*.py && \
chmod +x /opt/amrplusplus/bin/rarefaction && \
chmod +x /opt/amrplusplus/bin/resistome

# Add bin directory to PATH
ENV PATH="/opt/amrplusplus/bin:${PATH}"
40 changes: 20 additions & 20 deletions nextflow/amr-on-wb/modules/Alignment/bwa.nf
@@ -53,8 +53,8 @@ process bwa_align {
}

input:
path indexfiles
tuple val(pair_id), path(reads)
path indexfiles
tuple val(pair_id), path(reads)

output:
tuple val(pair_id), path("${pair_id}_alignment_sorted.bam"), emit: bwa_bam
@@ -63,25 +63,25 @@
script:
if( deduped == "N")
"""
${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
rm ${pair_id}_alignment.sam
${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
rm ${pair_id}_alignment.bam
"""
else if( deduped == "Y")
"""
${BWA} mem ${indexfiles[0]} ${reads} -t ${threads} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${threads} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
${BWA} mem ${indexfiles[0]} ${reads} -t ${task.cpus} -R '@RG\\tID:${pair_id}\\tSM:${pair_id}' > ${pair_id}_alignment.sam
${SAMTOOLS} view -@ ${task.cpus} -S -b ${pair_id}_alignment.sam > ${pair_id}_alignment.bam
rm ${pair_id}_alignment.sam
${SAMTOOLS} sort -@ ${threads} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
${SAMTOOLS} sort -@ ${task.cpus} -n ${pair_id}_alignment.bam -o ${pair_id}_alignment_sorted.bam
rm ${pair_id}_alignment.bam
${SAMTOOLS} fixmate -@ ${threads} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} sort -@ ${threads} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam
${SAMTOOLS} fixmate -@ ${task.cpus} ${pair_id}_alignment_sorted.bam ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} sort -@ ${task.cpus} ${pair_id}_alignment_sorted_fix.bam -o ${pair_id}_alignment_sorted_fix.sorted.bam
rm ${pair_id}_alignment_sorted_fix.bam
${SAMTOOLS} rmdup -S ${pair_id}_alignment_sorted_fix.sorted.bam ${pair_id}_alignment_dedup.bam
rm ${pair_id}_alignment_sorted_fix.sorted.bam
${SAMTOOLS} view -@ ${threads} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam
${SAMTOOLS} view -@ ${task.cpus} -h -o ${pair_id}_alignment_dedup.sam ${pair_id}_alignment_dedup.bam
rm ${pair_id}_alignment_dedup.sam
"""
else
@@ -93,8 +93,8 @@ process bwa_rm_contaminant_fq {
label "alignment"

errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
maxRetries 3
maxRetries 3

publishDir "${params.output}/HostRemoval", mode: "copy",
saveAs: { filename ->
if(filename.indexOf("fastq.gz") > 0) "NonHostFastq/$filename"
@@ -103,21 +103,21 @@

input:
path indexfiles
tuple val(pair_id), path(reads)
tuple val(pair_id), path(reads)

output:
tuple val(pair_id), path("${pair_id}.non.host.R*.fastq.gz"), emit: nonhost_reads
path("${pair_id}.samtools.idxstats"), emit: host_rm_stats

"""
${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${threads} > ${pair_id}.host.sam
${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${threads} -o ${pair_id}.host.sorted.bam
${BWA} mem ${indexfiles[0]} ${reads[0]} ${reads[1]} -t ${task.cpus} > ${pair_id}.host.sam
${SAMTOOLS} view -bS ${pair_id}.host.sam | ${SAMTOOLS} sort -@ ${task.cpus} -o ${pair_id}.host.sorted.bam
rm ${pair_id}.host.sam
${SAMTOOLS} index ${pair_id}.host.sorted.bam && ${SAMTOOLS} idxstats ${pair_id}.host.sorted.bam > ${pair_id}.samtools.idxstats
${SAMTOOLS} view -h -f 12 -b ${pair_id}.host.sorted.bam -o ${pair_id}.host.sorted.removed.bam
${SAMTOOLS} sort -n -@ ${threads} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam
${SAMTOOLS} sort -n -@ ${task.cpus} ${pair_id}.host.sorted.removed.bam -o ${pair_id}.host.resorted.removed.bam
${SAMTOOLS} \
fastq -@ ${threads} -c 6 \
fastq -@ ${task.cpus} -c 6 \
${pair_id}.host.resorted.removed.bam \
-1 ${pair_id}.non.host.R1.fastq.gz \
-2 ${pair_id}.non.host.R2.fastq.gz \
@@ -147,6 +147,6 @@ process HostRemovalStats {
path("host.removal.stats"), emit: combo_host_rm_stats

"""
${PYTHON3} $baseDir/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
${PYTHON3} /opt/amrplusplus/bin/samtools_idxstats.py -i ${host_rm_stats} -o host.removal.stats
"""
}