diff --git a/CADD.sh b/CADD.sh
index 5401eed..05e5fd4 100755
--- a/CADD.sh
+++ b/CADD.sh
@@ -14,7 +14,9 @@ where:
     -q  print basic information about snakemake run
     -p  print full information about the snakemake run
     -d  do not remove temporary directory for debug purposes
+    -t  specify location for temporary directory [default: /tmp/]
     -c  number of cores that snakemake is allowed to use [default: 1]
+    -M  maximum memory that snakemake is allowed to use [default: available memory]
 "

 unset OPTARG
@@ -30,7 +32,19 @@
 SIGNULARITYARGS=""
 VERBOSE="-q"
 CORES="1"
 RM_TMP_DIR=true
-while getopts ':ho:g:v:c:amr:qpd' option; do
+# set memory to available memory by default (can be lower than system memory in docker containers):
+if [[ -f "/sys/fs/cgroup/memory/memory.limit_in_bytes" ]]; then
+    # cgroup v1
+    MEMORY=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
+    MEMORY=$((MEMORY / 1024 / 1024 / 1024))
+elif [[ -f "/sys/fs/cgroup/memory.max" ]]; then
+    # cgroup v2; memory.max reads "max" when no limit is set
+    MEMORY=$(cat /sys/fs/cgroup/memory.max)
+    if [[ "$MEMORY" == "max" ]]; then MEMORY="0"; else MEMORY=$((MEMORY / 1024 / 1024 / 1024)); fi
+else
+    # if the cgroup files are not available, determine memory in snakemake
+    MEMORY="0"
+fi
+
+while getopts ':ho:g:v:c:M:amr:qpdt:' option; do
 case "$option" in
     h) echo "$usage"
        exit
@@ -43,6 +57,8 @@ while getopts ':ho:g:v:c:amr:qpd' option; do
        ;;
     c) CORES=$OPTARG
        ;;
+    M) MEMORY=$OPTARG
+       ;;
     a) ANNOTATION=true
        ;;
     m) MAMBAONLY=true
@@ -55,6 +71,8 @@ while getopts ':ho:g:v:c:amr:qpd' option; do
        ;;
     d) RM_TMP_DIR=false
        ;;
+    t) TMP_PREFIX=$OPTARG
+       ;;
     \?) printf "illegal option: -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
@@ -80,6 +98,7 @@ then
     exit 1
 fi

+## default temporary directory prefix if -t was not specified
+TMP_PREFIX="${TMP_PREFIX:-/tmp/}"

 ### Configuring all the paths

 FILENAME=$(basename $INFILE)
@@ -123,7 +142,7 @@ fi

 # Setup temporary folder that is removed reliably on exit and is outside of
 # the CADD-scripts directory.
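The cgroup-based memory detection added to CADD.sh above is easy to get wrong (cgroup v1 vs v2 paths, and v2 reporting the literal string "max" when unlimited). A minimal, self-contained Python sketch of the same logic — illustrative only, not part of the patch, and the function name is made up:

    import os

    def detect_memory_gb() -> int:
        """Return the container memory limit in GB, or 0 for 'decide later'."""
        for path in ("/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
                     "/sys/fs/cgroup/memory.max"):                   # cgroup v2
            if os.path.isfile(path):
                raw = open(path).read().strip()
                # cgroup v2 writes "max" when no limit is configured
                return int(raw) // 1024**3 if raw.isdigit() else 0
        return 0  # no cgroup info; the Snakefile falls back to psutil

A value of 0 is passed through as --config mem_gb=0 and tells the Snakefile to measure memory itself with psutil.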
-TMP_FOLDER=$(mktemp -d)
+TMP_FOLDER=$(mktemp -d -p "$TMP_PREFIX")
 if [ "$RM_TMP_DIR" = 'true' ]
 then
     trap "rm -rf $TMP_FOLDER" ERR EXIT
@@ -145,9 +164,12 @@ fi

 echo "Running snakemake pipeline:"

+# resources are passed so rules can divide the total load across cores & gpus
 command="snakemake $TMP_OUTFILE \
+    --resources cpu_load=100 --resources gpu_load=100 \
     --sdm conda $SIGNULARITYARGS --conda-prefix $CADD/envs/conda \
     --cores $CORES --configfile $CONFIG \
+    --config mem_gb=$MEMORY \
     --snakefile $CADD/Snakefile $VERBOSE"

 echo -e $command
diff --git a/Dockerfile b/Dockerfile
index e40171f..ce7e16a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,29 +1,67 @@
-FROM condaforge/mambaforge:latest
-LABEL io.github.snakemake.containerized="true"
-LABEL io.github.snakemake.conda_env_hash="cb2c51dd0ad3df620c4914840c5ef6f5570a5ffd8cfd54cec57d2ffef0a76b08"
+##########################
+# CADD-scripts container #
+##########################

-# Step 1: Retrieve conda environments
+# includes:
+# - the CADD scripts
+# - dependencies

-RUN mkdir -p /conda-envs/a4fcaaffb623ea8aef412c66280bd623
-COPY envs/environment_minimal.yml /conda-envs/a4fcaaffb623ea8aef412c66280bd623/environment.yaml
+FROM ubuntu:24.04

-RUN mkdir -p /conda-envs/ef25c8d726aebbe9e0ee64fee6c3caa9
-COPY envs/esm.yml /conda-envs/ef25c8d726aebbe9e0ee64fee6c3caa9/environment.yaml
+## needed apt packages
+ARG BUILD_PACKAGES="wget git ssh bzip2 curl axel"
+# needed conda packages (only packages not pulled in by the CADD conda environments)
+ARG CONDA_PACKAGES="python==3.12.3 snakemake==8.16.0"
+ENV MAMBA_ROOT_PREFIX=/opt/conda/
+## ENV SETTINGS during runtime
+ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
+ENV PATH=/opt/conda/bin:/opt/CADD-scripts/:$PATH
+ENV DEBIAN_FRONTEND noninteractive
+ENV CADD=/opt/CADD-scripts
+SHELL ["/bin/bash", "-l", "-c"]

-RUN mkdir -p /conda-envs/7f88b844a05ae487b7bb6530b5e6a90c
-COPY envs/mmsplice.yml /conda-envs/7f88b844a05ae487b7bb6530b5e6a90c/environment.yaml
+# install base packages
+RUN echo "Acquire::http::Pipeline-Depth 0;" > /etc/apt/apt.conf.d/99fixbadproxy && \
+    echo "Acquire::http::No-Cache true;" >> /etc/apt/apt.conf.d/99fixbadproxy && \
+    echo "Acquire::BrokenProxy true;" >> /etc/apt/apt.conf.d/99fixbadproxy && \
+    apt-get -y update && \
+    apt-get -y upgrade && \
+    apt-get install -y $BUILD_PACKAGES && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

-RUN mkdir -p /conda-envs/dfc51ced08aaeb4cbd3dcd509dec0fc5
-COPY envs/regulatorySequence.yml /conda-envs/dfc51ced08aaeb4cbd3dcd509dec0fc5/environment.yaml
+# Install conda/miniforge3
+RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \
+    /bin/bash Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda && \
+    rm Miniforge3-$(uname)-$(uname -m).sh && \
+    mamba install -y -c conda-forge -c bioconda $CONDA_PACKAGES && \
+    conda clean --tarballs --index-cache --packages --yes && \
+    conda config --set channel_priority strict && \
+    echo ". /opt/conda/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \
+    echo ". /opt/conda/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc

-RUN mkdir -p /conda-envs/89fe1049cc18768b984c476c399b7989
-COPY envs/vep.yml /conda-envs/89fe1049cc18768b984c476c399b7989/environment.yaml
+# install cadd & run test file to generate all envs
+RUN cd /opt && \
+    git clone --branch Fix/max_memory https://github.com/geertvandeweyer/CADD-scripts.git
+    #cd CADD-scripts && \
+    #snakemake test/input.vcf \
+    #    --software-deployment-method conda \
+    ##   --conda-create-envs-only \
+    #    --conda-prefix envs/conda \
+    #    --configfile config/config_GRCh38_v1.7.yml \
+    #    --snakefile Snakefile -c 1

-# Step 2: Generate conda environments
+#COPY Install_Annotations.sh /opt/CADD-scripts/Install_Annotations.sh
+RUN chmod a+x /opt/CADD-scripts/Install_Annotations.sh

-RUN mamba env create --prefix /conda-envs/a4fcaaffb623ea8aef412c66280bd623 --file /conda-envs/a4fcaaffb623ea8aef412c66280bd623/environment.yaml && \
-    mamba env create --prefix /conda-envs/ef25c8d726aebbe9e0ee64fee6c3caa9 --file /conda-envs/ef25c8d726aebbe9e0ee64fee6c3caa9/environment.yaml && \
-    mamba env create --prefix /conda-envs/7f88b844a05ae487b7bb6530b5e6a90c --file /conda-envs/7f88b844a05ae487b7bb6530b5e6a90c/environment.yaml && \
-    mamba env create --prefix /conda-envs/dfc51ced08aaeb4cbd3dcd509dec0fc5 --file /conda-envs/dfc51ced08aaeb4cbd3dcd509dec0fc5/environment.yaml && \
-    mamba env create --prefix /conda-envs/89fe1049cc18768b984c476c399b7989 --file /conda-envs/89fe1049cc18768b984c476c399b7989/environment.yaml && \
-    mamba clean --all -y
+## some follow-up instructions are needed:
+RUN echo "WARNING: CADD-scripts installed. To use the container, the following commands are needed: "
+RUN echo "# download the annotation sources"
+RUN echo "docker run -v /mnt/CADD_data:/opt/CADD-scripts/data my-cadd-scripts:my_version /opt/CADD-scripts/Install_Annotations.sh /opt/CADD-scripts/data GRCh38"
+RUN echo "# run the script on the test data to prepare all conda envs"
+RUN echo "docker run --name prep-container -w /opt/CADD-scripts -v /mnt/CADD_data/annotations:/opt/CADD-scripts/data/annotations -v /mnt/CADD_data/prescored:/opt/CADD-scripts/data/prescored my-cadd-scripts:my_version bash -c 'snakemake test/input.tsv.gz --resources cpu_load=100 --resources gpu_load=100 --sdm conda --conda-prefix /opt/CADD-scripts/envs/conda --configfile /opt/CADD-scripts/config/config_GRCh38_v1.7_noanno.yml --snakefile /opt/CADD-scripts/Snakefile -c 1 ; rm -Rf /opt/CADD-scripts/test/input_splits /opt/CADD-scripts/test/input.chunk* /opt/CADD-scripts/test/input.*.log /opt/conda/pkgs/*' "
+RUN echo "# commit the changes to the image"
+RUN echo "docker commit prep-container my-cadd-scripts:my_version"
\ No newline at end of file
diff --git a/Install_Annotations.sh b/Install_Annotations.sh
new file mode 100644
index 0000000..01dee7f
--- /dev/null
+++ b/Install_Annotations.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# need two arguments:
+# 1. target
+# 2. build
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <target> <build>"
+    exit 1
+fi
+TARGET=$1
+BUILD=$2
+
+# LOCATIONS:
+DOWNLOAD_LOCATION=https://krishna.gs.washington.edu/download/CADD
+
+# supported builds: GRCh37, GRCh38
+if [ "$BUILD" != "GRCh37" ] && [ "$BUILD" != "GRCh38" ]; then
+    echo "Usage: $0 <target> <build>"
+    echo "Supported builds: GRCh37, GRCh38"
+    exit 1
+fi
+
+## ANNOTATIONS
+echo "1. ANNOTATIONS"
+mkdir -p $TARGET/annotations/
+cd $TARGET/annotations/
+URL="$DOWNLOAD_LOCATION/v1.7/$BUILD/${BUILD}_v1.7.tar.gz"
+echo " - download"
+axel -a "$URL"
+axel -a "$URL.md5"
+echo " - md5sum"
+md5sum -c ${BUILD}_v1.7.tar.gz.md5
+echo " - untar"
+tar -xzvf ${BUILD}_v1.7.tar.gz
+rm ${BUILD}_v1.7.tar.gz
+rm ${BUILD}_v1.7.tar.gz.md5
+
+## PRESCORED
+echo "2. PRESCORED"
+mkdir -p $TARGET/prescored/${BUILD}_v1.7/noanno/
+cd $TARGET/prescored/${BUILD}_v1.7/noanno/
+URL="$DOWNLOAD_LOCATION/v1.7/$BUILD/whole_genome_SNVs.tsv.gz"
+echo " - download"
+axel -a "$URL"
+axel -a "$URL.md5"
+axel -a "$URL.tbi"
+axel -a "$URL.tbi.md5"
+URL="$DOWNLOAD_LOCATION/v1.7/$BUILD/gnomad.genomes.r4.0.indel.tsv.gz"
+axel -a "$URL"
+axel -a "$URL.md5"
+axel -a "$URL.tbi"
+axel -a "$URL.tbi.md5"
+echo " - md5sum"
+md5sum -c *.md5
+rm *.md5
diff --git a/Snakefile b/Snakefile
index d53587e..b22bc65 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,6 +6,8 @@ Note that we are declaring many files temporary here that should be located in a

 # container with conda environments
 containerized: "docker://visze/cadd-scripts-v1_7:0.1.0"

+# psutil and subprocess are used below to detect system and GPU memory
+import psutil
+import subprocess

 # Min version of snakemake
 from snakemake.utils import min_version
@@ -20,24 +22,97 @@ validate(config, schema="schemas/config_schema.yaml")

 # CADD environment variable
 import os

+####################################
+## RULE-SPECIFIC THREADING LIMITS ##
+####################################
+if int(config.get("mem_gb", 0)) == 0:
+    system_memory = psutil.virtual_memory().total / (1024 ** 3)
+else:
+    system_memory = int(config["mem_gb"])
+try:
+    lines = subprocess.check_output(
+        "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null",
+        shell=True,
+    ).decode('utf-8').splitlines()
+    # sum over gpu(s); nvidia-smi reports MiB
+    gpu_memory = int(sum([int(x) for x in lines]) / 1024)
+    # nr of gpus
+    gpu_count = len(lines)
+except Exception:
+    gpu_memory = 0
+    gpu_count = 0
+
+###############
+## GPU TASKS ##
+###############
+# esm takes ~16 GB of system memory per thread & 14 GB of GPU RAM (if available).
+# if a gpu is available, we can use it for esm: one thread per gpu.
+if gpu_count > 0:
+    # one esm thread per gpu
+    config['esm_gpu_slots'] = int(0.9 * gpu_memory / 14)
+else:
+    config['esm_gpu_slots'] = 0
+config['esm_cpu_slots'] = int(max(1, 0.9 * system_memory / 16))
+config['esm_cpu_load'] = int(100 / config['esm_cpu_slots'])
+config['esm_gpu_load'] = 100 if config['esm_gpu_slots'] < 1 else int(100 / config['esm_gpu_slots'])
+config['esm_cpu_threads'] = max(1, int(workflow.cores / config['esm_cpu_slots']))
+
+###############
+## CPU TASKS ##
+###############
+# then assign the other resources; the max(1, ...) guards avoid divisions by zero
+# on machines with very little (or very much) memory
+config['vep_cpu_load'] = max(1, int(100 / max(1, int(0.9 * system_memory / 4))))  # up to 4GB of RAM each
+config['vep_cpu_threads'] = max(1, int(workflow.cores / (100 / config['vep_cpu_load'])))
+config['regseq_cpu_load'] = max(1, int(100 / max(1, int(0.9 * system_memory / 2))))  # up to 2GB of RAM each
+config['mms_cpu_load'] = max(1, int(100 / max(1, int(0.9 * system_memory / 16))))  # up to 16GB of RAM each
+config['mms_cpu_threads'] = max(1, int(workflow.cores / (100 / config['mms_cpu_load'])))
+config['anno_cpu_load'] = 1  # disk IO intensive
+config['impute_cpu_load'] = 1
+config['prescore_cpu_load'] = 10  # disk IO intensive
+config['score_cpu_load'] = 1
+
+print("Threading Overview")
+print("##################")
+print("Assigned cores: {}".format(workflow.cores))
+print("Available system memory: {}GB".format(int(system_memory)))
+if gpu_memory > 0:
+    print("Total GPU memory: {}GB".format(gpu_memory))
+else:
+    print("No GPU found")
+
+# print threading limits
+print("Task Parallelization:")
+print("  PreScore : {}x".format(min(workflow.cores, int(100 / config['prescore_cpu_load']))))
+print("  VEP      : {}x with {} threads each".format(min(workflow.cores, int(100 / config['vep_cpu_load'])), config['vep_cpu_threads']))
+print("  ESM : CPU : {}x with {} threads each (memory constraints)".format(min(workflow.cores, config['esm_cpu_slots']), config['esm_cpu_threads']))
+print("  ESM : GPU : {}x (gpu memory constraints)".format(config['esm_gpu_slots']))
+print("  RegSeq   : {}x".format(min(workflow.cores, int(100 / config['regseq_cpu_load']))))
+print("  MMsplice : {}x with {} threads each (memory constraints)".format(min(workflow.cores, int(100 / config['mms_cpu_load'])), config['mms_cpu_threads']))
+print("  Annotate : {}x".format(min(workflow.cores, int(100 / config['anno_cpu_load']))))
+print("  Impute   : {}x".format(min(workflow.cores, int(100 / config['impute_cpu_load']))))
+
+# flush output before starting workflow
+print("Starting workflow", flush=True)
+
+
+## allowed scattering
+scattergather:
+    split=workflow.cores,

 envvars:
     "CADD",

-# wildcard_constraints:
-# file=".*(?> {log}
+        cat {input.vcf} \
         | python {params.cadd}/src/scripts/VCF2vepVCF.py \
+        | (grep -v '^#' || true) \
+        | sed 's/^chr//' \
         | sort -k1,1 -k2,2n -k4,4 -k5,5 \
-        | uniq > {output} 2> {log}
+        | uniq > {wildcards.file}_splits/full.vcf 2> {log}
+
+        # split
+        LC=$(wc -l {wildcards.file}_splits/full.vcf | cut -f1 -d' ')
+        ## if zero, stop the workflow here (as success)
+        if [[ "$LC" -eq 0 ]]; then
+            echo "No variants to process. Exiting." >> {log}
+            # put the expected (empty) chunk files in place for the downstream steps
+            for i in $(seq 1 {params.threads}); do
+                touch {wildcards.file}_splits/chunk_$i-of-{params.threads}.prepared.vcf
+            done
+            exit 0
+        fi
+
+        LC=$(((LC / {params.threads}) + 1))
+
+        split -l $LC --numeric-suffixes=1 --additional-suffix="-of-{params.threads}.prepared.vcf" {wildcards.file}_splits/full.vcf {wildcards.file}_splits/chunk_ 2>> {log}
+
+        rm -f {wildcards.file}_splits/full.vcf
+
+        # strip padding zeros from the file names
+        for f in {wildcards.file}_splits/chunk_*.prepared.vcf
+        do
+            target=$(echo "$f" | sed -E 's/(chunk_)0*([1-9][0-9]*)(-of-{params.threads}\\.prepared\\.vcf)/\\1\\2\\3/')
+            if [ "$f" != "$target" ]; then
+                mv -n "$f" "$target"
+            fi
+        done
         """
@@ -70,30 +181,38 @@ checkpoint prescore:
     conda:
         "envs/environment_minimal.yml"
     input:
-        vcf="{file}.prepared.vcf",
+        vcf="{file}_splits/chunk_{chunk}.prepared.vcf",
         prescored="%s/%s" % (os.environ["CADD"], config["PrescoredFolder"]),
     output:
-        novel=temp("{file}.novel.vcf"),
-        prescored=temp("{file}.pre.tsv"),
+        novel="{file}_splits/chunk_{chunk}.novel.vcf",
+        prescored="{file}_splits/chunk_{chunk}.pre.tsv",
     log:
-        "{file}.prescore.log",
+        "{file}.chunk_{chunk}.prescore.log",
     params:
         cadd=os.environ["CADD"],
+    resources:
+        # < 1GB of memory per task
+        cpu_load=int(config['prescore_cpu_load']),
     shell:
         """
         # Prescoring
        echo '## Prescored variant file' > {output.prescored} 2> {log};
+        # are prescored files available?
        PRESCORED_FILES=`find -L {input.prescored} -maxdepth 1 -type f -name \\*.tsv.gz | wc -l`
        cp {input.vcf} {input.vcf}.new
        if [ ${{PRESCORED_FILES}} -gt 0 ];
        then
+            # loop over all prescored files: snv, indel, ...
            for PRESCORED in $(ls {input.prescored}/*.tsv.gz)
            do
+                # extract_scored writes found variants to --found_out and not-found ones to stdout
                cat {input.vcf}.new \
                | python {params.cadd}/src/scripts/extract_scored.py --header \
                    -p $PRESCORED --found_out={output.prescored}.tmp \
                > {input.vcf}.tmp 2>> {log};
+                # append the prescored variants to the outfile
                cat {output.prescored}.tmp >> {output.prescored}
+                # put the not-found variants back into the input
                mv {input.vcf}.tmp {input.vcf}.new &> {log};
            done;
            rm {output.prescored}.tmp &>> {log}
@@ -106,16 +225,22 @@ rule annotation_vep:
     conda:
         "envs/vep.yml"
     input:
-        vcf="{file}.novel.vcf",
+        vcf="{file}_splits/chunk_{chunk}.novel.vcf",
         veppath="%s/%s" % (os.environ["CADD"], config["VEPpath"]),
     output:
-        temp("{file}.vep.vcf.gz"),
+        "{file}_splits/chunk_{chunk}.vep.vcf.gz",
     log:
-        "{file}.annotation_vep.log",
+        "{file}.chunk_{chunk}.annotation_vep.log",
     params:
         cadd=os.environ["CADD"],
         genome_build=config["GenomeBuild"],
         ensembl_db=config["EnsemblDB"],
+        threads=config['vep_cpu_threads'],
+    resources:
+        # up to 4GB of memory per task
+        cpu_load=int(config['vep_cpu_load']),
+    threads: config['vep_cpu_threads']
     shell:
         """
         cat {input.vcf} \
@@ -124,6 +249,7 @@ rule annotation_vep:
             --db_version={params.ensembl_db} --assembly {params.genome_build} \
             --format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains \
             --numbers --canonical --total_length --vcf --force_overwrite --output_file STDOUT \
+            --fork {params.threads} \
         | bgzip -c > {output} 2> {log}
         """


@@ -132,7 +258,8 @@ rule annotate_esm:
     conda:
         "envs/esm.yml"
     input:
-        vcf="{file}.vep.vcf.gz",
+        vcf="{file}_splits/chunk_{chunk}.vep.vcf.gz",
         models=expand(
             "{path}/{model}.pt",
             path=config["ESMpath"],
@@ -140,15 +267,22 @@
         ),
         transcripts="%s/pep.%s.fa" % (config["ESMpath"], config["EnsemblDB"]),
     output:
-        missens=temp("{file}.esm_missens.vcf.gz"),
frameshift=temp("{file}.esm_frameshift.vcf.gz"), - final=temp("{file}.esm.vcf.gz"), + missens="{file}_splits/chunk_{chunk}.esm_missens.vcf.gz", + frameshift="{file}_splits/chunk_{chunk}.esm_frameshift.vcf.gz", + final="{file}_splits/chunk_{chunk}.esm.vcf.gz", log: - "{file}.annotate_esm.log", + "{file}.chunk_{chunk}.annotate_esm.log", + resources: + cpu_load=int(config['esm_cpu_load']), + gpu_load=int(config['esm_gpu_load']), + threads: + config['esm_cpu_threads'], params: cadd=os.environ["CADD"], models=["--model %s " % model for model in config["ESMmodels"]], batch_size=config["ESMbatchsize"], + #header=config["Header"], + shell: """ model_directory=`dirname {input.models[0]}`; @@ -171,6 +305,8 @@ rule annotate_esm: --transcripts {input.transcripts} \ --model-directory $model_directory {params.models} \ --output {output.final} --batch-size {params.batch_size} &>> {log} + + #rm -f {wildcards.file}.esm_in.vcf.gz """ @@ -178,17 +314,20 @@ rule annotate_regseq: conda: "envs/regulatorySequence.yml" input: - vcf="{file}.esm.vcf.gz", + vcf="{file}_splits/chunk_{chunk}.esm.vcf.gz", reference="%s/%s" % (config["REGSEQpath"], "reference.fa"), genome="%s/%s" % (config["REGSEQpath"], "reference.fa.genome"), model="%s/%s" % (config["REGSEQpath"], "Hyperopt400InclNegatives.json"), weights="%s/%s" % (config["REGSEQpath"], "Hyperopt400InclNegatives.h5"), output: - temp("{file}.regseq.vcf.gz"), + "{file}_splits/chunk_{chunk}.regseq.vcf.gz", log: - "{file}.annotate_regseq.log", + "{file}.chunk_{chunk}.annotate_regseq.log", params: cadd=os.environ["CADD"], + resources: + # roughly 4GB of memory + cpu_load=int(config['regseq_cpu_load']), shell: """ python {params.cadd}/src/scripts/lib/tools/regulatorySequence/predictVariants.py \ @@ -205,24 +344,45 @@ rule annotate_mmsplice: conda: "envs/mmsplice.yml" input: - vcf="{file}.regseq.vcf.gz", + vcf="{file}_splits/chunk_{chunk}.regseq.vcf.gz", transcripts="%s/homo_sapiens.110.gtf" % config.get("MMSPLICEpath", ""), reference="%s/reference.fa" % config.get("REFERENCEpath", ""), output: - mmsplice=temp("{file}.mmsplice.vcf.gz"), - idx=temp("{file}.regseq.vcf.gz.tbi"), + mmsplice="{file}_splits/chunk_{chunk}.mmsplice.vcf.gz", + # needed ? + idx="{file}_splits/chunk_{chunk}.regseq.vcf.gz.tbi", log: - "{file}.annotate_mmsplice.log", + "{file}.chunk_{chunk}.annotate_mmsplice.log", params: cadd=os.environ["CADD"], + mms_threads=config['mms_cpu_threads'] # Assigning the number of threads for mmsplice + resources: + cpu_load=int(config['mms_cpu_load']), + threads: + config['mms_cpu_threads'], shell: """ - tabix -p vcf {input.vcf} &> {log}; - KERAS_BACKEND=tensorflow python {params.cadd}/src/scripts/lib/tools/MMSplice.py -i {input.vcf} \ - -g {input.transcripts} \ - -f {input.reference} | \ - grep -v '^Variant(CHROM=' | \ - bgzip -c > {output.mmsplice} 2>> {log} + # mmsplice crashes on empty vcf files: evaluate nr of variants left + LC=$(bgzip -dc {input.vcf} | grep -c -v '#' || true) + bgzip -dc {input.vcf} > /dev/null + echo "nr of lines: " + if [[ "$LC" -eq 0 ]]; then + echo "Empty VCF file, skipping mmsplice annotation." 
>> {log} + tabix -p vcf {input.vcf} &>> {log}; + cp {input.vcf} {output.mmsplice} + else + # set parallelism for tensorflow : + export OMP_NUM_THREADS={params.mms_threads} + export TF_NUM_INTRAOP_THREADS={params.mms_threads} + export TF_NUM_INTEROP_THREADS={params.mms_threads} + # annotate + tabix -p vcf {input.vcf} &> {log}; + KERAS_BACKEND=tensorflow python {params.cadd}/src/scripts/lib/tools/MMSplice.py -i {input.vcf} \ + -g {input.transcripts} \ + -f {input.reference} | \ + grep -v '^Variant(CHROM=' | \ + bgzip -c > {output.mmsplice} 2>> {log} + fi """ @@ -230,15 +390,17 @@ rule annotation: conda: "envs/environment_minimal.yml" input: - vcf=lambda wc: "{file}.%s.vcf.gz" + vcf=lambda wc: "{file}_splits/chunk_{chunk}.%s.vcf.gz" % ("mmsplice" if config["GenomeBuild"] == "GRCh38" else "regseq"), reference_cfg="%s/%s" % (os.environ["CADD"], config["ReferenceConfig"]), output: - temp("{file}.anno.tsv.gz"), + "{file}_splits/chunk_{chunk}.anno.tsv.gz", log: - "{file}.annotation.log", + "{file}.chunk_{chunk}.annotation.log", params: cadd=os.environ["CADD"], + resources: + cpu_load=int(config['anno_cpu_load']), shell: """ zcat {input.vcf} \ @@ -252,14 +414,16 @@ rule imputation: conda: "envs/environment_minimal.yml" input: - tsv="{file}.anno.tsv.gz", + tsv="{file}_splits/chunk_{chunk}.anno.tsv.gz", impute_cfg="%s/%s" % (os.environ["CADD"], config["ImputeConfig"]), output: - temp("{file}.csv.gz"), + "{file}_splits/chunk_{chunk}.csv.gz", log: - "{file}.imputation.log", + "{file}.chunk_{chunk}.imputation.log", params: cadd=os.environ["CADD"], + resources: + cpu_load=int(config['impute_cpu_load']), shell: """ zcat {input.tsv} \ @@ -272,18 +436,20 @@ rule score: conda: "envs/environment_minimal.yml" input: - impute="{file}.csv.gz", - anno="{file}.anno.tsv.gz", + impute="{file}_splits/chunk_{chunk}.csv.gz", + anno="{file}_splits/chunk_{chunk}.anno.tsv.gz", conversion_table="%s/%s" % (os.environ["CADD"], config["ConversionTable"]), model_file="%s/%s" % (os.environ["CADD"], config["Model"]), output: - temp("{file}.novel.tsv"), + "{file}_splits/chunk_{chunk}.novel.tsv", log: - "{file}.score.log", + "{file}.chunk_{chunk}.score.log", params: cadd=os.environ["CADD"], use_anno=config["Annotation"], columns=config["Columns"], + resources: + cpu_load=config['score_cpu_load'], shell: """ python {params.cadd}/src/scripts/predictSKmodel.py \ @@ -300,21 +466,39 @@ rule score: """ -def aggregate_input(wildcards): - with checkpoints.prescore.get(file=wildcards.file).output["novel"].open() as f: - output = ["{file}.pre.tsv"] - for line in f: - if line.strip() != "": - output.append("{file}.novel.tsv") - break - return output +# def aggregate_input(wildcards): +# # Find all chunk files for the given wildcard +# chunk_files = glob.glob(f"{wildcards.file}_splits/{wildcards.file}.chunk_*.novel.vcf") +# pre_files = glob.glob(f"{wildcards.file}_splits/{wildcards.file}.chunk_*.pre.tsv") +# +# # Combine the novel and prescore chunk files if not empty +# output = [f for f in chunk_files + pre_files if os.path.getsize(f) > 0] +# if not output: +# # no output : make empty file +# open(f"{wildcards.file}.empty", "w").close() +# output = [f"{wildcards.file}.empty"] +# +# return output + + + +#def aggregate_input(wildcards): +# with checkpoints.prescore.get(file=wildcards.file).output["novel"].open() as f: +# output = ["{file}.pre.tsv"] +# for line in f: +# if line.strip() != "": +# output.append("{file}.novel.tsv") +# break +# return output rule join: conda: "envs/environment_minimal.yml" input: - aggregate_input, + 
+        #aggregate_input,
+        pre=gather.split("{{file}}_splits/chunk_{scatteritem}.pre.tsv"),
+        scored=gather.split("{{file}}_splits/chunk_{scatteritem}.novel.tsv"),
     output:
         "{file,.+(? {output} 2>> {log};
diff --git a/envs/esm.yml b/envs/esm.yml
index a364f54..ad6ea64 100644
--- a/envs/esm.yml
+++ b/envs/esm.yml
@@ -6,7 +6,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - numpy
+  - numpy<2
   - python
   - click
   - biopython
diff --git a/schemas/config_schema.yaml b/schemas/config_schema.yaml
index 3f77190..b89bd27 100644
--- a/schemas/config_schema.yaml
+++ b/schemas/config_schema.yaml
@@ -77,6 +77,10 @@ properties:
   Columns:
     type: string
     pattern: "^[0-9]+(-[0-9]+)?(,[0-9]+(-[0-9]+)?)*$"
+  mem_gb:
+    type: integer
+    minimum: 0
+    description: Available memory in GB, used to scale maximum thread counts
 additionalProperties: false
diff --git a/src/scripts/lib/tools/esmScore/esmScore_frameshift_av.py b/src/scripts/lib/tools/esmScore/esmScore_frameshift_av.py
index f6ba8bf..b46a6e2 100644
--- a/src/scripts/lib/tools/esmScore/esmScore_frameshift_av.py
+++ b/src/scripts/lib/tools/esmScore/esmScore_frameshift_av.py
@@ -368,6 +368,12 @@ def cli(
             data_alt.append((transcript_id[i], aa_seq_alt[i][-window:]))

     ref_alt_scores = []
+    # preload the model(s) once; reloading them in the loops below caused heavy disk I/O
+    preloaded_models = list()
+    for k in range(0, len(modelsToUse), 1):
+        model, alphabet = pretrained.load_model_and_alphabet(modelsToUse[k])
+        preloaded_models.append([model, alphabet])
     # load esm model(s)
     for o in range(0, len([data_ref, data_alt]), 1):
         data = [data_ref, data_alt][o]
@@ -375,7 +381,8 @@
         if len(data) >= 1:
             for k in range(0, len(modelsToUse), 1):
                 torch.cuda.empty_cache()
-                model, alphabet = pretrained.load_model_and_alphabet(modelsToUse[k])
+                # fetch the preloaded model instead of reloading it from disk
+                model, alphabet = preloaded_models[k]
                 model.eval()  # disables dropout for deterministic results
                 batch_converter = alphabet.get_batch_converter()
@@ -386,6 +393,7 @@
                 # apply esm model to sequence; tokenProbs holds the probabilities of all aa at each position, based on the sequence in "data"
                 seq_scores = []
                 for t in range(0, len(data), batch_size):
+                    # process the sequences in batches of batch_size
                     if t + batch_size > len(data):
                         batch_data = data[t:]
                     else:
@@ -423,9 +431,9 @@
                 else:
                     # calc mean of all possible aa at this position
                     aa_scores = []
-                    for k in range(4, 24, 1):
+                    for l in range(4, 24, 1):
                         aa_scores.append(
-                            token_probs[i, y + 1, k]
+                            token_probs[i, y + 1, l]
                         )  # for all aa (except selenocysteine)
                     aa_scores.append(
                         token_probs[i, y + 1, 26]
diff --git a/src/scripts/lib/tools/esmScore/esmScore_inFrame_av.py b/src/scripts/lib/tools/esmScore/esmScore_inFrame_av.py
index a8ca043..8a1b0af 100644
--- a/src/scripts/lib/tools/esmScore/esmScore_inFrame_av.py
+++ b/src/scripts/lib/tools/esmScore/esmScore_inFrame_av.py
@@ -464,6 +464,11 @@ def cli(input_file, transcript_file, model_directory, modelsToUse, output_file,
             data_alt.append((transcript_id[i], aa_seq_alt[i][-window:]))

     ref_alt_scores = []
+    # preload the model(s) once, as above; reloading per loop iteration caused heavy disk I/O
+    preloaded_models = list()
+    for k in range(0, len(modelsToUse), 1):
+        model, alphabet = pretrained.load_model_and_alphabet(modelsToUse[k])
+        preloaded_models.append([model, alphabet])
     # load esm model(s)
     for o in range(0, len([data_ref, data_alt]), 1):
         data = [data_ref, data_alt][o]
@@ -471,7 +476,8 @@ def cli(input_file, transcript_file, model_directory, modelsToUse, output_file,
         if len(data) >= 1:
             for k in range(0, len(modelsToUse), 1):
                 torch.cuda.empty_cache()
-                model, alphabet = pretrained.load_model_and_alphabet(modelsToUse[k])
+                # fetch the preloaded model instead of reloading it from disk
+                model, alphabet = preloaded_models[k]
                 model.eval()  # disables dropout for deterministic results
                 batch_converter = alphabet.get_batch_converter()
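Taken together, the two esmScore changes replace a per-iteration pretrained.load_model_and_alphabet() call with a single preload pass. A condensed sketch of the before/after behaviour (names are illustrative; load_model stands in for the ESM loader):

    def load_model(name):
        # stand-in for pretrained.load_model_and_alphabet(name)
        print(f"loading {name} from disk")
        return object(), object()

    models_to_use = ["model_a", "model_b"]

    # after the patch: one disk read per model ...
    preloaded = [load_model(m) for m in models_to_use]

    # ... instead of one per (allele, model) combination, as before
    for allele in ("ref", "alt"):
        for k, _ in enumerate(models_to_use):
            model, alphabet = preloaded[k]  # cheap list lookup, no disk I/O
            # ... scoring of `allele` with `model` happens here ...

With two alleles and several models per run, this removes all but the first disk read of each multi-gigabyte checkpoint.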