From 8e9fe02ad0ca8ea178c8f9fd3d6e86ab5b9cef3b Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 2 Nov 2023 08:04:06 +0100 Subject: [PATCH 01/31] Add gitignore --- .gitignore | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ee726bf4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,264 @@ +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix From 156c79e6c144580cc24f156a7a330d1058cdcc41 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 2 Nov 2023 08:10:03 +0100 Subject: [PATCH 02/31] custom script to run outrider, not working yet. --- Outrider/1.20.0/outrider.R | 95 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 Outrider/1.20.0/outrider.R diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R new file mode 100644 index 00000000..f249fda5 --- /dev/null +++ b/Outrider/1.20.0/outrider.R @@ -0,0 +1,95 @@ +#!/usr/bin/env Rscript + +# Import statements, alphabetic order of main package. +suppressPackageStartupMessages(library("argparse")) +suppressPackageStartupMessages(library("OUTRIDER")) +suppressPackageStartupMessages(library("tools")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) + +# Argument parser +parser <- ArgumentParser(description = "Process some integers") +parser$add_argument("query", metavar = "query_input_files", nargs = "+", + help = "Files or directories containing the query input files.") +parser$add_argument("output_path", metavar = "output_path", nargs = "+", + help = "Path where output of OUTRIDER will be stored.") +parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", + help = "Files or directories containing the reference input files.") +args <- parser$parse_args() + + +read_input_files <- function(input){ + sampleIDs <- c() + count_tables <- lapply(input, function(f) { + input_ext <- tools::file_ext(f) + if(input_ext == "Rds") { + rse <- readRDS(f) # RangedSummarizedExperiment + sampleIDs <- append(sampleIDs, colnames(rse)) + return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) + } else if (input_ext %in% c("txt", "tsv")) { + count_table <- read_delim(f, show_col_types=FALSE) + sampleIDs <- append(sampleIDs, colnames(count_table)) + return(count_table) + } else { + stop("Input file extension is not supported.") + } + }) + return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) +} + + +get_input <- function(input){ + # If directories are provided + if(all(sapply(input, function(x) dir.exists(x)))) { + retrieved_files <- sapply(input, function(d){ + list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) + }) + } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. + retrieved_files <- input + } else { # If both dir and files are provided, or different tyes (character, int etc) + stop("Input is neither dir or file.") + } + + return(read_input_files(retrieved_files)) +} + + +merge_count_tables <- function(lst_query, lst_ref){ + # merge count tables together. + lst_count_tables <- c(lst_query, lst_ref) + all_counts <- lst_count_tables %>% + Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) +} + + +run_outrider <- function(all_counts) { + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + # filter expression + ods <- OutriderDataSet(all_counts) + filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) + out <- OUTRIDER(filtered) + return(out) +} + + +save_output <- function(out_path, out_outrider, ref_samples, padj_thres=0.05, zscore_thres=0, all=True) { + res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=all)) + # Reference samples are excluded from final results. + query_res <- filter(res, !(sampleID %in% ref_samples)) + # Write output table with aberrant expressed targets. + write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) +} + + +# TODO: investigate memory usage and if possible reduced / parallel. +main <- function(query, ref, output_path){ + query_data <- get_input(query) + ref_data <- get_input(ref) + all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) + output <- run_outrider(all_counts) + save_output(output_path, output, ref_data$sampleIDs) +} + + +main(args$query, args$ref, args$output_path) \ No newline at end of file From 369cf704225a508e927080954e0cfd9d3e0b5e67 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Thu, 2 Nov 2023 08:10:17 +0100 Subject: [PATCH 03/31] Dockerfile outrider for additional R packages --- Outrider/1.20.0/Dockerfile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 Outrider/1.20.0/Dockerfile diff --git a/Outrider/1.20.0/Dockerfile b/Outrider/1.20.0/Dockerfile new file mode 100644 index 00000000..d4510780 --- /dev/null +++ b/Outrider/1.20.0/Dockerfile @@ -0,0 +1,19 @@ +################## BASE IMAGE ###################### +FROM --platform=linux/amd64 quay.io/biocontainers/bioconductor-outrider:1.18.1--r43hf17093f_0 + +################## METADATA ###################### +LABEL base_image="outrider:1.18.1--r43hf17093f_0" +LABEL version="1.18.1--r43hf17093f_0" +LABEL software="dependencies_custom_outrider" +LABEL about.summary="" +LABEL about.home="https://github.com/gagneurlab/OUTRIDER" +LABEL about.documentation="https://github.com/gagneurlab/OUTRIDER" +# LABEL extra.binaries="R" + +################## INSTALLATIONS ###################### + +RUN R -e "install.packages('argparse', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('tools', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('tibble', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('readr', dependencies=TRUE, repos='http://cran.rstudio.com/')" +RUN R -e "install.packages('dplyr', dependencies=TRUE, repos='http://cran.rstudio.com/')" From 0e46a46a6d3fab81ee413eb2e38ec9bc198ca330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Clonnekevanbrussel=E2=80=9D?= <“lonneke.vanbrussel@uvic.cat”> Date: Fri, 5 Jan 2024 11:25:33 +0100 Subject: [PATCH 04/31] changes NextflowModules to test and run pipeline --- Outrider/1.20.0/outrider.R | 110 +++++++++++++------------- Outrider/1.20.0/outrider_050123.R | 95 +++++++++++++++++++++++ main.nf | 123 ++++++++++++++++++++++++++++++ modules.json | 77 +++++++++++++++++++ nextflow_schema.json | 97 +++++++++++++++++++++++ run_nextflow.sh | 58 ++++++++++++++ 6 files changed, 508 insertions(+), 52 deletions(-) create mode 100644 Outrider/1.20.0/outrider_050123.R create mode 100644 main.nf create mode 100644 modules.json create mode 100644 nextflow_schema.json create mode 100644 run_nextflow.sh diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index f249fda5..2cd08598 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -11,85 +11,91 @@ suppressPackageStartupMessages(library("dplyr")) # Argument parser parser <- ArgumentParser(description = "Process some integers") parser$add_argument("query", metavar = "query_input_files", nargs = "+", - help = "Files or directories containing the query input files.") + help = "Files or directories containing the query input files.") parser$add_argument("output_path", metavar = "output_path", nargs = "+", - help = "Path where output of OUTRIDER will be stored.") + help = "Path where output of OUTRIDER will be stored.") parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", - help = "Files or directories containing the reference input files.") + help = "Files or directories containing the reference input files.") args <- parser$parse_args() read_input_files <- function(input){ - sampleIDs <- c() - count_tables <- lapply(input, function(f) { - input_ext <- tools::file_ext(f) - if(input_ext == "Rds") { - rse <- readRDS(f) # RangedSummarizedExperiment - sampleIDs <- append(sampleIDs, colnames(rse)) - return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) - } else if (input_ext %in% c("txt", "tsv")) { - count_table <- read_delim(f, show_col_types=FALSE) - sampleIDs <- append(sampleIDs, colnames(count_table)) - return(count_table) - } else { - stop("Input file extension is not supported.") - } - }) - return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) + sampleIDs <- c() + count_tables <- lapply(input, function(f) { + input_ext <- tools::file_ext(f) + if(input_ext == "Rds") { + rse <- readRDS(f) # RangedSummarizedExperiment + sampleIDs <- append(sampleIDs, colnames(rse)) + return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) + } else if (input_ext %in% c("txt", "tsv")) { + count_table <- read_delim(f, show_col_types=FALSE, skip=1) + ct <- count_table[,c(1,7)] + names(ct)[1] <- "rownames" + sampleIDs <- append(sampleIDs, colnames(count_table)) + return(ct) + } else { + stop("Input file extension is not supported.") + } + }) + return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) } get_input <- function(input){ - # If directories are provided - if(all(sapply(input, function(x) dir.exists(x)))) { - retrieved_files <- sapply(input, function(d){ - list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) - }) - } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. - retrieved_files <- input - } else { # If both dir and files are provided, or different tyes (character, int etc) - stop("Input is neither dir or file.") - } - - return(read_input_files(retrieved_files)) + # If directories are provided + if(all(sapply(input, function(x) dir.exists(x)))) { + retrieved_files <- sapply(input, function(d){ + list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) + }) + } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. + retrieved_files <- input + } else { # If both dir and files are provided, or different tyes (character, int etc) + stop("Input is neither dir or file.") + } + + return(read_input_files(retrieved_files)) } merge_count_tables <- function(lst_query, lst_ref){ - # merge count tables together. - lst_count_tables <- c(lst_query, lst_ref) - all_counts <- lst_count_tables %>% - Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) + # merge count tables together. + lst_count_tables <- c(lst_query, lst_ref) + all_counts <- lst_count_tables %>% + Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) } run_outrider <- function(all_counts) { - # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. - # filter expression - ods <- OutriderDataSet(all_counts) - filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) - out <- OUTRIDER(filtered) - return(out) + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + # filter expression + all_counts_matrix <- as.matrix(all_counts)[,-1] + mode(all_counts_matrix) <- "integer" + rownames(all_counts_matrix) <- all_counts$rownames + + ods <- OutriderDataSet(countData=all_counts_matrix) + filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) + out <- OUTRIDER(filtered) + return(out) } save_output <- function(out_path, out_outrider, ref_samples, padj_thres=0.05, zscore_thres=0, all=True) { - res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=all)) - # Reference samples are excluded from final results. - query_res <- filter(res, !(sampleID %in% ref_samples)) - # Write output table with aberrant expressed targets. - write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) +# res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) + res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) + # Reference samples are excluded from final results. + query_res <- filter(res, !(sampleID %in% ref_samples)) + # Write output table with aberrant expressed targets. + write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) } # TODO: investigate memory usage and if possible reduced / parallel. main <- function(query, ref, output_path){ - query_data <- get_input(query) - ref_data <- get_input(ref) - all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) - output <- run_outrider(all_counts) - save_output(output_path, output, ref_data$sampleIDs) + query_data <- get_input(query) + ref_data <- get_input(ref) + all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) + output <- run_outrider(all_counts) + save_output(output_path, output, ref_data$sampleIDs) } - main(args$query, args$ref, args$output_path) \ No newline at end of file diff --git a/Outrider/1.20.0/outrider_050123.R b/Outrider/1.20.0/outrider_050123.R new file mode 100644 index 00000000..f249fda5 --- /dev/null +++ b/Outrider/1.20.0/outrider_050123.R @@ -0,0 +1,95 @@ +#!/usr/bin/env Rscript + +# Import statements, alphabetic order of main package. +suppressPackageStartupMessages(library("argparse")) +suppressPackageStartupMessages(library("OUTRIDER")) +suppressPackageStartupMessages(library("tools")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("dplyr")) + +# Argument parser +parser <- ArgumentParser(description = "Process some integers") +parser$add_argument("query", metavar = "query_input_files", nargs = "+", + help = "Files or directories containing the query input files.") +parser$add_argument("output_path", metavar = "output_path", nargs = "+", + help = "Path where output of OUTRIDER will be stored.") +parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", + help = "Files or directories containing the reference input files.") +args <- parser$parse_args() + + +read_input_files <- function(input){ + sampleIDs <- c() + count_tables <- lapply(input, function(f) { + input_ext <- tools::file_ext(f) + if(input_ext == "Rds") { + rse <- readRDS(f) # RangedSummarizedExperiment + sampleIDs <- append(sampleIDs, colnames(rse)) + return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) + } else if (input_ext %in% c("txt", "tsv")) { + count_table <- read_delim(f, show_col_types=FALSE) + sampleIDs <- append(sampleIDs, colnames(count_table)) + return(count_table) + } else { + stop("Input file extension is not supported.") + } + }) + return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) +} + + +get_input <- function(input){ + # If directories are provided + if(all(sapply(input, function(x) dir.exists(x)))) { + retrieved_files <- sapply(input, function(d){ + list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) + }) + } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. + retrieved_files <- input + } else { # If both dir and files are provided, or different tyes (character, int etc) + stop("Input is neither dir or file.") + } + + return(read_input_files(retrieved_files)) +} + + +merge_count_tables <- function(lst_query, lst_ref){ + # merge count tables together. + lst_count_tables <- c(lst_query, lst_ref) + all_counts <- lst_count_tables %>% + Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) +} + + +run_outrider <- function(all_counts) { + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + # filter expression + ods <- OutriderDataSet(all_counts) + filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) + out <- OUTRIDER(filtered) + return(out) +} + + +save_output <- function(out_path, out_outrider, ref_samples, padj_thres=0.05, zscore_thres=0, all=True) { + res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=all)) + # Reference samples are excluded from final results. + query_res <- filter(res, !(sampleID %in% ref_samples)) + # Write output table with aberrant expressed targets. + write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) +} + + +# TODO: investigate memory usage and if possible reduced / parallel. +main <- function(query, ref, output_path){ + query_data <- get_input(query) + ref_data <- get_input(ref) + all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) + output <- run_outrider(all_counts) + save_output(output_path, output, ref_data$sampleIDs) +} + + +main(args$query, args$ref, args$output_path) \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..77b517dc --- /dev/null +++ b/main.nf @@ -0,0 +1,123 @@ +#!/usr/bin/env nextflow +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + UMCUGenetics/DxNextflowRNA +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Github : https://github.com/UMCUGenetics/DxNextflowRNA +---------------------------------------------------------------------------------------- +*/ + +nextflow.enable.dsl = 2 + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Validate parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { validateParameters; } from 'plugin/nf-validation' +validateParameters() + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Import modules/subworkflows +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +/*include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS } from './subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main' + +include { TRIMGALORE } from './modules/nf-core/trimgalore/main' +include { FASTQC } from './modules/nf-core/fastqc/main' +include { MULTIQC } from './modules/nf-core/multiqc/main' +include { SAMTOOLS_INDEX } from './modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE } from './modules/nf-core/samtools/merge/main' +include { STAR_ALIGN } from './modules/nf-core/star/align/main' +include { SUBREAD_FEATURECOUNTS } from './modules/nf-core/subread/featurecounts/main' + +*/ +include { OUTRIDER } from './Outrider/1.20.0/main' +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Main workflow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow { + // Reference file channels +/* ch_star_index = Channel.fromPath(params.star_index).map {star_index -> [star_index.getSimpleName(), star_index] } + ch_gtf = Channel.fromPath(params.gtf).map { gtf -> [gtf.getSimpleName(), gtf] } + + // Input channel + ch_fastq = Channel.fromFilePairs("$params.input/*_R{1,2}_001.fastq.gz") + .map { + meta, fastq -> + def fmeta = [:] + // Set meta.id + fmeta.id = meta + // Set meta.single_end + if (fastq.size() == 1) { + fmeta.single_end = true + } else { + fmeta.single_end = false + } + [ fmeta, fastq ] + } + + // Trim, Alignment, FeatureCounts + TRIMGALORE( + ch_fastq + ) + + STAR_ALIGN( + TRIMGALORE.out.reads, + ch_star_index.first(), + ch_gtf.first(), + false, + 'illumina', + 'UMCU Genetics' + ) + + SAMTOOLS_MERGE( + STAR_ALIGN.out.bam_sorted.map { + meta, bam -> + new_id = meta.id.split('_')[0] + [ meta + [id: new_id], bam ] + }.groupTuple(), + [ [ id:'null' ], []], + [ [ id:'null' ], []], + ) + + SAMTOOLS_INDEX ( SAMTOOLS_MERGE.out.bam ) + + SAMTOOLS_MERGE.out.bam + .join(SAMTOOLS_INDEX.out.bai) + .set { ch_bam_bai } + + BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS( + ch_bam_bai, + true + ) + + SUBREAD_FEATURECOUNTS( + BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS.out.bam.map{ + meta, bam -> [ meta, bam, params.gtf ] + } + ) + + // QC + FASTQC(ch_fastq) + + // MultiQC + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) + MULTIQC( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + Channel.empty().toList(), + Channel.empty().toList() + ) +*/ + // Input channel + ch_outrider = Channel.fromPath("$params.input/featureCounts/*_.txt") + //Outrider +} \ No newline at end of file diff --git a/modules.json b/modules.json new file mode 100644 index 00000000..208f19bd --- /dev/null +++ b/modules.json @@ -0,0 +1,77 @@ +{ + "name": "UMCUGenetics/DxNextflowRNA", + "homePage": "https://github.com/UMCUGenetics/DxNextflowRNA", + "repos": { + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "fastqc": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, + "multiqc": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["bam_dedup_stats_samtools_umitools"] + }, + "samtools/merge": { + "branch": "master", + "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870", + "installed_by": ["modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["bam_stats_samtools"] + }, + "star/align": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, + "subread/featurecounts": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, + "umitools/dedup": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["bam_dedup_stats_samtools_umitools"] + } + } + }, + "subworkflows": { + "nf-core": { + "bam_dedup_stats_samtools_umitools": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["subworkflows"] + }, + "bam_stats_samtools": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["bam_dedup_stats_samtools_umitools"] + } + } + } + } + } + } + \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 00000000..c5db6a24 --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,97 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/UMCUGenetics/DxNextflowRNA/master/nextflow_schema.json", + "title": "UMCUGenetics/DxNextflowRNA pipeline parameters", + "description": "UMCU Genetics RNA Workflow", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "directory-path", + "exists": true, + "description": "Path to input data.", + "fa_icon": "fas fa-folder-open" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved.", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + } + } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Path of genome reference.", + "fa_icon": "fas fa-book", + "help_text": "" + }, + "gtf": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.gtf(\\.gz)?$", + "description": "Path to GTF annotation file.", + "fa_icon": "fas fa-code-branch", + "help_text": "This parameter is *mandatory* if `--genome` is not specified." + }, + "star_index": { + "type": "string", + "format": "path", + "exists": true, + "fa_icon": "fas fa-bezier-curve", + "description": "Path to directory or tar.gz archive for pre-built STAR index." + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "properties": { + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "hidden": true + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/generic_options" + } + ] + } + \ No newline at end of file diff --git a/run_nextflow.sh b/run_nextflow.sh new file mode 100644 index 00000000..c0aafaa7 --- /dev/null +++ b/run_nextflow.sh @@ -0,0 +1,58 @@ +#!/bin/bash +set -euo pipefail + +workflow_path='/hpc/diaggen/users/lonneke/github/DxNextflowRNA' + +# Set input and output dirs +input=`realpath $1` +output=`realpath $2` +email=$3 + +mkdir -p $output && cd $output +mkdir -p log + +if ! { [ -f 'workflow.running' ] || [ -f 'workflow.done' ] || [ -f 'workflow.failed' ]; }; then +touch workflow.running +echo "check directory for output: ${output}" + +export JAVA_HOME='/hpc/diaggen/software/tools/jdk-18.0.2.1/' # change java version +export NXF_JAVA_HOME='/hpc/diaggen/software/tools/jdk-18.0.2.1/' # change java vesion of nextflow + +sbatch < Date: Fri, 5 Jan 2024 11:45:45 +0100 Subject: [PATCH 05/31] changed links to NextflowModules --- run_nextflow.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_nextflow.sh b/run_nextflow.sh index c0aafaa7..69ad309f 100644 --- a/run_nextflow.sh +++ b/run_nextflow.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -workflow_path='/hpc/diaggen/users/lonneke/github/DxNextflowRNA' +workflow_path='/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules' # Set input and output dirs input=`realpath $1` @@ -31,8 +31,8 @@ sbatch < Date: Fri, 5 Jan 2024 12:03:22 +0100 Subject: [PATCH 06/31] added nextflow.config --- nextflow.config | 101 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 nextflow.config diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..7952bf9d --- /dev/null +++ b/nextflow.config @@ -0,0 +1,101 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + UMCUGenetics/DxNextflowRNA Nextflow config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +manifest { + name = 'UMCUGenetics/DxNextflowRNA' + author = """UMCU Genetics""" + homePage = 'https://github.com/UMCUGenetics/DxNextflowRNA' + description = """UMCU Genetics RNA Workflow""" + mainScript = 'main.nf' + nextflowVersion = '!>=23.04.0' + version = '1.0dev' + doi = '' +} + +// Nextflow plugins +plugins { + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} + +// Global default params, used in configs +params { + genome = '/hpc/diaggen/data/databases/ref_genomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta' + star_index = '/hpc/diaggen/data/databases/STAR_ref_genome_index/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set_2.7.9a' + gtf = '/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf' + + // Input options + input = null + + // Output options + outdir = null + publish_dir_mode = 'link' + email = null + + // cluster_options = "--mail-user $params.email --mail-type FAIL --account=diaggen" +} + +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] + +// Load base.config +includeConfig 'conf/base.config' + +// Load modules.config for module specific options +includeConfig 'conf/modules.config' + +report { + enabled = true + file = "$params.outdir/log/nextflow_report.html" + overwrite = true +} + +trace { + enabled = true + file = "$params.outdir/log/nextflow_trace.txt" + overwrite = true +} + +timeline { + enabled = true + file = "$params.outdir/log/nextflow_timeline.html" + overwrite = true +} + +profiles { + + slurm { + process { + executor = 'slurm' + queue = 'cpu' + + errorStrategy = 'retry' + maxRetries = 1 + } + + singularity { + enabled = true + runOptions = '-B /hpc:/hpc -B $TMPDIR:$TMPDIR' + autoMounts = true + cacheDir = '/hpc/diaggen/software/singularity_cache' + } + + executor { + queueSize = 1000 + pollInterval = '1min' + queueStatInterval = '5min' + submitRatelimit = '10sec' + } + + mail { + smtp.host = 'localhost' + } + } + + mac { + docker.enabled = true + docker.runOptions = '-v /Users:/Users' + } +} \ No newline at end of file From 8918ddc5c48636b4b6125b379d7293af3282e892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Clonnekevanbrussel=E2=80=9D?= <“lonneke.vanbrussel@uvic.cat”> Date: Fri, 5 Jan 2024 12:17:39 +0100 Subject: [PATCH 07/31] added conf folder witth config files --- conf/base.config | 75 +++++++++++++++++++++++++++++++++++++++++++++ conf/modules.config | 59 +++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 conf/base.config create mode 100644 conf/modules.config diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 00000000..32944612 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,75 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + UMCUGenetics/DxNextflowRNA Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + + // Defaults for all processes, based on HPC defaults + cpus = { 2 * task.attempt } + memory = { 10.GB * task.attempt } + time = { 1.h * task.attempt } + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + executor = 'slurm' + queue = 'cpu' + // clusterOptions = "$params.cluster_options" + + // Process-specific resource requirements + // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { 2 } + memory = { 8.GB * task.attempt } + time = { 4.h * task.attempt } + } + withLabel:process_low { + cpus = { 2 * task.attempt } + memory = { 16.GB * task.attempt } + time = { 4.h * task.attempt } + } + withLabel:process_medium { + cpus = { 6 * task.attempt } + memory = { 32.GB * task.attempt } + time = { 8.h * task.attempt } + } + withLabel:process_high { + cpus = { 12 * task.attempt } + memory = { 64.GB * task.attempt } + time = { 16.h * task.attempt } + } + withLabel:process_long { + time = { 20.h * task.attempt } + } + withLabel:process_high_memory { + memory = { 128.GB * task.attempt } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } +} + +singularity { + enabled = true + runOptions = '-B /hpc:/hpc -B $TMPDIR:$TMPDIR' + autoMounts = true + cacheDir = '/hpc/diaggen/projects/woc/rna/container' +} + +executor { + queueSize = 1000 + pollInterval = '1min' + queueStatInterval = '5min' + submitRatelimit = '10sec' +} + +mail { + smtp.host = 'localhost' +} \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config new file mode 100644 index 00000000..81083f5b --- /dev/null +++ b/conf/modules.config @@ -0,0 +1,59 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + withName: FASTQC { + ext.args = '--quiet' + publishDir = [ + [path: "$params.outdir/QC/", mode: params.publish_dir_mode, pattern: '*_{fastqc.html}'] + ] + cpus = 2 + memory = { 8.GB * task.attempt } + time = { 15.m * task.attempt } + } + + withName: SAMTOOLS_MERGE { + cpus = { 8 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 1.h * task.attempt } + } + + withName: STAR_ALIGN { + cpus = { 12 * task.attempt } + memory = { 40.GB * task.attempt } + time = { 1.h * task.attempt } + ext.args = '--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate' + } + + withName: MULTIQC { + publishDir = [ + path: { "${params.outdir}/QC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: UMITOOLS_DEDUP { + ext.args = "--umi-separator=:" + ext.prefix = { "${meta.id}.markdup" } + publishDir = [ + [path: "$params.outdir/bam_files/", mode: params.publish_dir_mode, pattern: '*.ba*'] + ] + clusterOptions = "--gres=tmpspace:50G" + } + + withName: SUBREAD_FEATURECOUNTS { + publishDir = [ + [path: "$params.outdir/feature_counts/", mode: params.publish_dir_mode, pattern: '*featureCounts.txt*'], + ] + } +} \ No newline at end of file From 5a9e2283415b5fd50ad964ee1d4a752014669f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Clonnekevanbrussel=E2=80=9D?= <“lonneke.vanbrussel@uvic.cat”> Date: Fri, 5 Jan 2024 12:51:57 +0100 Subject: [PATCH 08/31] added main.nf and yml in outrider --- Outrider/1.20.0/environment.yml | 6 ++++++ Outrider/1.20.0/main.nf | 28 ++++++++++++++++++++++++++++ Outrider/1.20.0/meta.yml | 26 ++++++++++++++++++++++++++ main.nf | 8 +++++++- 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 Outrider/1.20.0/environment.yml create mode 100644 Outrider/1.20.0/main.nf create mode 100644 Outrider/1.20.0/meta.yml diff --git a/Outrider/1.20.0/environment.yml b/Outrider/1.20.0/environment.yml new file mode 100644 index 00000000..f52a53a0 --- /dev/null +++ b/Outrider/1.20.0/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf new file mode 100644 index 00000000..a33ca637 --- /dev/null +++ b/Outrider/1.20.0/main.nf @@ -0,0 +1,28 @@ + +process OUTRIDER { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" +/* container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }"*/ + + input: + tuple val(meta), path(counts) + + output: + tuple val(meta), path("*.tsv"), emit: tsv +// path "versions.yml" , emit: versions + + /* when: + task.ext.when == null || task.ext.when*/ + + script: + """ + printf "%s %s\\n" $rename_to | while read old_name new_name; do + [ -f "\${new_name}" ] || ln -s \$old_name \$new_name + done + """ + +} \ No newline at end of file diff --git a/Outrider/1.20.0/meta.yml b/Outrider/1.20.0/meta.yml new file mode 100644 index 00000000..54155e70 --- /dev/null +++ b/Outrider/1.20.0/meta.yml @@ -0,0 +1,26 @@ +name: outrider +description: Run Outrider on feature counts +keywords: + - outrider + - abberant expression +tools: + - outrider: + description: | + homepage: + documentation: + licence: +input: + - meta: + type: map + description: | + Groovy Map containing sample information +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] +authors: + - +maintainers: + - diff --git a/main.nf b/main.nf index 77b517dc..9c78544f 100644 --- a/main.nf +++ b/main.nf @@ -118,6 +118,12 @@ workflow { ) */ // Input channel - ch_outrider = Channel.fromPath("$params.input/featureCounts/*_.txt") + ch_outrider_in = Channel.fromPath("$params.input/feature_counts/*CHX*.txt") + ch_outrider_ref = Channel.fromPath("$params.input/feature_counts/*Cntrl*.txt") + //Outrider + OUTRIDER( + ch_outrider_in, + ch_outrider_ref + ) } \ No newline at end of file From 6dc82eb763a8aaf03921a2357d59fde6ac019ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Clonnekevanbrussel=E2=80=9D?= <“lonneke.vanbrussel@uvic.cat”> Date: Fri, 5 Jan 2024 13:47:46 +0100 Subject: [PATCH 09/31] edit outrider main.nf --- Outrider/1.20.0/main.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index a33ca637..64a23ddd 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -4,12 +4,12 @@ process OUTRIDER { label 'process_medium' conda "${moduleDir}/environment.yml" -/* container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }"*/ + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-outrider:1.20.0--r43hf17093f_0 ' : + 'biocontainers/bioconductor-outrider:1.20.0--r43hf17093f_0 ' }" input: - tuple val(meta), path(counts) + tuple val(meta), path(counts), path(ref) output: tuple val(meta), path("*.tsv"), emit: tsv @@ -20,9 +20,9 @@ process OUTRIDER { script: """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done + echo $counts + echo $ref + Rscript "outrider.R" "$counts" "/out" -r "$ref" """ } \ No newline at end of file From 03407c48458ada50de95b82757073ea525f65983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Clonnekevanbrussel=E2=80=9D?= <“lonneke.vanbrussel@uvic.cat”> Date: Tue, 9 Jan 2024 16:31:27 +0100 Subject: [PATCH 10/31] Delete all test files --- Outrider/1.20.0/outrider_050123.R | 95 ---------------------- conf/base.config | 75 ----------------- conf/modules.config | 59 -------------- main.nf | 129 ------------------------------ modules.json | 77 ------------------ nextflow.config | 101 ----------------------- nextflow_schema.json | 97 ---------------------- run_nextflow.sh | 58 -------------- 8 files changed, 691 deletions(-) delete mode 100644 Outrider/1.20.0/outrider_050123.R delete mode 100644 conf/base.config delete mode 100644 conf/modules.config delete mode 100644 main.nf delete mode 100644 modules.json delete mode 100644 nextflow.config delete mode 100644 nextflow_schema.json delete mode 100644 run_nextflow.sh diff --git a/Outrider/1.20.0/outrider_050123.R b/Outrider/1.20.0/outrider_050123.R deleted file mode 100644 index f249fda5..00000000 --- a/Outrider/1.20.0/outrider_050123.R +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env Rscript - -# Import statements, alphabetic order of main package. -suppressPackageStartupMessages(library("argparse")) -suppressPackageStartupMessages(library("OUTRIDER")) -suppressPackageStartupMessages(library("tools")) -suppressPackageStartupMessages(library("tibble")) -suppressPackageStartupMessages(library("readr")) -suppressPackageStartupMessages(library("dplyr")) - -# Argument parser -parser <- ArgumentParser(description = "Process some integers") -parser$add_argument("query", metavar = "query_input_files", nargs = "+", - help = "Files or directories containing the query input files.") -parser$add_argument("output_path", metavar = "output_path", nargs = "+", - help = "Path where output of OUTRIDER will be stored.") -parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", - help = "Files or directories containing the reference input files.") -args <- parser$parse_args() - - -read_input_files <- function(input){ - sampleIDs <- c() - count_tables <- lapply(input, function(f) { - input_ext <- tools::file_ext(f) - if(input_ext == "Rds") { - rse <- readRDS(f) # RangedSummarizedExperiment - sampleIDs <- append(sampleIDs, colnames(rse)) - return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) - } else if (input_ext %in% c("txt", "tsv")) { - count_table <- read_delim(f, show_col_types=FALSE) - sampleIDs <- append(sampleIDs, colnames(count_table)) - return(count_table) - } else { - stop("Input file extension is not supported.") - } - }) - return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) -} - - -get_input <- function(input){ - # If directories are provided - if(all(sapply(input, function(x) dir.exists(x)))) { - retrieved_files <- sapply(input, function(d){ - list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) - }) - } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. - retrieved_files <- input - } else { # If both dir and files are provided, or different tyes (character, int etc) - stop("Input is neither dir or file.") - } - - return(read_input_files(retrieved_files)) -} - - -merge_count_tables <- function(lst_query, lst_ref){ - # merge count tables together. - lst_count_tables <- c(lst_query, lst_ref) - all_counts <- lst_count_tables %>% - Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) -} - - -run_outrider <- function(all_counts) { - # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. - # filter expression - ods <- OutriderDataSet(all_counts) - filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) - out <- OUTRIDER(filtered) - return(out) -} - - -save_output <- function(out_path, out_outrider, ref_samples, padj_thres=0.05, zscore_thres=0, all=True) { - res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=all)) - # Reference samples are excluded from final results. - query_res <- filter(res, !(sampleID %in% ref_samples)) - # Write output table with aberrant expressed targets. - write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) -} - - -# TODO: investigate memory usage and if possible reduced / parallel. -main <- function(query, ref, output_path){ - query_data <- get_input(query) - ref_data <- get_input(ref) - all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) - output <- run_outrider(all_counts) - save_output(output_path, output, ref_data$sampleIDs) -} - - -main(args$query, args$ref, args$output_path) \ No newline at end of file diff --git a/conf/base.config b/conf/base.config deleted file mode 100644 index 32944612..00000000 --- a/conf/base.config +++ /dev/null @@ -1,75 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - UMCUGenetics/DxNextflowRNA Nextflow base config file -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -process { - - // Defaults for all processes, based on HPC defaults - cpus = { 2 * task.attempt } - memory = { 10.GB * task.attempt } - time = { 1.h * task.attempt } - - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 - maxErrors = '-1' - - executor = 'slurm' - queue = 'cpu' - // clusterOptions = "$params.cluster_options" - - // Process-specific resource requirements - // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors - withLabel:process_single { - cpus = { 2 } - memory = { 8.GB * task.attempt } - time = { 4.h * task.attempt } - } - withLabel:process_low { - cpus = { 2 * task.attempt } - memory = { 16.GB * task.attempt } - time = { 4.h * task.attempt } - } - withLabel:process_medium { - cpus = { 6 * task.attempt } - memory = { 32.GB * task.attempt } - time = { 8.h * task.attempt } - } - withLabel:process_high { - cpus = { 12 * task.attempt } - memory = { 64.GB * task.attempt } - time = { 16.h * task.attempt } - } - withLabel:process_long { - time = { 20.h * task.attempt } - } - withLabel:process_high_memory { - memory = { 128.GB * task.attempt } - } - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } -} - -singularity { - enabled = true - runOptions = '-B /hpc:/hpc -B $TMPDIR:$TMPDIR' - autoMounts = true - cacheDir = '/hpc/diaggen/projects/woc/rna/container' -} - -executor { - queueSize = 1000 - pollInterval = '1min' - queueStatInterval = '5min' - submitRatelimit = '10sec' -} - -mail { - smtp.host = 'localhost' -} \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config deleted file mode 100644 index 81083f5b..00000000 --- a/conf/modules.config +++ /dev/null @@ -1,59 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. ----------------------------------------------------------------------------------------- -*/ - -process { - withName: FASTQC { - ext.args = '--quiet' - publishDir = [ - [path: "$params.outdir/QC/", mode: params.publish_dir_mode, pattern: '*_{fastqc.html}'] - ] - cpus = 2 - memory = { 8.GB * task.attempt } - time = { 15.m * task.attempt } - } - - withName: SAMTOOLS_MERGE { - cpus = { 8 * task.attempt } - memory = { 8.GB * task.attempt } - time = { 1.h * task.attempt } - } - - withName: STAR_ALIGN { - cpus = { 12 * task.attempt } - memory = { 40.GB * task.attempt } - time = { 1.h * task.attempt } - ext.args = '--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate' - } - - withName: MULTIQC { - publishDir = [ - path: { "${params.outdir}/QC" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: UMITOOLS_DEDUP { - ext.args = "--umi-separator=:" - ext.prefix = { "${meta.id}.markdup" } - publishDir = [ - [path: "$params.outdir/bam_files/", mode: params.publish_dir_mode, pattern: '*.ba*'] - ] - clusterOptions = "--gres=tmpspace:50G" - } - - withName: SUBREAD_FEATURECOUNTS { - publishDir = [ - [path: "$params.outdir/feature_counts/", mode: params.publish_dir_mode, pattern: '*featureCounts.txt*'], - ] - } -} \ No newline at end of file diff --git a/main.nf b/main.nf deleted file mode 100644 index 9c78544f..00000000 --- a/main.nf +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env nextflow -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - UMCUGenetics/DxNextflowRNA -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/UMCUGenetics/DxNextflowRNA ----------------------------------------------------------------------------------------- -*/ - -nextflow.enable.dsl = 2 - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Validate parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { validateParameters; } from 'plugin/nf-validation' -validateParameters() - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Import modules/subworkflows -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -/*include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS } from './subworkflows/nf-core/bam_dedup_stats_samtools_umitools/main' - -include { TRIMGALORE } from './modules/nf-core/trimgalore/main' -include { FASTQC } from './modules/nf-core/fastqc/main' -include { MULTIQC } from './modules/nf-core/multiqc/main' -include { SAMTOOLS_INDEX } from './modules/nf-core/samtools/index/main' -include { SAMTOOLS_MERGE } from './modules/nf-core/samtools/merge/main' -include { STAR_ALIGN } from './modules/nf-core/star/align/main' -include { SUBREAD_FEATURECOUNTS } from './modules/nf-core/subread/featurecounts/main' - -*/ -include { OUTRIDER } from './Outrider/1.20.0/main' -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Main workflow -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow { - // Reference file channels -/* ch_star_index = Channel.fromPath(params.star_index).map {star_index -> [star_index.getSimpleName(), star_index] } - ch_gtf = Channel.fromPath(params.gtf).map { gtf -> [gtf.getSimpleName(), gtf] } - - // Input channel - ch_fastq = Channel.fromFilePairs("$params.input/*_R{1,2}_001.fastq.gz") - .map { - meta, fastq -> - def fmeta = [:] - // Set meta.id - fmeta.id = meta - // Set meta.single_end - if (fastq.size() == 1) { - fmeta.single_end = true - } else { - fmeta.single_end = false - } - [ fmeta, fastq ] - } - - // Trim, Alignment, FeatureCounts - TRIMGALORE( - ch_fastq - ) - - STAR_ALIGN( - TRIMGALORE.out.reads, - ch_star_index.first(), - ch_gtf.first(), - false, - 'illumina', - 'UMCU Genetics' - ) - - SAMTOOLS_MERGE( - STAR_ALIGN.out.bam_sorted.map { - meta, bam -> - new_id = meta.id.split('_')[0] - [ meta + [id: new_id], bam ] - }.groupTuple(), - [ [ id:'null' ], []], - [ [ id:'null' ], []], - ) - - SAMTOOLS_INDEX ( SAMTOOLS_MERGE.out.bam ) - - SAMTOOLS_MERGE.out.bam - .join(SAMTOOLS_INDEX.out.bai) - .set { ch_bam_bai } - - BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS( - ch_bam_bai, - true - ) - - SUBREAD_FEATURECOUNTS( - BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS.out.bam.map{ - meta, bam -> [ meta, bam, params.gtf ] - } - ) - - // QC - FASTQC(ch_fastq) - - // MultiQC - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) - MULTIQC( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - Channel.empty().toList(), - Channel.empty().toList() - ) -*/ - // Input channel - ch_outrider_in = Channel.fromPath("$params.input/feature_counts/*CHX*.txt") - ch_outrider_ref = Channel.fromPath("$params.input/feature_counts/*Cntrl*.txt") - - //Outrider - OUTRIDER( - ch_outrider_in, - ch_outrider_ref - ) -} \ No newline at end of file diff --git a/modules.json b/modules.json deleted file mode 100644 index 208f19bd..00000000 --- a/modules.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "name": "UMCUGenetics/DxNextflowRNA", - "homePage": "https://github.com/UMCUGenetics/DxNextflowRNA", - "repos": { - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "fastqc": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] - }, - "multiqc": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] - }, - "samtools/flagstat": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["bam_stats_samtools"] - }, - "samtools/idxstats": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["bam_stats_samtools"] - }, - "samtools/index": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["bam_dedup_stats_samtools_umitools"] - }, - "samtools/merge": { - "branch": "master", - "git_sha": "e7ce60acc8a33fa17429e966364657a63016e870", - "installed_by": ["modules"] - }, - "samtools/stats": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["bam_stats_samtools"] - }, - "star/align": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] - }, - "subread/featurecounts": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] - }, - "umitools/dedup": { - "branch": "master", - "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["bam_dedup_stats_samtools_umitools"] - } - } - }, - "subworkflows": { - "nf-core": { - "bam_dedup_stats_samtools_umitools": { - "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] - }, - "bam_stats_samtools": { - "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["bam_dedup_stats_samtools_umitools"] - } - } - } - } - } - } - \ No newline at end of file diff --git a/nextflow.config b/nextflow.config deleted file mode 100644 index 7952bf9d..00000000 --- a/nextflow.config +++ /dev/null @@ -1,101 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - UMCUGenetics/DxNextflowRNA Nextflow config file -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -manifest { - name = 'UMCUGenetics/DxNextflowRNA' - author = """UMCU Genetics""" - homePage = 'https://github.com/UMCUGenetics/DxNextflowRNA' - description = """UMCU Genetics RNA Workflow""" - mainScript = 'main.nf' - nextflowVersion = '!>=23.04.0' - version = '1.0dev' - doi = '' -} - -// Nextflow plugins -plugins { - id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet -} - -// Global default params, used in configs -params { - genome = '/hpc/diaggen/data/databases/ref_genomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta' - star_index = '/hpc/diaggen/data/databases/STAR_ref_genome_index/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set_2.7.9a' - gtf = '/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf' - - // Input options - input = null - - // Output options - outdir = null - publish_dir_mode = 'link' - email = null - - // cluster_options = "--mail-user $params.email --mail-type FAIL --account=diaggen" -} - -// Capture exit codes from upstream processes when piping -process.shell = ['/bin/bash', '-euo', 'pipefail'] - -// Load base.config -includeConfig 'conf/base.config' - -// Load modules.config for module specific options -includeConfig 'conf/modules.config' - -report { - enabled = true - file = "$params.outdir/log/nextflow_report.html" - overwrite = true -} - -trace { - enabled = true - file = "$params.outdir/log/nextflow_trace.txt" - overwrite = true -} - -timeline { - enabled = true - file = "$params.outdir/log/nextflow_timeline.html" - overwrite = true -} - -profiles { - - slurm { - process { - executor = 'slurm' - queue = 'cpu' - - errorStrategy = 'retry' - maxRetries = 1 - } - - singularity { - enabled = true - runOptions = '-B /hpc:/hpc -B $TMPDIR:$TMPDIR' - autoMounts = true - cacheDir = '/hpc/diaggen/software/singularity_cache' - } - - executor { - queueSize = 1000 - pollInterval = '1min' - queueStatInterval = '5min' - submitRatelimit = '10sec' - } - - mail { - smtp.host = 'localhost' - } - } - - mac { - docker.enabled = true - docker.runOptions = '-v /Users:/Users' - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json deleted file mode 100644 index c5db6a24..00000000 --- a/nextflow_schema.json +++ /dev/null @@ -1,97 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/UMCUGenetics/DxNextflowRNA/master/nextflow_schema.json", - "title": "UMCUGenetics/DxNextflowRNA pipeline parameters", - "description": "UMCU Genetics RNA Workflow", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], - "properties": { - "input": { - "type": "string", - "format": "directory-path", - "exists": true, - "description": "Path to input data.", - "fa_icon": "fas fa-folder-open" - }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved.", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - } - } - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Path of genome reference.", - "fa_icon": "fas fa-book", - "help_text": "" - }, - "gtf": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.gtf(\\.gz)?$", - "description": "Path to GTF annotation file.", - "fa_icon": "fas fa-code-branch", - "help_text": "This parameter is *mandatory* if `--genome` is not specified." - }, - "star_index": { - "type": "string", - "format": "path", - "exists": true, - "fa_icon": "fas fa-bezier-curve", - "description": "Path to directory or tar.gz archive for pre-built STAR index." - } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "properties": { - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], - "hidden": true - } - } - } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/generic_options" - } - ] - } - \ No newline at end of file diff --git a/run_nextflow.sh b/run_nextflow.sh deleted file mode 100644 index 69ad309f..00000000 --- a/run_nextflow.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -set -euo pipefail - -workflow_path='/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules' - -# Set input and output dirs -input=`realpath $1` -output=`realpath $2` -email=$3 - -mkdir -p $output && cd $output -mkdir -p log - -if ! { [ -f 'workflow.running' ] || [ -f 'workflow.done' ] || [ -f 'workflow.failed' ]; }; then -touch workflow.running -echo "check directory for output: ${output}" - -export JAVA_HOME='/hpc/diaggen/software/tools/jdk-18.0.2.1/' # change java version -export NXF_JAVA_HOME='/hpc/diaggen/software/tools/jdk-18.0.2.1/' # change java vesion of nextflow - -sbatch < Date: Tue, 9 Jan 2024 16:40:39 +0100 Subject: [PATCH 11/31] outrider.R with default output dir and correct R script call in main.nf at NexflowModules --- Outrider/1.20.0/main.nf | 20 +++++++++----------- Outrider/1.20.0/outrider.R | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 64a23ddd..e8e08276 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -1,18 +1,16 @@ - process OUTRIDER { - tag "$meta.id" +// tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-outrider:1.20.0--r43hf17093f_0 ' : - 'biocontainers/bioconductor-outrider:1.20.0--r43hf17093f_0 ' }" + container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - tuple val(meta), path(counts), path(ref) - + path(counts) + path(ref) + output: - tuple val(meta), path("*.tsv"), emit: tsv + path("*.tsv"), emit: tsv // path "versions.yml" , emit: versions /* when: @@ -20,9 +18,9 @@ process OUTRIDER { script: """ - echo $counts - echo $ref - Rscript "outrider.R" "$counts" "/out" -r "$ref" +# mkdir -p ${params.outdir}/outrider +# Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -o "${params.outdir}/outrider/" -r "${ref}" + Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -r "${ref}" """ } \ No newline at end of file diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 2cd08598..aefecd56 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -12,8 +12,8 @@ suppressPackageStartupMessages(library("dplyr")) parser <- ArgumentParser(description = "Process some integers") parser$add_argument("query", metavar = "query_input_files", nargs = "+", help = "Files or directories containing the query input files.") -parser$add_argument("output_path", metavar = "output_path", nargs = "+", - help = "Path where output of OUTRIDER will be stored.") +parser$add_argument("-o","--output_path", metavar = "output_path", nargs = "+", + help = "Path where output of OUTRIDER will be stored.", default="./") parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", help = "Files or directories containing the reference input files.") args <- parser$parse_args() From 388de3536eaaf9938eeb591897e547a69de05ba0 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Mon, 15 Jan 2024 16:19:27 +0100 Subject: [PATCH 12/31] Edit path to outrider dir --- Outrider/1.20.0/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index e8e08276..4aae5175 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -18,9 +18,9 @@ process OUTRIDER { script: """ -# mkdir -p ${params.outdir}/outrider -# Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -o "${params.outdir}/outrider/" -r "${ref}" - Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -r "${ref}" + Rscript "${moduleDir}/outrider.R" "${counts}" -r "${ref}" """ +// mkdir -p ${params.outdir}/outrider +// Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -o "${params.outdir}/outrider/" -r "${ref}" } \ No newline at end of file From fdaaa3ba9b3060875b823bbcda2ccd42f898132a Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 6 Feb 2024 15:34:07 +0100 Subject: [PATCH 13/31] Outrider main.nf added tag meta.id and param feature exon/gene --- Outrider/1.20.0/main.nf | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 4aae5175..a4f88083 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -1,14 +1,15 @@ process OUTRIDER { -// tag "$meta.id" + tag "$meta.id" label 'process_medium' conda "${moduleDir}/environment.yml" container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - path(counts) + tuple val(meta), path(counts) path(ref) - + val(feature) + output: path("*.tsv"), emit: tsv // path "versions.yml" , emit: versions @@ -18,9 +19,10 @@ process OUTRIDER { script: """ - Rscript "${moduleDir}/outrider.R" "${counts}" -r "${ref}" + echo "${meta}" + echo "${ref}" + echo "${counts}" + Rscript ${moduleDir}/outrider.R ${counts} -r ${ref} -f ${feature} """ -// mkdir -p ${params.outdir}/outrider -// Rscript "/hpc/diaggen/users/lonneke/github/DxNextflowRNA/NextflowModules/Outrider/1.20.0/outrider.R" "${counts}" -o "${params.outdir}/outrider/" -r "${ref}" } \ No newline at end of file From fdb3a8018df50a947237ac2a114517a9dd8fed4c Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 6 Feb 2024 15:35:55 +0100 Subject: [PATCH 14/31] outrider.R added argument feature to use in output tsv file, also as parameter in functions main and save_output --- Outrider/1.20.0/outrider.R | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index aefecd56..73d7925d 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -10,17 +10,19 @@ suppressPackageStartupMessages(library("dplyr")) # Argument parser parser <- ArgumentParser(description = "Process some integers") -parser$add_argument("query", metavar = "query_input_files", nargs = "+", +parser$add_argument("query", metavar = "query_input_files", nargs = "+", help = "Files or directories containing the query input files.") -parser$add_argument("-o","--output_path", metavar = "output_path", nargs = "+", +parser$add_argument("-o","--output_path", metavar = "output_path", nargs = "+", help = "Path where output of OUTRIDER will be stored.", default="./") -parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", +parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", help = "Files or directories containing the reference input files.") +parser$add_argument("-f", "--feat", metavar = "feature", nargs = "+", + help = "Choose gene or exon level. Gene is default.", default="gene") args <- parser$parse_args() read_input_files <- function(input){ - sampleIDs <- c() + sampleIDs <- c() count_tables <- lapply(input, function(f) { input_ext <- tools::file_ext(f) if(input_ext == "Rds") { @@ -79,23 +81,24 @@ run_outrider <- function(all_counts) { } -save_output <- function(out_path, out_outrider, ref_samples, padj_thres=0.05, zscore_thres=0, all=True) { +save_output <- function(out_path, out_outrider, ref_samples, feature, padj_thres=0.05, zscore_thres=0, all=True) { # res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) # Reference samples are excluded from final results. query_res <- filter(res, !(sampleID %in% ref_samples)) # Write output table with aberrant expressed targets. - write_tsv(query_res, paste0(out_path, "outrider_result.tsv")) + write_tsv(query_res, paste0(out_path, "outrider_result_", feature, ".tsv")) } # TODO: investigate memory usage and if possible reduced / parallel. -main <- function(query, ref, output_path){ +main <- function(query, ref, output_path, feature){ query_data <- get_input(query) ref_data <- get_input(ref) all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) output <- run_outrider(all_counts) - save_output(output_path, output, ref_data$sampleIDs) + save_output(output_path, output, ref_data$sampleIDs, feature) } -main(args$query, args$ref, args$output_path) \ No newline at end of file + +main(args$query, args$ref, args$output_path, args$feat) From 808c14865fa24ebf168fd6e88ae97c29da28ad68 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:25:51 +0100 Subject: [PATCH 15/31] Added changes for feature and output in meta.yml file --- Outrider/1.20.0/meta.yml | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/Outrider/1.20.0/meta.yml b/Outrider/1.20.0/meta.yml index 54155e70..e856b6c9 100644 --- a/Outrider/1.20.0/meta.yml +++ b/Outrider/1.20.0/meta.yml @@ -6,21 +6,32 @@ keywords: tools: - outrider: description: | - homepage: - documentation: - licence: + homepage: + documentation: + licence: input: - meta: type: map description: | Groovy Map containing sample information + - counts: + type: file + description: | + Input file to measure aberrant expression + - ref: + type: file + description: | + List of files as a reference to measure aberrant expression to + - feature: + type: value + description: | + Value for the feature exon or gene to be used in the output file output: - meta: - type: map + type: file description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + tsv file with gene or exon values regarding aberrant expression authors: - - + - maintainers: - - + - From af534aeae5094f5d98142650cff1091f0b0311cc Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Mon, 19 Feb 2024 16:43:04 +0100 Subject: [PATCH 16/31] Outrider main.nf changed input feature, input refset gene and exon and choose accordingly --- Outrider/1.20.0/main.nf | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index a4f88083..0f86537d 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -1,28 +1,31 @@ process OUTRIDER { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "${moduleDir}/environment.yml" container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - tuple val(meta), path(counts) - path(ref) - val(feature) + tuple val(meta), path(counts), val(feature) + path(ref_gene) + path(ref_exon) output: path("*.tsv"), emit: tsv -// path "versions.yml" , emit: versions + path "versions.yml" , emit: versions - /* when: - task.ext.when == null || task.ext.when*/ + when: + task.ext.when == null || task.ext.when script: + def refset = ref_gene + if (feature == 'exon') { + refset = ref_exon + } """ echo "${meta}" - echo "${ref}" echo "${counts}" - Rscript ${moduleDir}/outrider.R ${counts} -r ${ref} -f ${feature} + echo "${refset}" + Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -f ${feature} """ - } \ No newline at end of file From cefbef6bf4f7dc1f3ab50d812a98f6c449c4083c Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:19:41 +0100 Subject: [PATCH 17/31] Added filtering, findEncodingDim, controlForConfounders, sampleExclusionMask, fitting, pvalue and zscore calculation to run outrider --- Outrider/1.20.0/outrider.R | 62 +++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 73d7925d..711ef37d 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -31,9 +31,10 @@ read_input_files <- function(input){ return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) } else if (input_ext %in% c("txt", "tsv")) { count_table <- read_delim(f, show_col_types=FALSE, skip=1) + # col 1: samples IDs, col 7: counts ct <- count_table[,c(1,7)] names(ct)[1] <- "rownames" - sampleIDs <- append(sampleIDs, colnames(count_table)) + sampleIDs <<- append(sampleIDs, colnames(ct)[2]) return(ct) } else { stop("Input file extension is not supported.") @@ -75,17 +76,64 @@ run_outrider <- function(all_counts) { rownames(all_counts_matrix) <- all_counts$rownames ods <- OutriderDataSet(countData=all_counts_matrix) - filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) - out <- OUTRIDER(filtered) + filtered <- filterExpression(ods, minCounts = FALSE, filterGenes = FALSE, gtfFile="/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf") + filtered <- filtered[mcols(filtered)$passedFilter,] +# filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) + + ods <- estimateSizeFactors(filtered) + print(ods$sizeFactor) + + ##Drop pipeline + #maxTestedDimensionProportion + mp <- 3 + implementation <- "autoencoder" + + a <- 5 + b <- min(ncol(ods), nrow(ods)) / mp # N/3 + maxSteps <- 15 + if(mp < 4){ + maxSteps <- 20 + } + + Nsteps <- min(maxSteps, b) # Do at most 20 steps or N/3 + # Do unique in case 2 were repeated + + pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique + #help suggestion: + #pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) + print(pars_q) + + ods <- findEncodingDim(ods, params = pars_q, implementation = implementation, BPPARAM=MulticoreParam(8)) + + print("getbestQ") + opt_q <- getBestQ(ods) + print(opt_q) + + ods <- controlForConfounders(ods, q=opt_q, implementation=implementation, BPPARAM=MulticoreParam(8)) + + #exclude sample of interest from the fitting bc of possible replicates.. + #message(date(), ": sample Exclusion Mask ...") + #sampleExclusionMask(ods) <- FALSE + #sampleExclusionMask(ods[,c(1)]) <- TRUE + + #if(grepl("^(peer|pca)$", implementation)){ + # message(date(), ": Fitting the data ...") + # ods <- fit(ods, BPPARAM=MulticoreParam(8)) + #} + + ods <- computePvalues(ods, alternative="two.sided", method="BY") + out <- computeZscores(ods) + +# out <- OUTRIDER(filtered, BPPARAM=MulticoreParam(8)) return(out) } -save_output <- function(out_path, out_outrider, ref_samples, feature, padj_thres=0.05, zscore_thres=0, all=True) { -# res <- as_tibble(results(ods, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) - res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=TRUE)) +save_output <- function(out_path, out_outrider, ref_samples, feature, padj_thres=0.05, zscore_thres=0, a=TRUE) { + res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. query_res <- filter(res, !(sampleID %in% ref_samples)) + #query_res <- query_res[order(query_res$pValue),] # Write output table with aberrant expressed targets. write_tsv(query_res, paste0(out_path, "outrider_result_", feature, ".tsv")) } @@ -101,4 +149,4 @@ main <- function(query, ref, output_path, feature){ } -main(args$query, args$ref, args$output_path, args$feat) +main(args$query, args$ref, args$output_path, args$feat) \ No newline at end of file From abae593ac5c1855015df1b2c5c5778992db762cf Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:53:22 +0100 Subject: [PATCH 18/31] versions.yml error --- Outrider/1.20.0/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 0f86537d..72695ac0 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -12,7 +12,7 @@ process OUTRIDER { output: path("*.tsv"), emit: tsv - path "versions.yml" , emit: versions + //path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when From 22b3d9aea58d67ef1fa6b9f1cfd79f0e83ab9e9e Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:56:24 +0100 Subject: [PATCH 19/31] full result, no exclusion mask and fit --- Outrider/1.20.0/outrider.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 711ef37d..bc53300d 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -132,8 +132,8 @@ run_outrider <- function(all_counts) { save_output <- function(out_path, out_outrider, ref_samples, feature, padj_thres=0.05, zscore_thres=0, a=TRUE) { res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. - query_res <- filter(res, !(sampleID %in% ref_samples)) - #query_res <- query_res[order(query_res$pValue),] + #query_res <- filter(res, !(sampleID %in% ref_samples)) + query_res <- res # Write output table with aberrant expressed targets. write_tsv(query_res, paste0(out_path, "outrider_result_", feature, ".tsv")) } From 86487064cdcccdd660b109f0a677c0b0bca923cc Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:00:52 +0100 Subject: [PATCH 20/31] Changes regarding separate outrider processes on GENE & EXON level with in ext.prefix --- Outrider/1.20.0/main.nf | 17 +++++------------ Outrider/1.20.0/outrider.R | 32 +++++++++++++++++--------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 72695ac0..847fc806 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -6,26 +6,19 @@ process OUTRIDER { container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - tuple val(meta), path(counts), val(feature) - path(ref_gene) - path(ref_exon) + tuple val(meta), path(counts) + path(refset) output: path("*.tsv"), emit: tsv - //path "versions.yml" , emit: versions +// path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def refset = ref_gene - if (feature == 'exon') { - refset = ref_exon - } + def prefix = task.ext.prefix ?: "${meta.id}" """ - echo "${meta}" - echo "${counts}" - echo "${refset}" - Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -f ${feature} + Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -p ${prefix} """ } \ No newline at end of file diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index bc53300d..6f5f6d1d 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -16,8 +16,8 @@ parser$add_argument("-o","--output_path", metavar = "output_path", nargs = "+", help = "Path where output of OUTRIDER will be stored.", default="./") parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+", help = "Files or directories containing the reference input files.") -parser$add_argument("-f", "--feat", metavar = "feature", nargs = "+", - help = "Choose gene or exon level. Gene is default.", default="gene") +parser$add_argument("-p", "--pref", metavar = "prefix", nargs = "+", + help = "Prefix of file.", default="gene") args <- parser$parse_args() @@ -97,24 +97,25 @@ run_outrider <- function(all_counts) { Nsteps <- min(maxSteps, b) # Do at most 20 steps or N/3 # Do unique in case 2 were repeated - pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique + #help suggestion: #pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) print(pars_q) ods <- findEncodingDim(ods, params = pars_q, implementation = implementation, BPPARAM=MulticoreParam(8)) - print("getbestQ") + #find best q (dimension) opt_q <- getBestQ(ods) + print("getbestQ") print(opt_q) + + ##exclude sample of interest from the fitting bc of possible replicates.. + message(date(), ": sample Exclusion Mask ...") + sampleExclusionMask(ods) <- FALSE + sampleExclusionMask(ods[,c(1)]) <- TRUE ods <- controlForConfounders(ods, q=opt_q, implementation=implementation, BPPARAM=MulticoreParam(8)) - - #exclude sample of interest from the fitting bc of possible replicates.. - #message(date(), ": sample Exclusion Mask ...") - #sampleExclusionMask(ods) <- FALSE - #sampleExclusionMask(ods[,c(1)]) <- TRUE #if(grepl("^(peer|pca)$", implementation)){ # message(date(), ": Fitting the data ...") @@ -129,24 +130,25 @@ run_outrider <- function(all_counts) { } -save_output <- function(out_path, out_outrider, ref_samples, feature, padj_thres=0.05, zscore_thres=0, a=TRUE) { +save_output <- function(out_path, out_outrider, ref_samples, prefix, padj_thres=0.05, zscore_thres=0, a=TRUE) { res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. - #query_res <- filter(res, !(sampleID %in% ref_samples)) +# query_res <- filter(res, !(sampleID %in% ref_samples)) query_res <- res + #query_res <- query_res[order(query_res$pValue),] # Write output table with aberrant expressed targets. - write_tsv(query_res, paste0(out_path, "outrider_result_", feature, ".tsv")) + write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) } # TODO: investigate memory usage and if possible reduced / parallel. -main <- function(query, ref, output_path, feature){ +main <- function(query, ref, output_path, prefix){ query_data <- get_input(query) ref_data <- get_input(ref) all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) output <- run_outrider(all_counts) - save_output(output_path, output, ref_data$sampleIDs, feature) + save_output(output_path, output, ref_data$sampleIDs, prefix) } -main(args$query, args$ref, args$output_path, args$feat) \ No newline at end of file +main(args$query, args$ref, args$output_path, args$pref) \ No newline at end of file From 2acf15156fe8a844a7aea5fe46d4ddbed0fbb94b Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 18 Jun 2024 17:52:25 +0200 Subject: [PATCH 21/31] Outrider.R additions to generate input files for EMC app. Added separate filtering for exon level and ratio fpkm --- Outrider/1.20.0/main.nf | 2 + Outrider/1.20.0/outrider.R | 187 +++++++++++++++++++++++++++---------- 2 files changed, 142 insertions(+), 47 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 847fc806..6797671f 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -11,6 +11,8 @@ process OUTRIDER { output: path("*.tsv"), emit: tsv + path("*.csv"), emit: csv + path("*.pdf"), emit: pdf // path "versions.yml" , emit: versions when: diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 6f5f6d1d..b5da6ba7 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -7,6 +7,7 @@ suppressPackageStartupMessages(library("tools")) suppressPackageStartupMessages(library("tibble")) suppressPackageStartupMessages(library("readr")) suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("stringr")) # Argument parser parser <- ArgumentParser(description = "Process some integers") @@ -31,8 +32,8 @@ read_input_files <- function(input){ return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) } else if (input_ext %in% c("txt", "tsv")) { count_table <- read_delim(f, show_col_types=FALSE, skip=1) - # col 1: samples IDs, col 7: counts - ct <- count_table[,c(1,7)] + # col 1: samples IDs, col 9: counts + ct <- count_table[,c(1,9)] names(ct)[1] <- "rownames" sampleIDs <<- append(sampleIDs, colnames(ct)[2]) return(ct) @@ -55,7 +56,7 @@ get_input <- function(input){ } else { # If both dir and files are provided, or different tyes (character, int etc) stop("Input is neither dir or file.") } - + return(read_input_files(retrieved_files)) } @@ -68,74 +69,160 @@ merge_count_tables <- function(lst_query, lst_ref){ } -run_outrider <- function(all_counts) { +run_outrider <- function(all_counts, query, prefix) { # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. - # filter expression all_counts_matrix <- as.matrix(all_counts)[,-1] mode(all_counts_matrix) <- "integer" rownames(all_counts_matrix) <- all_counts$rownames ods <- OutriderDataSet(countData=all_counts_matrix) - filtered <- filterExpression(ods, minCounts = FALSE, filterGenes = FALSE, gtfFile="/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf") - filtered <- filtered[mcols(filtered)$passedFilter,] -# filtered <- filterExpression(ods, minCounts = TRUE, filterGenes = TRUE) - - ods <- estimateSizeFactors(filtered) - print(ods$sizeFactor) - ##Drop pipeline - #maxTestedDimensionProportion - mp <- 3 - implementation <- "autoencoder" + plotheat = "counts_heatplots.pdf" + pdf(plotheat,onefile = TRUE) - a <- 5 - b <- min(ncol(ods), nrow(ods)) / mp # N/3 - maxSteps <- 15 - if(mp < 4){ - maxSteps <- 20 + if(grepl("gene", prefix)){ + ##FOR GENE LEVEL + ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile="/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf") } + else{ + ##FOR EXON LEVEL + ct <- read_delim(query, show_col_types=FALSE, skip=1) + print("identical exon ids:") + print(identical(ct$Geneid,rownames(assays(ods)$counts))) + readD <- apply(assays(ods)$counts, 2, function(x) x / sum(x) * 10^6) + countsFPKM <- readD / ct$Length * 10^3 + perc95e <- apply(countsFPKM, 1, function(x) quantile(x,probs=0.95)) + mcols(ods)$passedFilter <- perc95e>1 + mcols(ods)$basepairs <- ct$Length + + +##FOR RATIO FPKM +# print("RATIO FPKM") +# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) +# } +# display the FPKM distribution of counts. +# plotFPKM(ods) + message(date(), ": dim before filtering...") + print(dim(assays(ods)$counts)) + +##SUBSETTING + ods <- ods[mcols(ods)$passedFilter,] + message(date(), ": dim after filtering...") + print(dim(assays(ods)$counts)) + +# Heatmap of the sample correlation +# it can also annotate the clusters resulting from the dendrogram + ods <- plotCountCorHeatmap(ods, normalized=FALSE, nRowCluster=4) + +# Heatmap of the gene/sample expression + ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE, nRowCluster=4) + + ods <- estimateSizeFactors(ods) + message(date(), ": sizeFactors...") + print(ods$sizeFactor) - Nsteps <- min(maxSteps, b) # Do at most 20 steps or N/3 - # Do unique in case 2 were repeated - pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique - - #help suggestion: - #pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) - print(pars_q) - - ods <- findEncodingDim(ods, params = pars_q, implementation = implementation, BPPARAM=MulticoreParam(8)) + implementation <- "autoencoder" + pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) + message(date(), ": findEncodingDim...") + ods <- findEncodingDim(ods, params = pars_q, implementation=implementation, BPPARAM=MulticoreParam(8)) + #find best q (dimension) opt_q <- getBestQ(ods) - print("getbestQ") + message(date(), ": getbestQ...") print(opt_q) - - ##exclude sample of interest from the fitting bc of possible replicates.. - message(date(), ": sample Exclusion Mask ...") - sampleExclusionMask(ods) <- FALSE - sampleExclusionMask(ods[,c(1)]) <- TRUE - ods <- controlForConfounders(ods, q=opt_q, implementation=implementation, BPPARAM=MulticoreParam(8)) + ods <- controlForConfounders(ods, q = opt_q, BPPARAM = MulticoreParam(8)) + +# After controlling for confounders the heatmap should be plotted again. +# If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* + ods <- plotCountCorHeatmap(ods, normalized=TRUE) + dev.off() - #if(grepl("^(peer|pca)$", implementation)){ - # message(date(), ": Fitting the data ...") - # ods <- fit(ods, BPPARAM=MulticoreParam(8)) - #} +# if(grepl("^(peer|pca)$", implementation)){ +# message(date(), ": Fitting the data ...") # NOT with autoencoder +# ods <- fit(ods, BPPARAM=MulticoreParam(8)) +# } - ods <- computePvalues(ods, alternative="two.sided", method="BY") + ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) out <- computeZscores(ods) -# out <- OUTRIDER(filtered, BPPARAM=MulticoreParam(8)) +# run full OUTRIDER pipeline (control, fit model, calculate P-values) +# out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) return(out) } -save_output <- function(out_path, out_outrider, ref_samples, prefix, padj_thres=0.05, zscore_thres=0, a=TRUE) { - res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) +##Only necessary for Erasmus app +get_ct_emcapp <- function(query, prefix){ + count_table <- read_delim(query, show_col_types=FALSE, skip=1) + ct <- count_table[,c(2,3,4,8,1)] + colnames(ct)<- c("chr","start","end","gene","geneID") + ct$chr <- str_split_fixed(ct$chr,';',2)[,1] + ct$start <- str_split_fixed(ct$start,';',2)[,1] + ct$end <- gsub("^.*;", "", ct$end) + + ##For gene level: create countdata table Identifier chr_start_end_genename + ct$emcID <- paste(ct$chr,ct$start,ct$end,ct$gene,sep="_") + ##For exon level: create countdata table Identifier: add Ensemble exon_id: chr_start_end_genename-exon_id + if(grepl("exon", prefix)){ + ct$emcID <- paste(ct$emcID,ct$geneID,sep="-") + } + + return(ct) +} + + +##Only necessary for Erasmus app +save_res_emcapp <- function(ct, res, prefix, out_path){ + res <- as_tibble(results(res, all=TRUE))[,c(2,1,3:14)] + res_merge <- merge(res, ct, by = "geneID", all.res = TRUE) + res_merge$geneENSG <- res_merge$geneID + res_merge$geneID <- res_merge$emcID + res_merge$sample_name<-NA + res_merge$TIN_mean<-NA + res_merge$link_bam<-NA + + fragment <- "genes" + treatment <- "untreated" + if(grepl("exon",prefix)){ fragment <- "exons"} + if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + write_csv(res_merge[,c(1:18,20:23)], paste0(out_path, "umcu_rnaseq_fib_",treatment,"_res_outrider_",fragment,"_counts.tsv"), append=FALSE, col_names = TRUE) +} + +##Only necessary for Erasmus app +save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ + ct <- ct[,c("emcID","geneID")] + colnames(all_counts)[1]<-c("geneID") + merge_counts <- merge(all_counts, ct, by = "geneID", all.ct = TRUE) + merge_counts$geneID <- merge_counts$emcID + counts_out <- merge_counts[,c(1:ncol(merge_counts)-1)] + colnames(counts_out)[1]<-c("") + + ##countdata output table EMC app + if(grepl("gene", prefix)){ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_genes_counts.tsv")) + }else{ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_exons_counts.tsv"))} + + ##Meta output table EMC app + treatment <- "untreated" + if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") + colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") + if(grepl("gene",prefix)){ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_genes.csv"), row.names = FALSE, quote=FALSE) + }else{ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE)} +} + +save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { +# res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. # query_res <- filter(res, !(sampleID %in% ref_samples)) + res <- as_tibble(results(out_outrider, all=a)) query_res <- res - #query_res <- query_res[order(query_res$pValue),] + # Write output table with aberrant expressed targets. write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) } @@ -146,8 +233,14 @@ main <- function(query, ref, output_path, prefix){ query_data <- get_input(query) ref_data <- get_input(ref) all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) - output <- run_outrider(all_counts) - save_output(output_path, output, ref_data$sampleIDs, prefix) + + output <- run_outrider(all_counts, query, prefix) + save_output(output_path, output, ref_data$sampleIDs, prefix, query) + + ##Only necessary for Erasmus app, to get the counts, meta and outrider results tables + ct_emcapp <- get_ct_emcapp(query, prefix) + save_count_meta_emcapp(ct_emcapp, all_counts, output_path, prefix) + save_res_emcapp(ct_emcapp, output, prefix, output_path) } From ccf831fe817454763ed6e4eb0fbf97a131e6479b Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:43:45 +0200 Subject: [PATCH 22/31] forgot } --- Outrider/1.20.0/outrider.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index b5da6ba7..3dd5c624 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -95,11 +95,11 @@ run_outrider <- function(all_counts, query, prefix) { mcols(ods)$passedFilter <- perc95e>1 mcols(ods)$basepairs <- ct$Length + ##FOR RATIO FPKM +# print("RATIO FPKM") +# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) + } -##FOR RATIO FPKM -# print("RATIO FPKM") -# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) -# } # display the FPKM distribution of counts. # plotFPKM(ods) message(date(), ": dim before filtering...") From dd36ce0d695aeaa2137696faefb38ea1fe5fbbf6 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Wed, 19 Jun 2024 11:09:01 +0200 Subject: [PATCH 23/31] filter_expression as a separate function in outrider.R --- Outrider/1.20.0/outrider.R | 50 +++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 3dd5c624..89007fb9 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -69,17 +69,7 @@ merge_count_tables <- function(lst_query, lst_ref){ } -run_outrider <- function(all_counts, query, prefix) { - # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. - all_counts_matrix <- as.matrix(all_counts)[,-1] - mode(all_counts_matrix) <- "integer" - rownames(all_counts_matrix) <- all_counts$rownames - - ods <- OutriderDataSet(countData=all_counts_matrix) - - plotheat = "counts_heatplots.pdf" - pdf(plotheat,onefile = TRUE) - +filter_expression <- function(ods, query, prefix){ if(grepl("gene", prefix)){ ##FOR GENE LEVEL ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile="/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf") @@ -96,25 +86,41 @@ run_outrider <- function(all_counts, query, prefix) { mcols(ods)$basepairs <- ct$Length ##FOR RATIO FPKM -# print("RATIO FPKM") -# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) +# print("RATIO FPKM") +# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) } -# display the FPKM distribution of counts. -# plotFPKM(ods) + # display the FPKM distribution of counts. + # plotFPKM(ods) message(date(), ": dim before filtering...") print(dim(assays(ods)$counts)) -##SUBSETTING + ##SUBSETTING ods <- ods[mcols(ods)$passedFilter,] message(date(), ": dim after filtering...") print(dim(assays(ods)$counts)) + return(ods) +} + + +run_outrider <- function(all_counts, query, prefix) { + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + all_counts_matrix <- as.matrix(all_counts)[,-1] + mode(all_counts_matrix) <- "integer" + rownames(all_counts_matrix) <- all_counts$rownames + + ods <- OutriderDataSet(countData=all_counts_matrix) + + plotheat = "counts_heatplots.pdf" + pdf(plotheat,onefile = TRUE) + + ods <- filter_expression(ods, query, prefix) -# Heatmap of the sample correlation -# it can also annotate the clusters resulting from the dendrogram + # Heatmap of the sample correlation + # it can also annotate the clusters resulting from the dendrogram ods <- plotCountCorHeatmap(ods, normalized=FALSE, nRowCluster=4) -# Heatmap of the gene/sample expression + # Heatmap of the gene/sample expression ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE, nRowCluster=4) ods <- estimateSizeFactors(ods) @@ -134,8 +140,8 @@ run_outrider <- function(all_counts, query, prefix) { ods <- controlForConfounders(ods, q = opt_q, BPPARAM = MulticoreParam(8)) -# After controlling for confounders the heatmap should be plotted again. -# If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* + # After controlling for confounders the heatmap should be plotted again. + # If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* ods <- plotCountCorHeatmap(ods, normalized=TRUE) dev.off() @@ -147,7 +153,7 @@ run_outrider <- function(all_counts, query, prefix) { ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) out <- computeZscores(ods) -# run full OUTRIDER pipeline (control, fit model, calculate P-values) + # run full OUTRIDER pipeline (control, fit model, calculate P-values) # out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) return(out) } From 3e59c78eb06c564b11292a66255c4bbc488edd51 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Wed, 19 Jun 2024 13:35:36 +0200 Subject: [PATCH 24/31] chx->CHX in outrider.R --- Outrider/1.20.0/outrider.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 89007fb9..51e04cd0 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -192,10 +192,11 @@ save_res_emcapp <- function(ct, res, prefix, out_path){ fragment <- "genes" treatment <- "untreated" if(grepl("exon",prefix)){ fragment <- "exons"} - if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + if(grepl("chx|CHX",prefix)){ treatment <- "CHX"} write_csv(res_merge[,c(1:18,20:23)], paste0(out_path, "umcu_rnaseq_fib_",treatment,"_res_outrider_",fragment,"_counts.tsv"), append=FALSE, col_names = TRUE) } + ##Only necessary for Erasmus app save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ ct <- ct[,c("emcID","geneID")] @@ -213,7 +214,7 @@ save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ ##Meta output table EMC app treatment <- "untreated" - if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + if(grepl("chx|CHX",prefix)){ treatment <- "CHX"} metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") if(grepl("gene",prefix)){ @@ -222,6 +223,7 @@ save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE)} } + save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { # res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. From 5d596e5af262f8c99a00c191b90c08a1602e63fc Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Thu, 4 Jul 2024 13:30:05 +0200 Subject: [PATCH 25/31] fraser module added for aberrant splicing --- Fraser/1.99.3/environment.yml | 6 ++ Fraser/1.99.3/fraser.R | 153 ++++++++++++++++++++++++++++++++++ Fraser/1.99.3/main.nf | 26 ++++++ Outrider/1.20.0/outrider.R | 7 +- 4 files changed, 189 insertions(+), 3 deletions(-) create mode 100644 Fraser/1.99.3/environment.yml create mode 100644 Fraser/1.99.3/fraser.R create mode 100644 Fraser/1.99.3/main.nf diff --git a/Fraser/1.99.3/environment.yml b/Fraser/1.99.3/environment.yml new file mode 100644 index 00000000..c40ff5c1 --- /dev/null +++ b/Fraser/1.99.3/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 \ No newline at end of file diff --git a/Fraser/1.99.3/fraser.R b/Fraser/1.99.3/fraser.R new file mode 100644 index 00000000..137156cf --- /dev/null +++ b/Fraser/1.99.3/fraser.R @@ -0,0 +1,153 @@ + +# load FRASER library +suppressPackageStartupMessages(library(FRASER)) +suppressPackageStartupMessages(library(biomaRt)) +suppressPackageStartupMessages(library(utils)) + +args <- commandArgs() + + +get_st_rows <- function(input){ + if(dir.exists(input)){ + sampleID <- list.files(path=input,pattern="*.bam",full.names=FALSE) + bamFile <- list.files(path=input,pattern="*.bam",full.names=TRUE) + return(data.table(sampleID,bamFile)) + } + else if(file.exists(input)){ + sampleID <- basename(file.path(input)) + bamFile <- file.path(input) + return(data.table(sampleID,bamFile)) + } +} + + +create_sample_table <- function(sample,ref,paired_end){ + sampleTable <- rbind(get_st_rows(sample),get_st_rows(ref)) + sampleTable <- sampleTable[!grepl(".bai",sampleID), ] + sampleTable$group <- c(1:nrow(sampleTable)) + sampleTable$pairedEnd <- TRUE + + message(date(), ": sampleTable") + print(sampleTable) + + return(sampleTable) +} + + +run_fraser <- function(sampleTable){ + # create FRASER object + settings <- FraserDataSet(colData=sampleTable, workingDir="./FRASER_output") + message(date(), ": settings...") + print(settings) + + register(MulticoreParam(workers=min(8, multicoreWorkers()))) + + # count reads + fds <- countRNAData(settings) + + fds <- calculatePSIValues(fds) + #FRASER vignette/gagneurlab settings: + #Currently, we suggest keeping only junctions which support the following: + #minExpressionInOneSample - 20 - At least one sample has 20 (or more) reads + #quantileMinExpression - 10 - The minimum total read count (N) an intron needs to have at the specified quantile across samples to pass the filter. See quantileForFiltering. + #quantile - 0.95 - Defines at which percentile the quantileMinExpression filter is applied. + #A value of 0.95 means that at least 5% of the samples need to have a total read count N >= quantileMinExpression to pass the filter. + #minDeltaPsi - 0.05 - The minimal variation (in delta psi) required for an intron to pass the filter. gagneurlab suggests 0.05 + + fds <- filterExpressionAndVariability(fds, minExpressionInOneSample = 20, quantileMinExpression=10, quantile = 0.95, minDeltaPsi=0.05, filter=FALSE) + fds <- saveFraserDataSet(fds, name="fds-filter") +# plotheat = "plots_fraser.pdf" +# pdf(plotheat,onefile = TRUE) +# plotFilterExpression(fds, bins=100) + + message(date(), ": dimension before filtering...") + print(dim(fds)) + fds <- fds[mcols(fds, type="j")[,"passed"]] + message(date(), ": dimension after filtering...") + print(dim(fds)) + + #Finds the optimal encoding dimension by injecting artificial splicing outlier ratios while maximizing the precision-recall curve. + # Get range for latent space dimension, max tested dimensions = 6 + mp <- 6 + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- unique(round(exp(seq(log(a),log(b),length.out = Nsteps)))) + + message(date(), ": getEncDimRange", pars_q) + + psiType = "jaccard" + impl = "PCA" + + fds <- saveFraserDataSet(fds, name="fds-normfalse") + # Heatmap of the sample correlation +# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=FALSE) + # Heatmap of the intron/sample expression +# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + + #set.seed(as.integer(random)) + # hyperparameter opimization can be used to estimate the dimension q of the latent space of the data. + # It works by artificially injecting outliers into the data and then comparing the AUC of recalling these outliers for different values of q. + fds <- optimHyperParams(fds, type=psiType, implementation=impl, q_param=pars_q, minDeltaPsi=0.05, plot=FALSE) + # retrieve the estimated optimal dimension of the latent space + #currentType(fds) <- psiType + bestq = bestQ(fds, type=psiType) + message(date(), "...find best q: " ,bestq) + + # Add verbosity to the FRASER object + verbose(fds) <- 3 + fds <- fit(fds, q=bestq, type=psiType, implementation=impl, iterations=15) + + # Heatmap of the sample correlation +# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=TRUE) + # Heatmap of the intron/sample expression +# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=TRUE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + dev.off() + fds <- saveFraserDataSet(fds, name="fds-normtrue") + + #all in one getEncDimRange, optimHyperParams, bestQ, fit + #fds_fraser <- FRASER(fds_filtered, q=c(jaccard=2)) + + fds <- annotateRanges(fds, GRCh = 38) + + # Pvalues + fds <- calculatePvalues(fds, type=psiType) + # Adjust Pvalues + fds <- calculatePadjValues(fds, type=psiType) + + fds <- saveFraserDataSet(fds, name="fds-final") + + return (fds) +} + +write_fraser_output <- function(fraser_ds, prefix){ + # retrieve results with default and recommended cutoffs (padj <= 0.05 (0.1 gagneurlab) and # |deltaPsi| >= 0.3) + fraser_res <- results(fraser_ds, all=TRUE) + #res <- results(fds, psiType=psiTypes, aggregate=TRUE, collapse=FALSE, all=TRUE) + message(date(), ": result...") + print(fraser_res) + + write.table(as.data.frame(unname(fraser_res)), paste0("fraser_result_",prefix,".tsv"), sep="\t", quote=FALSE) +} + + +main <- function(args){ + ##Argument.Parser not in docker image.. + sample <- args[6] #: bam + bai sample of interest + refset <- args[7] #: dir to ref set bams + bai + prefix <- args[8] #: prefix + pairedEnd <- args[9] #: pairedEnd (=TRUE) + sampleTable <- create_sample_table(sample, refset, pairedEnd) + message(date(), ": sampleTable") + fraser_ds <- run_fraser(sampleTable) + write_fraser_output(fraser_ds, prefix) +} + + +main(args) diff --git a/Fraser/1.99.3/main.nf b/Fraser/1.99.3/main.nf new file mode 100644 index 00000000..c63568f8 --- /dev/null +++ b/Fraser/1.99.3/main.nf @@ -0,0 +1,26 @@ + +process FRASER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "quay.io/biocontainers/bioconductor-fraser:1.99.3--r43hf17093f_0" + + input: + tuple val(meta), path(input) + path(refset) + + output: + path("*.tsv"), emit: tsv +// path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + export BIOMART_CACHE=/hpc/diaggen/projects/woc/rna/container + Rscript ${moduleDir}/fraser.R ${input} ${refset} ${prefix} TRUE + """ +} \ No newline at end of file diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 51e04cd0..0b17687e 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -91,7 +91,7 @@ filter_expression <- function(ods, query, prefix){ } # display the FPKM distribution of counts. - # plotFPKM(ods) + ods <- plotFPKM(ods) message(date(), ": dim before filtering...") print(dim(assays(ods)$counts)) @@ -118,10 +118,10 @@ run_outrider <- function(all_counts, query, prefix) { # Heatmap of the sample correlation # it can also annotate the clusters resulting from the dendrogram - ods <- plotCountCorHeatmap(ods, normalized=FALSE, nRowCluster=4) + ods <- plotCountCorHeatmap(ods, normalized=FALSE) # Heatmap of the gene/sample expression - ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE, nRowCluster=4) + ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE) ods <- estimateSizeFactors(ods) message(date(), ": sizeFactors...") @@ -143,6 +143,7 @@ run_outrider <- function(all_counts, query, prefix) { # After controlling for confounders the heatmap should be plotted again. # If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* ods <- plotCountCorHeatmap(ods, normalized=TRUE) + ods <- plotCountGeneSampleHeatmap(ods, normalized=TRUE) dev.off() # if(grepl("^(peer|pca)$", implementation)){ From 12c1e1c3d0ae208ed7154260e3806e716f3f83e6 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:46:04 +0200 Subject: [PATCH 26/31] Instructions for Exon ratio code (un)commenting --- Outrider/1.20.0/outrider.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 0b17687e..2e120353 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -85,8 +85,10 @@ filter_expression <- function(ods, query, prefix){ mcols(ods)$passedFilter <- perc95e>1 mcols(ods)$basepairs <- ct$Length - ##FOR RATIO FPKM -# print("RATIO FPKM") + ##FOR EXON RATIO + ##REMOVE ##FOR EXON LEVEL code above + ##UNCOMMENT BELOW STEPS +# print("EXON RATIO") # ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) } From 931a15962cb44bbc3ad2061de5010c40b00ff28a Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:13:59 +0200 Subject: [PATCH 27/31] added returns for readability, CHX->chx --- Outrider/1.20.0/outrider.R | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 2e120353..a7781b36 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -56,7 +56,7 @@ get_input <- function(input){ } else { # If both dir and files are provided, or different tyes (character, int etc) stop("Input is neither dir or file.") } - + return(read_input_files(retrieved_files)) } @@ -161,7 +161,6 @@ run_outrider <- function(all_counts, query, prefix) { return(out) } - ##Only necessary for Erasmus app get_ct_emcapp <- function(query, prefix){ count_table <- read_delim(query, show_col_types=FALSE, skip=1) @@ -181,7 +180,6 @@ get_ct_emcapp <- function(query, prefix){ return(ct) } - ##Only necessary for Erasmus app save_res_emcapp <- function(ct, res, prefix, out_path){ res <- as_tibble(results(res, all=TRUE))[,c(2,1,3:14)] @@ -191,15 +189,14 @@ save_res_emcapp <- function(ct, res, prefix, out_path){ res_merge$sample_name<-NA res_merge$TIN_mean<-NA res_merge$link_bam<-NA - - fragment <- "genes" + + fragment <- "genes" treatment <- "untreated" if(grepl("exon",prefix)){ fragment <- "exons"} if(grepl("chx|CHX",prefix)){ treatment <- "CHX"} write_csv(res_merge[,c(1:18,20:23)], paste0(out_path, "umcu_rnaseq_fib_",treatment,"_res_outrider_",fragment,"_counts.tsv"), append=FALSE, col_names = TRUE) } - ##Only necessary for Erasmus app save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ ct <- ct[,c("emcID","geneID")] @@ -214,10 +211,10 @@ save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_genes_counts.tsv")) }else{ write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_exons_counts.tsv"))} - + ##Meta output table EMC app treatment <- "untreated" - if(grepl("chx|CHX",prefix)){ treatment <- "CHX"} + if(grepl("chx|CHX",prefix)){ treatment <- "chx"} metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") if(grepl("gene",prefix)){ @@ -226,7 +223,6 @@ save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE)} } - save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { # res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) # Reference samples are excluded from final results. @@ -238,7 +234,6 @@ save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) } - # TODO: investigate memory usage and if possible reduced / parallel. main <- function(query, ref, output_path, prefix){ query_data <- get_input(query) From 0c330def3f83835a9a08f39e46a0a8b952514844 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:21:41 +0200 Subject: [PATCH 28/31] Fraser edits for readability and logging --- Fraser/1.99.3/fraser.R | 31 ++++++++++++++++++------------- Fraser/1.99.3/meta.yml | 25 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 13 deletions(-) create mode 100644 Fraser/1.99.3/meta.yml diff --git a/Fraser/1.99.3/fraser.R b/Fraser/1.99.3/fraser.R index 137156cf..4ef6ac4b 100644 --- a/Fraser/1.99.3/fraser.R +++ b/Fraser/1.99.3/fraser.R @@ -1,4 +1,3 @@ - # load FRASER library suppressPackageStartupMessages(library(FRASER)) suppressPackageStartupMessages(library(biomaRt)) @@ -12,7 +11,7 @@ get_st_rows <- function(input){ sampleID <- list.files(path=input,pattern="*.bam",full.names=FALSE) bamFile <- list.files(path=input,pattern="*.bam",full.names=TRUE) return(data.table(sampleID,bamFile)) - } + } else if(file.exists(input)){ sampleID <- basename(file.path(input)) bamFile <- file.path(input) @@ -46,7 +45,8 @@ run_fraser <- function(sampleTable){ fds <- countRNAData(settings) fds <- calculatePSIValues(fds) - #FRASER vignette/gagneurlab settings: + + #FRASER vignette/gagneurlab settings: #Currently, we suggest keeping only junctions which support the following: #minExpressionInOneSample - 20 - At least one sample has 20 (or more) reads #quantileMinExpression - 10 - The minimum total read count (N) an intron needs to have at the specified quantile across samples to pass the filter. See quantileForFiltering. @@ -56,19 +56,22 @@ run_fraser <- function(sampleTable){ fds <- filterExpressionAndVariability(fds, minExpressionInOneSample = 20, quantileMinExpression=10, quantile = 0.95, minDeltaPsi=0.05, filter=FALSE) fds <- saveFraserDataSet(fds, name="fds-filter") -# plotheat = "plots_fraser.pdf" -# pdf(plotheat,onefile = TRUE) -# plotFilterExpression(fds, bins=100) + plotheat = "plots_fraser.pdf" + pdf(plotheat,onefile = TRUE) + plotFilterExpression(fds, bins=100) message(date(), ": dimension before filtering...") - print(dim(fds)) + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) + fds <- fds[mcols(fds, type="j")[,"passed"]] message(date(), ": dimension after filtering...") - print(dim(fds)) + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) #Finds the optimal encoding dimension by injecting artificial splicing outlier ratios while maximizing the precision-recall curve. # Get range for latent space dimension, max tested dimensions = 6 - mp <- 6 + mp <- 6 a <- 2 b <- min(ncol(fds), nrow(fds)) / mp # N/mp @@ -101,7 +104,7 @@ run_fraser <- function(sampleTable){ message(date(), "...find best q: " ,bestq) # Add verbosity to the FRASER object - verbose(fds) <- 3 + verbose(fds) <- 3 fds <- fit(fds, q=bestq, type=psiType, implementation=impl, iterations=15) # Heatmap of the sample correlation @@ -113,19 +116,21 @@ run_fraser <- function(sampleTable){ #all in one getEncDimRange, optimHyperParams, bestQ, fit #fds_fraser <- FRASER(fds_filtered, q=c(jaccard=2)) - + fds <- annotateRanges(fds, GRCh = 38) # Pvalues fds <- calculatePvalues(fds, type=psiType) # Adjust Pvalues fds <- calculatePadjValues(fds, type=psiType) + fds <- saveFraserDataSet(fds, name="fds-final") - + return (fds) } + write_fraser_output <- function(fraser_ds, prefix){ # retrieve results with default and recommended cutoffs (padj <= 0.05 (0.1 gagneurlab) and # |deltaPsi| >= 0.3) fraser_res <- results(fraser_ds, all=TRUE) @@ -150,4 +155,4 @@ main <- function(args){ } -main(args) +main(args) \ No newline at end of file diff --git a/Fraser/1.99.3/meta.yml b/Fraser/1.99.3/meta.yml new file mode 100644 index 00000000..5f1ecfa0 --- /dev/null +++ b/Fraser/1.99.3/meta.yml @@ -0,0 +1,25 @@ +name: fraser +description: Run FRASER on a set of bam files +keywords: + - fraser + - abberant splicing +tools: + - fraser: + description: FRASER - Find RAre Splicing Events in RNA-seq Data + homepage: + documentation: https://www.bioconductor.org/packages/release/bioc/vignettes/FRASER/inst/doc/FRASER.pdf + licence: +input: + - bams: + type: path + description: | + path directory where bam files are located +output: + - meta: + type: file + description: | + tsv file with aberrant spling output per sample +authors: + - +maintainers: + - \ No newline at end of file From 21dab56afca563e5f614ac4fd78221fa87aca3b3 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:21:44 +0200 Subject: [PATCH 29/31] Newline, enter and genome_gtf passed as argument --- Fraser/1.99.3/environment.yml | 3 +- Fraser/1.99.3/fraser.R | 248 ++++++++++++------------ Fraser/1.99.3/main.nf | 2 +- Fraser/1.99.3/meta.yml | 2 +- Outrider/1.20.0/main.nf | 26 +-- Outrider/1.20.0/meta.yml | 1 + Outrider/1.20.0/outrider.R | 343 +++++++++++++++++----------------- 7 files changed, 313 insertions(+), 312 deletions(-) diff --git a/Fraser/1.99.3/environment.yml b/Fraser/1.99.3/environment.yml index c40ff5c1..73a13da2 100644 --- a/Fraser/1.99.3/environment.yml +++ b/Fraser/1.99.3/environment.yml @@ -3,4 +3,5 @@ channels: - bioconda - defaults dependencies: - - bioconda::fastqc=0.12.1 \ No newline at end of file + - bioconda::fastqc=0.12.1 + \ No newline at end of file diff --git a/Fraser/1.99.3/fraser.R b/Fraser/1.99.3/fraser.R index 4ef6ac4b..1e9491b3 100644 --- a/Fraser/1.99.3/fraser.R +++ b/Fraser/1.99.3/fraser.R @@ -7,152 +7,146 @@ args <- commandArgs() get_st_rows <- function(input){ - if(dir.exists(input)){ - sampleID <- list.files(path=input,pattern="*.bam",full.names=FALSE) - bamFile <- list.files(path=input,pattern="*.bam",full.names=TRUE) - return(data.table(sampleID,bamFile)) - } - else if(file.exists(input)){ - sampleID <- basename(file.path(input)) - bamFile <- file.path(input) - return(data.table(sampleID,bamFile)) - } + if(dir.exists(input)){ + sampleID <- list.files(path=input,pattern="*.bam",full.names=FALSE) + bamFile <- list.files(path=input,pattern="*.bam",full.names=TRUE) + return(data.table(sampleID,bamFile)) + } + else if(file.exists(input)){ + sampleID <- basename(file.path(input)) + bamFile <- file.path(input) + return(data.table(sampleID,bamFile)) + } } create_sample_table <- function(sample,ref,paired_end){ - sampleTable <- rbind(get_st_rows(sample),get_st_rows(ref)) - sampleTable <- sampleTable[!grepl(".bai",sampleID), ] - sampleTable$group <- c(1:nrow(sampleTable)) - sampleTable$pairedEnd <- TRUE + sampleTable <- rbind(get_st_rows(sample),get_st_rows(ref)) + sampleTable <- sampleTable[!grepl(".bai",sampleID), ] + sampleTable$group <- c(1:nrow(sampleTable)) + sampleTable$pairedEnd <- TRUE - message(date(), ": sampleTable") - print(sampleTable) + message(date(), ": sampleTable") + print(sampleTable) - return(sampleTable) + return(sampleTable) } run_fraser <- function(sampleTable){ - # create FRASER object - settings <- FraserDataSet(colData=sampleTable, workingDir="./FRASER_output") - message(date(), ": settings...") - print(settings) - - register(MulticoreParam(workers=min(8, multicoreWorkers()))) - - # count reads - fds <- countRNAData(settings) - - fds <- calculatePSIValues(fds) - - #FRASER vignette/gagneurlab settings: - #Currently, we suggest keeping only junctions which support the following: - #minExpressionInOneSample - 20 - At least one sample has 20 (or more) reads - #quantileMinExpression - 10 - The minimum total read count (N) an intron needs to have at the specified quantile across samples to pass the filter. See quantileForFiltering. - #quantile - 0.95 - Defines at which percentile the quantileMinExpression filter is applied. - #A value of 0.95 means that at least 5% of the samples need to have a total read count N >= quantileMinExpression to pass the filter. - #minDeltaPsi - 0.05 - The minimal variation (in delta psi) required for an intron to pass the filter. gagneurlab suggests 0.05 - - fds <- filterExpressionAndVariability(fds, minExpressionInOneSample = 20, quantileMinExpression=10, quantile = 0.95, minDeltaPsi=0.05, filter=FALSE) - fds <- saveFraserDataSet(fds, name="fds-filter") - plotheat = "plots_fraser.pdf" - pdf(plotheat,onefile = TRUE) - plotFilterExpression(fds, bins=100) - - message(date(), ": dimension before filtering...") - print(dim(assays(fds)$rawCountsJ)) - print(dim(assays(fds)$rawCountsSS)) - - fds <- fds[mcols(fds, type="j")[,"passed"]] - message(date(), ": dimension after filtering...") - print(dim(assays(fds)$rawCountsJ)) - print(dim(assays(fds)$rawCountsSS)) - - #Finds the optimal encoding dimension by injecting artificial splicing outlier ratios while maximizing the precision-recall curve. - # Get range for latent space dimension, max tested dimensions = 6 - mp <- 6 - a <- 2 - b <- min(ncol(fds), nrow(fds)) / mp # N/mp - - maxSteps <- 12 - if(mp < 6){ - maxSteps <- 15 - } - - Nsteps <- min(maxSteps, b) - pars_q <- unique(round(exp(seq(log(a),log(b),length.out = Nsteps)))) - - message(date(), ": getEncDimRange", pars_q) - - psiType = "jaccard" - impl = "PCA" - - fds <- saveFraserDataSet(fds, name="fds-normfalse") - # Heatmap of the sample correlation -# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=FALSE) - # Heatmap of the intron/sample expression -# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) - - #set.seed(as.integer(random)) - # hyperparameter opimization can be used to estimate the dimension q of the latent space of the data. - # It works by artificially injecting outliers into the data and then comparing the AUC of recalling these outliers for different values of q. - fds <- optimHyperParams(fds, type=psiType, implementation=impl, q_param=pars_q, minDeltaPsi=0.05, plot=FALSE) - # retrieve the estimated optimal dimension of the latent space - #currentType(fds) <- psiType - bestq = bestQ(fds, type=psiType) - message(date(), "...find best q: " ,bestq) - - # Add verbosity to the FRASER object - verbose(fds) <- 3 - fds <- fit(fds, q=bestq, type=psiType, implementation=impl, iterations=15) - - # Heatmap of the sample correlation -# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=TRUE) - # Heatmap of the intron/sample expression -# plotCountCorHeatmap(fds, type=psiType, logit=FALSE, normalized=TRUE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) - dev.off() - fds <- saveFraserDataSet(fds, name="fds-normtrue") - - #all in one getEncDimRange, optimHyperParams, bestQ, fit - #fds_fraser <- FRASER(fds_filtered, q=c(jaccard=2)) - - fds <- annotateRanges(fds, GRCh = 38) - - # Pvalues - fds <- calculatePvalues(fds, type=psiType) - # Adjust Pvalues - fds <- calculatePadjValues(fds, type=psiType) - - - fds <- saveFraserDataSet(fds, name="fds-final") - - return (fds) + # create FRASER object + settings <- FraserDataSet(colData=sampleTable, workingDir="./FRASER_output") + message(date(), ": settings...") + print(settings) + + register(MulticoreParam(workers=min(8, multicoreWorkers()))) + + # count reads + fds <- countRNAData(settings) + + fds <- calculatePSIValues(fds) + + #FRASER vignette/gagneurlab settings: + #Currently, we suggest keeping only junctions which support the following: + #minExpressionInOneSample - 20 - At least one sample has 20 (or more) reads + #quantileMinExpression - 10 - The minimum total read count (N) an intron needs to have at the specified quantile across samples to pass the filter. See quantileForFiltering. + #quantile - 0.95 - Defines at which percentile the quantileMinExpression filter is applied. # nolint + #A value of 0.95 means that at least 5% of the samples need to have a total read count N >= quantileMinExpression to pass the filter. + #minDeltaPsi - 0.05 - The minimal variation (in delta psi) required for an intron to pass the filter. gagneurlab suggests 0.05 + + fds <- filterExpressionAndVariability(fds, minExpressionInOneSample = 20, quantileMinExpression=10, quantile = 0.95, minDeltaPsi=0.05, filter=FALSE) + fds <- saveFraserDataSet(fds, name="fds-filter") + plotheat = "plots_fraser.pdf" + pdf(plotheat,onefile = TRUE) + plotFilterExpression(fds, bins=100) + + message(date(), ": dimension before filtering...") + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) + + fds <- fds[mcols(fds, type="j")[,"passed"]] + message(date(), ": dimension after filtering...") + print(dim(assays(fds)$rawCountsJ)) + print(dim(assays(fds)$rawCountsSS)) + + #Finds the optimal encoding dimension by injecting artificial splicing outlier ratios while maximizing the precision-recall curve. + # Get range for latent space dimension, max tested dimensions = 6 + mp <- 6 + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- unique(round(exp(seq(log(a),log(b),length.out = Nsteps)))) + + message(date(), ": getEncDimRange", pars_q) + + fds <- saveFraserDataSet(fds, name="fds-normfalse") + # Heatmap of the sample correlation + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=FALSE) + # Heatmap of the intron/sample expression + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=FALSE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + + #set.seed(as.integer(random)) + # hyperparameter opimization can be used to estimate the dimension q of the latent space of the data. + # It works by artificially injecting outliers into the data and then comparing the AUC of recalling these outliers for different values of q. + fds <- optimHyperParams(fds, type="jaccard", implementation="PCA", + q_param=pars_q, minDeltaPsi=0.05, plot=FALSE) + # retrieve the estimated optimal dimension of the latent space + bestq = bestQ(fds, type="jaccard") + message(date(), "...find best q: " ,bestq) + + # Add verbosity to the FRASER object + verbose(fds) <- 3 + fds <- fit(fds, q=bestq, type="jaccard", implementation="PCA", iterations=15) + + # Heatmap of the sample correlation + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=TRUE) + # Heatmap of the intron/sample expression + # plotCountCorHeatmap(fds, type="jaccard", logit=FALSE, normalized=TRUE, plotType="junctionSample", topJ=100, minDeltaPsi=0.05) + dev.off() + fds <- saveFraserDataSet(fds, name="fds-normtrue") + + #all in one getEncDimRange, optimHyperParams, bestQ, fit + #fds_fraser <- FRASER(fds_filtered, q=c(jaccard=2)) + + fds <- annotateRanges(fds, GRCh = 38) + + # Pvalues + fds <- calculatePvalues(fds, type="jaccard") + # Adjust Pvalues + fds <- calculatePadjValues(fds, type="jaccard") + + fds <- saveFraserDataSet(fds, name="fds-final") + + return(fds) } write_fraser_output <- function(fraser_ds, prefix){ - # retrieve results with default and recommended cutoffs (padj <= 0.05 (0.1 gagneurlab) and # |deltaPsi| >= 0.3) - fraser_res <- results(fraser_ds, all=TRUE) - #res <- results(fds, psiType=psiTypes, aggregate=TRUE, collapse=FALSE, all=TRUE) - message(date(), ": result...") - print(fraser_res) + # retrieve results with default and recommended cutoffs (padj <= 0.05 (0.1 gagneurlab) and # |deltaPsi| >= 0.3) + fraser_res <- results(fraser_ds, all=TRUE) + message(date(), ": result...") + print(fraser_res) - write.table(as.data.frame(unname(fraser_res)), paste0("fraser_result_",prefix,".tsv"), sep="\t", quote=FALSE) + write.table(as.data.frame(unname(fraser_res)), paste0("fraser_result_",prefix,".tsv"), sep="\t", quote=FALSE) } main <- function(args){ - ##Argument.Parser not in docker image.. - sample <- args[6] #: bam + bai sample of interest - refset <- args[7] #: dir to ref set bams + bai - prefix <- args[8] #: prefix - pairedEnd <- args[9] #: pairedEnd (=TRUE) - sampleTable <- create_sample_table(sample, refset, pairedEnd) - message(date(), ": sampleTable") - fraser_ds <- run_fraser(sampleTable) - write_fraser_output(fraser_ds, prefix) + ##Argument.Parser not in docker image.. + sample <- args[6] #: bam + bai sample of interest + refset <- args[7] #: dir to ref set bams + bai + prefix <- args[8] #: prefix + pairedEnd <- args[9] #: pairedEnd (=TRUE) + sampleTable <- create_sample_table(sample, refset, pairedEnd) + fraser_ds <- run_fraser(sampleTable) + write_fraser_output(fraser_ds, prefix) } -main(args) \ No newline at end of file +main(args) diff --git a/Fraser/1.99.3/main.nf b/Fraser/1.99.3/main.nf index c63568f8..29ed828b 100644 --- a/Fraser/1.99.3/main.nf +++ b/Fraser/1.99.3/main.nf @@ -23,4 +23,4 @@ process FRASER { export BIOMART_CACHE=/hpc/diaggen/projects/woc/rna/container Rscript ${moduleDir}/fraser.R ${input} ${refset} ${prefix} TRUE """ -} \ No newline at end of file +} diff --git a/Fraser/1.99.3/meta.yml b/Fraser/1.99.3/meta.yml index 5f1ecfa0..fe11285f 100644 --- a/Fraser/1.99.3/meta.yml +++ b/Fraser/1.99.3/meta.yml @@ -22,4 +22,4 @@ output: authors: - maintainers: - - \ No newline at end of file + - diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 6797671f..11ff5ace 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -6,21 +6,23 @@ process OUTRIDER { container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - tuple val(meta), path(counts) - path(refset) + tuple val(meta), + path(counts), + path(refset), + path(genome_gtf) output: - path("*.tsv"), emit: tsv - path("*.csv"), emit: csv - path("*.pdf"), emit: pdf -// path "versions.yml" , emit: versions + path("*.tsv"), emit: tsv + path("*.csv"), emit: csv + path("*.pdf"), emit: pdf +// path "versions.yml", emit: versions when: - task.ext.when == null || task.ext.when + task.ext.when == null || task.ext.when script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -p ${prefix} - """ -} \ No newline at end of file + def prefix = task.ext.prefix ?: "${meta.id}" + """ + Rscript ${moduleDir}/outrider.R ${counts} -r ${refset} -p ${prefix} -g ${genome_gtf} + """ +} diff --git a/Outrider/1.20.0/meta.yml b/Outrider/1.20.0/meta.yml index e856b6c9..d5d083ff 100644 --- a/Outrider/1.20.0/meta.yml +++ b/Outrider/1.20.0/meta.yml @@ -35,3 +35,4 @@ authors: - maintainers: - + diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index a7781b36..2fd1e71a 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -19,165 +19,166 @@ parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+ help = "Files or directories containing the reference input files.") parser$add_argument("-p", "--pref", metavar = "prefix", nargs = "+", help = "Prefix of file.", default="gene") +parser$add_argument("-g", "--gtf", metavar = "genome_gtf", nargs = "+", + help = "Genome gtf.") args <- parser$parse_args() read_input_files <- function(input){ - sampleIDs <- c() - count_tables <- lapply(input, function(f) { - input_ext <- tools::file_ext(f) - if(input_ext == "Rds") { - rse <- readRDS(f) # RangedSummarizedExperiment - sampleIDs <- append(sampleIDs, colnames(rse)) - return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) - } else if (input_ext %in% c("txt", "tsv")) { - count_table <- read_delim(f, show_col_types=FALSE, skip=1) - # col 1: samples IDs, col 9: counts - ct <- count_table[,c(1,9)] - names(ct)[1] <- "rownames" - sampleIDs <<- append(sampleIDs, colnames(ct)[2]) - return(ct) - } else { - stop("Input file extension is not supported.") - } - }) - return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) + sampleIDs <- c() + count_tables <- lapply(input, function(f) { + input_ext <- tools::file_ext(f) + if(input_ext == "Rds") { + rse <- readRDS(f) # RangedSummarizedExperiment + sampleIDs <- append(sampleIDs, colnames(rse)) + return(as_tibble(rownames_to_column(as.data.frame(assays(rse)$counts), var="rownames"))) + } else if (input_ext %in% c("txt", "tsv")) { + count_table <- read_delim(f, show_col_types=FALSE, skip=1) + # col 1: samples IDs, col 9: counts + ct <- count_table[,c(1,9)] + names(ct)[1] <- "rownames" + sampleIDs <<- append(sampleIDs, colnames(ct)[2]) + return(ct) + } else { + stop("Input file extension is not supported.") + } + }) + return(list("sampleIDs"=sampleIDs, "count_tables"=count_tables)) } get_input <- function(input){ - # If directories are provided - if(all(sapply(input, function(x) dir.exists(x)))) { - retrieved_files <- sapply(input, function(d){ - list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) - }) - } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. - retrieved_files <- input - } else { # If both dir and files are provided, or different tyes (character, int etc) - stop("Input is neither dir or file.") - } - - return(read_input_files(retrieved_files)) + # If directories are provided + if(all(sapply(input, function(x) dir.exists(x)))) { + retrieved_files <- sapply(input, function(d){ + list.files(path = d, pattern = "Rds|txt|tsv", full.names = TRUE, recursive = FALSE) + }) + } else if(all(sapply(input, function(x) file.exists(x)))) { # If files are provided. + retrieved_files <- input + } else { # If both dir and files are provided, or different tyes (character, int etc) + stop("Input is neither dir or file.") + } + return(read_input_files(retrieved_files)) } merge_count_tables <- function(lst_query, lst_ref){ - # merge count tables together. - lst_count_tables <- c(lst_query, lst_ref) - all_counts <- lst_count_tables %>% - Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) + # merge count tables together. + lst_count_tables <- c(lst_query, lst_ref) + all_counts <- lst_count_tables %>% + Reduce(function(dtf1,dtf2) dplyr::full_join(dtf1, dtf2,by="rownames"), .) + return(all_counts) } -filter_expression <- function(ods, query, prefix){ - if(grepl("gene", prefix)){ - ##FOR GENE LEVEL - ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile="/hpc/diaggen/data/databases/gencode/gencode.v44.primary_assembly.basic.annotation.gtf") - } - else{ - ##FOR EXON LEVEL - ct <- read_delim(query, show_col_types=FALSE, skip=1) - print("identical exon ids:") - print(identical(ct$Geneid,rownames(assays(ods)$counts))) - readD <- apply(assays(ods)$counts, 2, function(x) x / sum(x) * 10^6) - countsFPKM <- readD / ct$Length * 10^3 - perc95e <- apply(countsFPKM, 1, function(x) quantile(x,probs=0.95)) - mcols(ods)$passedFilter <- perc95e>1 - mcols(ods)$basepairs <- ct$Length - - ##FOR EXON RATIO - ##REMOVE ##FOR EXON LEVEL code above - ##UNCOMMENT BELOW STEPS -# print("EXON RATIO") -# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) - } - - # display the FPKM distribution of counts. - ods <- plotFPKM(ods) - message(date(), ": dim before filtering...") - print(dim(assays(ods)$counts)) - - ##SUBSETTING - ods <- ods[mcols(ods)$passedFilter,] - message(date(), ": dim after filtering...") - print(dim(assays(ods)$counts)) - return(ods) +filter_expression <- function(ods, query, prefix, genome_gtf){ + if(grepl("gene", prefix)){ + ##FOR GENE LEVEL + ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile=genome_gtf) + } + else{ + ##FOR EXON LEVEL + ct <- read_delim(query, show_col_types=FALSE, skip=1) + print("identical exon ids:") + print(identical(ct$Geneid,rownames(assays(ods)$counts))) + readD <- apply(assays(ods)$counts, 2, function(x) x / sum(x) * 10^6) + countsFPKM <- readD / ct$Length * 10^3 + perc95e <- apply(countsFPKM, 1, function(x) quantile(x,probs=0.95)) + mcols(ods)$passedFilter <- perc95e>1 + mcols(ods)$basepairs <- ct$Length + + ##FOR EXON RATIO + ##REMOVE ##FOR EXON LEVEL code above + ##UNCOMMENT BELOW STEPS +# print("EXON RATIO") +# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) + } + + # display the FPKM distribution of counts. + ods <- plotFPKM(ods) + message(date(), ": dim before filtering...") + print(dim(assays(ods)$counts)) + + ##SUBSETTING + ods <- ods[mcols(ods)$passedFilter,] + message(date(), ": dim after filtering...") + print(dim(assays(ods)$counts)) + return(ods) } -run_outrider <- function(all_counts, query, prefix) { - # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. - all_counts_matrix <- as.matrix(all_counts)[,-1] - mode(all_counts_matrix) <- "integer" - rownames(all_counts_matrix) <- all_counts$rownames +run_outrider <- function(all_counts, query, prefix, genome_gtf) { + # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. + all_counts_matrix <- as.matrix(all_counts)[,-1] + mode(all_counts_matrix) <- "integer" + rownames(all_counts_matrix) <- all_counts$rownames - ods <- OutriderDataSet(countData=all_counts_matrix) + ods <- OutriderDataSet(countData=all_counts_matrix) - plotheat = "counts_heatplots.pdf" - pdf(plotheat,onefile = TRUE) + plotheat = "counts_heatplots.pdf" + pdf(plotheat,onefile = TRUE) - ods <- filter_expression(ods, query, prefix) + ods <- filter_expression(ods, query, prefix, genome_gtf) - # Heatmap of the sample correlation - # it can also annotate the clusters resulting from the dendrogram - ods <- plotCountCorHeatmap(ods, normalized=FALSE) + # Heatmap of the sample correlation + # it can also annotate the clusters resulting from the dendrogram + ods <- plotCountCorHeatmap(ods, normalized=FALSE) - # Heatmap of the gene/sample expression - ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE) + # Heatmap of the gene/sample expression + ods <- plotCountGeneSampleHeatmap(ods, normalized=FALSE) - ods <- estimateSizeFactors(ods) - message(date(), ": sizeFactors...") - print(ods$sizeFactor) + ods <- estimateSizeFactors(ods) + message(date(), ": sizeFactors...") + print(ods$sizeFactor) - implementation <- "autoencoder" - pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) + pars_q <- seq(2, min(100, ncol(ods) - 1, nrow(ods) - 1), 2) - message(date(), ": findEncodingDim...") - ods <- findEncodingDim(ods, params = pars_q, implementation=implementation, BPPARAM=MulticoreParam(8)) + message(date(), ": findEncodingDim...") + ods <- findEncodingDim(ods, params = pars_q, implementation="autoencoder", BPPARAM=MulticoreParam(8)) - #find best q (dimension) - opt_q <- getBestQ(ods) - message(date(), ": getbestQ...") - print(opt_q) - - ods <- controlForConfounders(ods, q = opt_q, BPPARAM = MulticoreParam(8)) - - # After controlling for confounders the heatmap should be plotted again. - # If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* - ods <- plotCountCorHeatmap(ods, normalized=TRUE) - ods <- plotCountGeneSampleHeatmap(ods, normalized=TRUE) - dev.off() - -# if(grepl("^(peer|pca)$", implementation)){ -# message(date(), ": Fitting the data ...") # NOT with autoencoder -# ods <- fit(ods, BPPARAM=MulticoreParam(8)) -# } - - ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) - out <- computeZscores(ods) - - # run full OUTRIDER pipeline (control, fit model, calculate P-values) -# out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) - return(out) + #find best q (dimension) + opt_q <- getBestQ(ods) + message(date(), ": getbestQ...") + print(opt_q) + + ods <- controlForConfounders(ods, q = opt_q, BPPARAM = MulticoreParam(8)) + + # After controlling for confounders the heatmap should be plotted again. + # If it worked, no batches should be present and the correlations between samples should be reduced and close to zero. [1]* + ods <- plotCountCorHeatmap(ods, normalized=TRUE) + ods <- plotCountGeneSampleHeatmap(ods, normalized=TRUE) + dev.off() + +# if(grepl("^(peer|pca)$", implementation)){ +# message(date(), ": Fitting the data ...") # NOT with autoencoder +# ods <- fit(ods, BPPARAM=MulticoreParam(8)) +# } + + ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) + out <- computeZscores(ods) + + # run full OUTRIDER pipeline (control, fit model, calculate P-values) + # out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) + return(out) } ##Only necessary for Erasmus app get_ct_emcapp <- function(query, prefix){ - count_table <- read_delim(query, show_col_types=FALSE, skip=1) - ct <- count_table[,c(2,3,4,8,1)] - colnames(ct)<- c("chr","start","end","gene","geneID") - ct$chr <- str_split_fixed(ct$chr,';',2)[,1] - ct$start <- str_split_fixed(ct$start,';',2)[,1] - ct$end <- gsub("^.*;", "", ct$end) - - ##For gene level: create countdata table Identifier chr_start_end_genename - ct$emcID <- paste(ct$chr,ct$start,ct$end,ct$gene,sep="_") - ##For exon level: create countdata table Identifier: add Ensemble exon_id: chr_start_end_genename-exon_id - if(grepl("exon", prefix)){ - ct$emcID <- paste(ct$emcID,ct$geneID,sep="-") - } - - return(ct) + count_table <- read_delim(query, show_col_types=FALSE, skip=1) + ct <- count_table[,c(2,3,4,8,1)] + colnames(ct)<- c("chr","start","end","gene","geneID") + ct$chr <- str_split_fixed(ct$chr,';',2)[,1] + ct$start <- str_split_fixed(ct$start,';',2)[,1] + ct$end <- gsub("^.*;", "", ct$end) + + ##For gene level: create countdata table Identifier chr_start_end_genename + ct$emcID <- paste(ct$chr,ct$start,ct$end,ct$gene,sep="_") + ##For exon level: create countdata table Identifier: add Ensemble exon_id: chr_start_end_genename-exon_id + if(grepl("exon", prefix)){ + ct$emcID <- paste(ct$emcID,ct$geneID,sep="-") + } + + return(ct) } ##Only necessary for Erasmus app @@ -199,55 +200,57 @@ save_res_emcapp <- function(ct, res, prefix, out_path){ ##Only necessary for Erasmus app save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ - ct <- ct[,c("emcID","geneID")] - colnames(all_counts)[1]<-c("geneID") - merge_counts <- merge(all_counts, ct, by = "geneID", all.ct = TRUE) - merge_counts$geneID <- merge_counts$emcID - counts_out <- merge_counts[,c(1:ncol(merge_counts)-1)] - colnames(counts_out)[1]<-c("") - - ##countdata output table EMC app - if(grepl("gene", prefix)){ - write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_genes_counts.tsv")) - }else{ - write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_exons_counts.tsv"))} + ct <- ct[,c("emcID","geneID")] + colnames(all_counts)[1]<-c("geneID") + merge_counts <- merge(all_counts, ct, by = "geneID", all.ct = TRUE) + merge_counts$geneID <- merge_counts$emcID + counts_out <- merge_counts[,c(1:ncol(merge_counts)-1)] + colnames(counts_out)[1]<-c("") + + ##countdata output table EMC app + if(grepl("gene", prefix)){ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_genes_counts.tsv")) + }else{ + write_tsv(counts_out, paste0(out_path, "umcu_rnaseq_exons_counts.tsv")) + } - ##Meta output table EMC app - treatment <- "untreated" - if(grepl("chx|CHX",prefix)){ treatment <- "chx"} - metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") - colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") - if(grepl("gene",prefix)){ - write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_genes.csv"), row.names = FALSE, quote=FALSE) - }else{ - write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE)} + ##Meta output table EMC app + treatment <- "untreated" + if(grepl("chx|CHX",prefix)){ treatment <- "chx"} + metadata<-data.frame(colnames(counts_out[,2:ncol(counts_out)]),treatment,"fib","umcu_rnaseq",0,"") + colnames(metadata)<-c("sample_id","treatment","species","experiment_GS","gender","drop") + if(grepl("gene",prefix)){ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_genes.csv"), row.names = FALSE, quote=FALSE) + }else{ + write.csv2(metadata,paste0(out_path, "umcu_rnaseq_metadata_exons.csv"), row.names = FALSE, quote=FALSE) + } } save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { -# res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) - # Reference samples are excluded from final results. -# query_res <- filter(res, !(sampleID %in% ref_samples)) - res <- as_tibble(results(out_outrider, all=a)) - query_res <- res - - # Write output table with aberrant expressed targets. - write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) +# res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) +# # Reference samples are excluded from final results. +# query_res <- filter(res, !(sampleID %in% ref_samples)) + res <- as_tibble(results(out_outrider, all=a)) + query_res <- res + + # Write output table with aberrant expressed targets. + write_tsv(query_res, paste0(out_path, prefix, ".outrider_result.tsv")) } # TODO: investigate memory usage and if possible reduced / parallel. -main <- function(query, ref, output_path, prefix){ - query_data <- get_input(query) - ref_data <- get_input(ref) - all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) - - output <- run_outrider(all_counts, query, prefix) - save_output(output_path, output, ref_data$sampleIDs, prefix, query) - - ##Only necessary for Erasmus app, to get the counts, meta and outrider results tables - ct_emcapp <- get_ct_emcapp(query, prefix) - save_count_meta_emcapp(ct_emcapp, all_counts, output_path, prefix) - save_res_emcapp(ct_emcapp, output, prefix, output_path) +main <- function(query, ref, output_path, prefix, genome_gtf){ + query_data <- get_input(query) + ref_data <- get_input(ref) + all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) + + output <- run_outrider(all_counts, query, prefix, genome_gtf) + save_output(output_path, output, ref_data$sampleIDs, prefix, query) + + ##Only necessary for Erasmus app, to get the counts, meta and outrider results tables + ct_emcapp <- get_ct_emcapp(query, prefix) + save_count_meta_emcapp(ct_emcapp, all_counts, output_path, prefix) + save_res_emcapp(ct_emcapp, output, prefix, output_path) } -main(args$query, args$ref, args$output_path, args$pref) \ No newline at end of file +main(args$query, args$ref, args$output_path, args$pref) From 84a34b14003f186236de1ac03e778901e5d700bf Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:38:09 +0200 Subject: [PATCH 30/31] input format --- Outrider/1.20.0/main.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Outrider/1.20.0/main.nf b/Outrider/1.20.0/main.nf index 11ff5ace..283a58d3 100644 --- a/Outrider/1.20.0/main.nf +++ b/Outrider/1.20.0/main.nf @@ -6,9 +6,8 @@ process OUTRIDER { container "ghcr.io/umcugenetics/outrider_custom:0.0.1" input: - tuple val(meta), - path(counts), - path(refset), + tuple val(meta), path(counts) + path(refset) path(genome_gtf) output: From b36dbb98624943e046128ecfb931dabe0b89a676 Mon Sep 17 00:00:00 2001 From: lonnekevanbrussel <125465018+lonnekevanbrussel@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:02:18 +0200 Subject: [PATCH 31/31] working file now with gtf file as argument --- Outrider/1.20.0/outrider.R | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Outrider/1.20.0/outrider.R b/Outrider/1.20.0/outrider.R index 2fd1e71a..aa964e80 100644 --- a/Outrider/1.20.0/outrider.R +++ b/Outrider/1.20.0/outrider.R @@ -20,7 +20,7 @@ parser$add_argument("-r", "--ref", metavar = "reference_input_files", nargs = "+ parser$add_argument("-p", "--pref", metavar = "prefix", nargs = "+", help = "Prefix of file.", default="gene") parser$add_argument("-g", "--gtf", metavar = "genome_gtf", nargs = "+", - help = "Genome gtf.") + help = "Genome gtf file") args <- parser$parse_args() @@ -71,10 +71,10 @@ merge_count_tables <- function(lst_query, lst_ref){ } -filter_expression <- function(ods, query, prefix, genome_gtf){ +filter_expression <- function(ods, query, prefix, gtf){ if(grepl("gene", prefix)){ ##FOR GENE LEVEL - ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile=genome_gtf) + ods <- filterExpression(ods, fpkmCutoff = 1, minCounts = FALSE, filterGenes = FALSE, gtfFile=gtf) } else{ ##FOR EXON LEVEL @@ -90,12 +90,12 @@ filter_expression <- function(ods, query, prefix, genome_gtf){ ##FOR EXON RATIO ##REMOVE ##FOR EXON LEVEL code above ##UNCOMMENT BELOW STEPS -# print("EXON RATIO") -# ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) + # print("EXON RATIO") + # ods <- filterExpression(ods, minCounts = TRUE, filterGenes = FALSE) } # display the FPKM distribution of counts. - ods <- plotFPKM(ods) + #ods <- plotFPKM(ods) message(date(), ": dim before filtering...") print(dim(assays(ods)$counts)) @@ -107,7 +107,7 @@ filter_expression <- function(ods, query, prefix, genome_gtf){ } -run_outrider <- function(all_counts, query, prefix, genome_gtf) { +run_outrider <- function(all_counts, query, prefix, gtf) { # TODO: change to add to single object (ods) in case OOM, instead of renaming the vars. all_counts_matrix <- as.matrix(all_counts)[,-1] mode(all_counts_matrix) <- "integer" @@ -118,7 +118,7 @@ run_outrider <- function(all_counts, query, prefix, genome_gtf) { plotheat = "counts_heatplots.pdf" pdf(plotheat,onefile = TRUE) - ods <- filter_expression(ods, query, prefix, genome_gtf) + ods <- filter_expression(ods, query, prefix, gtf) # Heatmap of the sample correlation # it can also annotate the clusters resulting from the dendrogram @@ -150,15 +150,15 @@ run_outrider <- function(all_counts, query, prefix, genome_gtf) { dev.off() # if(grepl("^(peer|pca)$", implementation)){ -# message(date(), ": Fitting the data ...") # NOT with autoencoder -# ods <- fit(ods, BPPARAM=MulticoreParam(8)) +# message(date(), ": Fitting the data ...") # NOT with autoencoder +# ods <- fit(ods, BPPARAM=MulticoreParam(8)) # } ods <- computePvalues(ods, alternative="two.sided", method="BY", BPPARAM = MulticoreParam(8)) out <- computeZscores(ods) - # run full OUTRIDER pipeline (control, fit model, calculate P-values) - # out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) +# run full OUTRIDER pipeline (control, fit model, calculate P-values) +# out <- OUTRIDER(ods, BPPARAM=MulticoreParam(8)) return(out) } @@ -228,9 +228,9 @@ save_count_meta_emcapp <- function(ct, all_counts, out_path, prefix){ save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj_thres=0.05, zscore_thres=0, a=TRUE) { # res <- as_tibble(results(out_outrider, padjCutoff=padj_thres, zScoreCutoff=zscore_thres, all=a)) -# # Reference samples are excluded from final results. + ##Reference samples are excluded from final results. # query_res <- filter(res, !(sampleID %in% ref_samples)) - res <- as_tibble(results(out_outrider, all=a)) + res <- as_tibble(results(out_outrider, all=a)) query_res <- res # Write output table with aberrant expressed targets. @@ -238,12 +238,12 @@ save_output <- function(out_path, out_outrider, ref_samples, prefix, query, padj } # TODO: investigate memory usage and if possible reduced / parallel. -main <- function(query, ref, output_path, prefix, genome_gtf){ +main <- function(query, ref, output_path, prefix, gtf){ query_data <- get_input(query) ref_data <- get_input(ref) all_counts <- merge_count_tables(query_data$count_tables, ref_data$count_tables) - output <- run_outrider(all_counts, query, prefix, genome_gtf) + output <- run_outrider(all_counts, query, prefix, gtf) save_output(output_path, output, ref_data$sampleIDs, prefix, query) ##Only necessary for Erasmus app, to get the counts, meta and outrider results tables @@ -253,4 +253,4 @@ main <- function(query, ref, output_path, prefix, genome_gtf){ } -main(args$query, args$ref, args$output_path, args$pref) +main(args$query, args$ref, args$output_path, args$pref, args$gtf) \ No newline at end of file